In [1]:
import datetime
import pandas as pd
import numpy as np
#from sklearn.cross_validation import train_test_split
#import xgboost as xgb
import random
from operator import itemgetter
import zipfile
#from sklearn.metrics import roc_auc_score
import time
import os
random.seed(2016)
os.chdir('/home/valesco/Datasets/Avito/')
#from gensim.models import Word2Vec
import nltk.corpus
from nltk import SnowballStemmer
import random as rnd, re
import json
import gc
import pickle

In [2]:
#Functions

def create_feature_map(features):
    outfile = open('xgb.fmap','w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i,feat))
    outfile.close()
    
def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap = 'xgb.fmap')
    importance = sorted(importance.items(), key = itemgetter(1), reverse = True)
    return importance

def intersect(a, b):
    return list(set(a) & set(b))

def print_features_importance(imp):
    for i in range(len(imp)):
        print('#' + str(imp[i][1]))
        print('output.remove(\'' + imp[i][0] + '\')')
        
def run_default_test(train, test, features, target, random_state = 0):
    eta = 0.1
    max_depth = 5
    subsample = 0.8
    colsample_bytree = 0.8
    start_time = time.time()
    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, 
            max_depth, subsample, colsample_bytree))
    
    params = {
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'eval_metric': 'auc',
        'eta': eta,
        'max_depth': max_depth,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'silent': 1,
        'seed': random_state
    }
    
    num_boost_round = 260
    early_stopping_rounds = 20
    test_size = 0.1
    
    X_train, X_valid = train_test_split(train, test_size = test_size, random_state = random_state)
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)
    
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals = watchlist, early_stopping_rounds = early_stopping_rounds,
                   verbose_eval = True)
    
    print('Validating...')
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit = gbm.best_ntree_limit)
    score = roc_auc_score(X_valid[target].values, check)
    print('Check error value: {:.6f}'.format(score))
    
    imp = get_importance(gbm, features)
    print('Importance array: ', imp)
    
    print('Predict test set...')
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit = gbm.best_ntree_limit)
    
    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score

def create_submission(score, test, prediction):
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime('%Y-%m-%d-%H-%M')) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('id, probability\n')
    total = 0
    for id in test['id']:
        str1 = str(id) + ',' + str(prediction[total])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()
    
def get_features(train, test):
    trainval = list(train.columns.values)
    testval = list(test.columns.values)
    output = intersect(trainval, testval)
    output.remove('itemID_1')
    output.remove('itemID_2')
    return output

stopw_r = [word for word in nltk.corpus.stopwords.words("russian") if word!="не"]
stopw_e = [word for word in nltk.corpus.stopwords.words("english")]
stopw = stopw_r + stopw_e
stem_r = SnowballStemmer('russian')
stem_e = SnowballStemmer('english')
#model = Word2Vec.load_word2vec_format('ruscorpora.model.bin', binary = True)


def str_stem(s):
    s = re.sub(u'[^a-zа-я0-9]', ' ', str(s).lower())
    s = (' ').join([stem_r.stem(w) if not re.search("[0-9a-z]", w) else stem_e.stem(w) for w in s.split() if len(w)>1 and w not in stopw])
    
    return s

def similarity(df):
    bow_title1 = str(df['title_1']).split(' ')
    bow_title2 = str(df['title_2']).split(' ')
    bow_desc1 = str(df['description_1']).split(' ')
    bow_desc2 = str(df['description_2']).split(' ')
    json1 = json.loads(str(df['JSON_1']))
    json2 = json.loads(str(df['JSON_2']))
    
    count1 = 0
    count2 = 0
    for word1 in bow_title1:
        for word2 in bow_title2:
            try:
                temp = model.similarity(word1, word2)
                if temp > .30 or word1 == word2:
                    count1 += 1
            except:
                count2 += 1
                pass
    title_sim_1 = count1/(len(bow_title1))
    
    count1 = 0
    count2 = 0
    for word1 in bow_title2:
        for word2 in bow_title1:
            try:
                temp = model.similarity(word1, word2)
                if temp > .30 or word1 == word2:
                    count1 += 1
            except:
                count2 += 1
                pass
    title_sim_2 = count1/(len(bow_title2))
    
    
    count1 = 0
    count2 = 0
    for word1 in bow_desc1:
        for word2 in bow_desc2:
            try:
                temp = model.similarity(word1, word2)
                if temp > .30 or word1 == word2:
                    count1 += 1
            except:
                count2 += 1
                pass
    desc_sim_1 = count1/(len(bow_desc1))
    
    count1 = 0
    count2 = 0
    for word1 in bow_desc2:
        for word2 in bow_desc1:
            try:
                temp = model.similarity(word1, word2)
                if temp > .30 or word1 == word2:
                    count1 += 1
            except:
                count2 += 1
                pass
    desc_sim_2 = count1/(len(bow_desc2))
    
    try:
        json_keys1 = json1.keys()
        json_keys2 = json2.keys()
    
    
        key_count1 = 0
        key_count2 = 0
        value_count1 = 0
        value_count2 = 0

        for key in json_keys1:
            if key in json_keys2:
                key_count1 += 1
                temp1 = str(json1[key]).split(' ')
                temp2 = str(json2[key]).split(' ')

                for word1 in temp1:
                    for word2 in temp2:
                        try:
                            temp = model.similarity(word1, word2)
                            if temp > .30 or word1 == word2:
                                value_count1 += 1
                        except:
                            pass

        value_sim_1 = value_count1/(len(temp1))


        for key in json_keys2:
            if key in json_keys1:
                key_count1 += 1
                temp1 = str(json1[key]).split(' ')
                temp2 = str(json2[key]).split(' ')

                for word1 in temp1:
                    for word2 in temp2:
                        try:
                            temp = model.similarity(word1, word2)
                            if temp > .30 or word1 == word2:
                                value_count2 += 1
                        except:
                            pass

        value_sim_2 = value_count2/(len(temp2))




            #else:
                #key_count2 += 0

        if len(json_keys1) > 0:
            json_keys_sim_1 = key_count1/(key_count1 + key_count2)
        else:
            json_keys_sim_1 = -1

        for key in json_keys2:
            if key in json_keys1:
                key_count1 += 1
            else:
                key_count2 += 0

        if len(json_keys2) > 0:
            json_keys_sim_2 = key_count1/(key_count1 + key_count2)
        else:
            json_keys_sim_2 = -1
    except:
        json_keys_sim_1 = -1
        json_keys_sim_2 = -1
        value_sim_1 = -1
        value_sim_2 = -1
    
    return pd.Series({'title_sim_1': title_sim_1, 'title_sim_2': title_sim_2, 'desc_sim_1': desc_sim_1, 
                     'desc_sim_2': desc_sim_2, 'json_keys_sim_1': json_keys_sim_1, 'json_keys_sim_2': json_keys_sim_2,
                     'json_values_sim_1': value_sim_1, 'json_values_sim_2': value_sim_2})

def title_compare(df):
    bow1 = df['title_1']
    bow2 = df['title_2']
    bow1 = str(bow1)
    bow2 = str(bow2)
    split1 = bow1.split(' ')
    split2 = bow2.split(' ')
    
    yes1 = 0
    no1 = 0
    
    for s1 in split1:
        if s1 in bow2:
            yes1 += 1
        else:
            no1 += 1
            
    yes2 = 0
    no2 = 0
    
    for s2 in split2:
        if s2 in bow1:
            yes2 += 1
        else:
            no2 += 1
            
    try:
        ratio1 = yes1/no1
    except:
        ratio1 = 10
        
    try:
        ratio2 = yes2/no2
    except:
        ratio2 = 10
        
    return pd.Series({'title_ratio_1': ratio1, 'title_ratio_2': ratio2})

def desc_compare(df):
    bow1 = df['description_1']
    bow2 = df['description_2']
    bow1 = str(bow1)
    bow2 = str(bow2)
    split1 = bow1.split(' ')
    split2 = bow2.split(' ')
    
    yes1 = 0
    no1 = 0
    
    for s1 in split1:
        if s1 in bow2:
            yes1 += 1
        else:
            no1 += 1
            
    yes2 = 0
    no2 = 0
    
    for s2 in split2:
        if s2 in bow1:
            yes2 += 1
        else:
            no2 += 1
            
    try:
        ratio1 = yes1/no1
    except:
        ratio1 = 10
        
    try:
        ratio2 = yes2/no2
    except:
        ratio2 = 10
        
    return pd.Series({'desc_ratio_1': ratio1, 'desc_ratio_2': ratio2})

def hDist(s1, s2):
    s1, s2 = str(s1), str(s2)
    return sum(bool(ord(ch1) - ord(ch2)) for ch1, ch2 in zip(s1, s2))

def compare_dhashes(df):
    if df['images_array_1'] == -1 or df['images_array_2'] == -1:
        hash_comp = -1
    else:
        hash_array1 = (str(df['images_array_1'])).replace(' ','')
        hash_array2 = (str(df['images_array_2'])).replace(' ','')

        hash_ls1 = hash_array1.split(',')
        hash_ls2 = hash_array2.split(',')

        num_images = len(hash_ls1)
        if len(hash_ls2) < num_images:
            num_images = len(hash_ls2)

        count = 0
        bound = 10

        dist_ls = []
        for h1 in hash_ls1:
            if int(h1) in dhash:
                for h2 in hash_ls2:
                    if int(h2) in dhash:
                        dist = hDist(dhash[int(h1)], dhash[int(h2)])
                        if dist < bound:
                            count += 1
                            
        hash_comp = count/num_images
                        
    return hash_comp

def compare_whashes(df):
    if df['images_array_1'] == -1 or df['images_array_2'] == -1:
        hash_comp = -1
    else:
        hash_array1 = (str(df['images_array_1'])).replace(' ','')
        hash_array2 = (str(df['images_array_2'])).replace(' ','')

        hash_ls1 = hash_array1.split(',')
        hash_ls2 = hash_array2.split(',')

        num_images = len(hash_ls1)
        if len(hash_ls2) < num_images:
            num_images = len(hash_ls2)

        count = 0
        bound = 10

        dist_ls = []
        for h1 in hash_ls1:
            if int(h1) in whash:
                for h2 in hash_ls2:
                    if int(h2) in whash:
                        dist = hDist(whash[int(h1)], whash[int(h2)])
                        if dist < bound:
                            count += 1
                            
        hash_comp = count/num_images
                        
    return hash_comp

In [3]:
# Load CSV Files
testing = 0


types1 = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'isDuplicate': np.dtype(int),
        'generationMethod': np.dtype(int),
        }

types2 = {
    'itemID': np.dtype(int),
    'categoryID': np.dtype(int),
    'title': np.dtype(str),
    'description': np.dtype(str),
    'images_array': np.dtype(str),
    'attrsJSON': np.dtype(str),
    'price': np.dtype(float),
    'locationID': np.dtype(int),
    'metroID': np.dtype(float),
    'lat': np.dtype(float),
    'lon': np.dtype(float),
        }

print("Load ItemPairs_train.csv")
pairs = pd.read_csv("ItemPairs_train.csv", dtype=types1)
# Add 'id' column for easy merge

print("Load ItemInfo_train.csv")
items = pd.read_csv("ItemInfo_train.csv", dtype=types2)
items.fillna(-1, inplace=True)
#print('Loading Location')
#location = pd.read_csv("Location.csv")

#print('Loading Category')
#category = pd.read_csv("Category.csv")

print('Loading hash pickles')
#dhash = pickle.load(open('dhash.pkl','rb'))
#whash = pickle.load(open('whash_haar.pkl','rb'))

print('Data Loaded')

Load ItemPairs_train.csv
Load ItemInfo_train.csv
Loading hash pickles
Data Loaded


In [None]:
#Prepare Training Set
print('Starting Data Prep')


start_time = time.time()
train = pairs


train = train.drop(['generationMethod'], axis=1)

print('Calculating Lengths')
items['len_title'] = items['title'].str.len()

items['len_description'] = items['description'].str.len()

items['len_attrs_JSON'] = items['attrsJSON'].str.len()

items['num_words_title'] = items['title'].map(lambda x: len(str(x).split()))

items['num_words_description'] = items['description'].map(lambda x: len(str(x).split()))


item1 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon', 
'len_title', 'len_description', 'len_attrs_JSON', 'num_words_title', 
'num_words_description', 'title', 'description', 'attrsJSON', 'images_array']]
#item1 = pd.merge(item1, category, how='left', on='categoryID', left_index=True)
#item1 = pd.merge(item1, location, how='left', on='locationID', left_index=True)

print('Starting Item 1')

item1 = item1.rename(
    columns={
        'itemID': 'itemID_1',
        'categoryID': 'categoryID_1',
        #'parentCategoryID': 'parentCategoryID_1',
        'price': 'price_1',
        'locationID': 'locationID_1',
        #'regionID': 'regionID_1',
        'metroID': 'metroID_1',
        'lat': 'lat_1',
        'lon': 'lon_1',
        'len_title': 'len_title_1',
        'len_description': 'len_description_1',
        'len_attrs_JSON': 'len_attrs_JSON_1',
        'num_words_title': 'num_words_title_1',
        'num_words_description': 'num_words_description_1',
        'title': 'title_1',
        'description': 'description_1',
        'attrsJSON': 'JSON_1',
        'images_array': 'images_array_1'
    }
)

# Add item 1 data
train = pd.merge(train, item1, how='left', on='itemID_1', left_index=True)

print('Starting Item 2')

item2 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon', 
'len_title', 'len_description', 'len_attrs_JSON', 'num_words_title', 
'num_words_description', 'title', 'description', 'attrsJSON', 'images_array']]
#item2 = pd.merge(item2, category, how='left', on='categoryID', left_index=True)
#item2 = pd.merge(item2, location, how='left', on='locationID', left_index=True)

item2 = item2.rename(
    columns={
        'itemID': 'itemID_2',
        'categoryID': 'categoryID_2',
        #'parentCategoryID': 'parentCategoryID_2',
        'price': 'price_2',
        'locationID': 'locationID_2',
        #'regionID': 'regionID_2',
        'metroID': 'metroID_2',
        'lat': 'lat_2',
        'lon': 'lon_2',
        'len_title': 'len_title_2',
        'len_description': 'len_description_2',
        'len_attrs_JSON': 'len_attrs_JSON_2',
        'num_words_title': 'num_words_title_2',
        'num_words_description': 'num_words_description_2',
        'title': 'title_2',
        'description': 'description_2',
        'attrsJSON': 'JSON_2',
        'images_array': 'images_array_2'
    }
)

# Add item 2 data
train = pd.merge(train, item2, how='left', on='itemID_2', left_index=True)

# Create same arrays
print('Starting Stem')
start_stem = time.time()
train['title_1'] = train['title_1'].map(lambda x: str_stem(x))
train['description_1'] = train['description_1'].map(lambda x: str_stem(x))
train['title_2'] = train['title_2'].map(lambda x: str_stem(x))
train['description_2'] = train['description_2'].map(lambda x: str_stem(x))
finish_stem = time.time()
print('Seconds to stem/stopword removal on training data: ', str((finish_stem - start_stem)))

train['price_same'] = np.equal(train['price_1'], train['price_2']).astype(np.int32)
train['locationID_same'] = np.equal(train['locationID_1'], train['locationID_2']).astype(np.int32)
train['categoryID_same'] = np.equal(train['categoryID_1'], train['categoryID_2']).astype(np.int32)
#train['regionID_same'] = np.equal(train['regionID_1'], train['regionID_2']).astype(np.int32)
train['metroID_same'] = np.equal(train['metroID_1'], train['metroID_2']).astype(np.int32)
train['lat_same'] = np.equal(train['lat_1'], train['lat_2']).astype(np.int32)
train['lon_same'] = np.equal(train['lon_1'], train['lon_2']).astype(np.int32)



#temp_df = train.apply(similarity, axis = 1)
#train['title_sim_1'] = temp_df['title_sim_1']
#train['title_sim_2'] = temp_df['title_sim_2']
#train['desc_sim_1'] = temp_df['desc_sim_1']
#train['desc_sim_2'] = temp_df['desc_sim_2']
#train['json_keys_sim_1'] = temp_df['json_keys_sim_1']
#train['json_keys_sim_2'] = temp_df['json_keys_sim_2']
#train['json_values_sim_1'] = temp_df['json_values_sim_1']
#train['json_values_sim_2'] = temp_df['json_values_sim_2']

print('Applying Ratios')

temp_df = train.apply(title_compare, axis = 1)
train['title_ratio_1'] = temp_df['title_ratio_1']
train['title_ratio_2'] = temp_df['title_ratio_2']

temp_df = train.apply(desc_compare, axis = 1)
train['desc_ratio_1'] = temp_df['desc_ratio_1']
train['desc_ratio_2'] = temp_df['desc_ratio_2']

print('Calculating Hash Distance')
#train['dhash_comp'] = train.apply(compare_dhashes, axis = 1)
#train['whash_comp'] = train.apply(compare_whashes, axis = 1)


# print(train.describe())
print('Create train data time: {} seconds'.format(round(time.time() - start_time, 2)))

gc.collect()

print('Training Set Completed')

Starting Data Prep
Calculating Lengths
Starting Item 1
Starting Item 2
Starting Stem


In [None]:
# Load Test Data

print("Load ItemPairs_train.csv")
pairs_test = pd.read_csv("ItemPairs_test.csv", dtype=types1)
# Add 'id' column for easy merge

print("Load ItemInfo_test.csv")
items_test = pd.read_csv("ItemInfo_test.csv", dtype=types2)
items_test.fillna(-1, inplace=True)

In [None]:
#Prepare Test Set
print('Starting Data Prep')
start_time = time.time()
test = pairs_test
test = train.drop(['generationMethod'], axis=1)

print('Add text features...')
items_test['len_title'] = items_test['title'].str.len()
print('Len Title')
items_test['len_description'] = items_test['description'].str.len()
print('Len Desc')
items_test['len_attrs_JSON'] = items_test['attrsJSON'].str.len()
print('Len JSON')
items_test['num_words_title'] = items_test['title'].map(lambda x: len(str(x).split()))
print('Num Words In Title')
items_test['num_words_description'] = items_test['description'].map(lambda x: len(str(x).split()))

print('Merge item 1...')
item1 = items_test[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon', 
'len_title', 'len_description', 'len_attrs_JSON', 'num_words_title', 
'num_words_description', 'title', 'description', 'attrsJSON']]
item1 = pd.merge(item1, category, how='left', on='categoryID', left_index=True)
item1 = pd.merge(item1, location, how='left', on='locationID', left_index=True)



item1 = item1.rename(
    columns={
        'itemID': 'itemID_1',
        'categoryID': 'categoryID_1',
        'parentCategoryID': 'parentCategoryID_1',
        'price': 'price_1',
        'locationID': 'locationID_1',
        'regionID': 'regionID_1',
        'metroID': 'metroID_1',
        'lat': 'lat_1',
        'lon': 'lon_1',
        'len_title': 'len_title_1',
        'len_description': 'len_description_1',
        'len_attrs_JSON': 'len_attrs_JSON_1',
        'num_words_title': 'num_words_title_1',
        'num_words_description': 'num_words_description_1',
        'title': 'title_1',
        'description': 'description_1',
        'attrsJSON': 'JSON_1'
    }
)

# Add item 1 data
test = pd.merge(train, item1, how='left', on='itemID_1', left_index=True)


print('Merge item 2...')
item2 = items_test[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon', 
'len_title', 'len_description', 'len_attrsJSON', 'num_words_title', 
'num_words_description', 'title', 'description', 'attrsJSON']]
item2 = pd.merge(item2, category, how='left', on='categoryID', left_index=True)
item2 = pd.merge(item2, location, how='left', on='locationID', left_index=True)

item2 = item2.rename(
    columns={
        'itemID': 'itemID_2',
        'categoryID': 'categoryID_2',
        'parentCategoryID': 'parentCategoryID_2',
        'price': 'price_2',
        'locationID': 'locationID_2',
        'regionID': 'regionID_2',
        'metroID': 'metroID_2',
        'lat': 'lat_2',
        'lon': 'lon_2',
        'len_title': 'len_title_2',
        'len_description': 'len_description_2',
        'len_attrs_JSON': 'len_attrs_JSON_2',
        'num_words_title': 'num_words_title_2',
        'num_words_description': 'num_words_description_2',
        'title': 'title_2',
        'description': 'description_2',
        'attrsJSON': 'JSON_2'
    }
)

# Add item 2 data
test = pd.merge(train, item2, how='left', on='itemID_2', left_index=True)

# Create same arrays
print('Create same arrays')

print('Stemming')
start_stem = time.time()
test['title_1'] = test['title_1'].map(lambda x: str_stem(x))
test['description_1'] = test['description_1'].map(lambda x: str_stem(x))
test['title_2'] = test['title_2'].map(lambda x: str_stem(x))
test['description_2'] = test['description_2'].map(lambda x: str_stem(x))
finish_stem = time.time()
print('Seconds to stem/stopword removal on training data: ', str((finish_stem - start_stem)))

test['price_same'] = np.equal(test['price_1'], test['price_2']).astype(np.int32)
test['locationID_same'] = np.equal(test['locationID_1'], test['locationID_2']).astype(np.int32)
test['categoryID_same'] = np.equal(test['categoryID_1'], test['categoryID_2']).astype(np.int32)
test['regionID_same'] = np.equal(test['regionID_1'], test['regionID_2']).astype(np.int32)
test['metroID_same'] = np.equal(test['metroID_1'], test['metroID_2']).astype(np.int32)
test['lat_same'] = np.equal(test['lat_1'], test['lat_2']).astype(np.int32)
test['lon_same'] = np.equal(test['lon_1'], test['lon_2']).astype(np.int32)

print('Start Similarity')

temp_df = train.apply(similarity, axis = 1)
test['title_sim_1'] = temp_df['title_sim_1']
test['title_sim_2'] = temp_df['title_sim_2']
test['desc_sim_1'] = temp_df['desc_sim_1']
test['desc_sim_2'] = temp_df['desc_sim_2']
test['json_keys_sim_1'] = temp_df['json_keys_sim_1']
test['json_keys_sim_2'] = temp_df['json_keys_sim_2']
test['json_values_sim_1'] = temp_df['json_values_sim_1']
test['json_values_sim_2'] = temp_df['json_values_sim_2']


print('Start BOWs Compare')
temp_df = test.apply(title_compare, axis = 1)
test['title_ratio_1'] = temp_df['title_ratio_1']
test['title_ratio_2'] = temp_df['title_ratio_2']

temp_df = test.apply(desc_compare, axis = 1)
test['desc_ratio_1'] = temp_df['desc_ratio_1']
test['desc_ratio_2'] = temp_df['desc_ratio_2']


# print(train.describe())
print('Process Training set complete')
print('Create train data time: {} seconds'.format(round(time.time() - start_time, 2)))



In [9]:
train[:4]

Unnamed: 0,itemID_1,itemID_2,isDuplicate,generationMethod,categoryID_1,price_1,locationID_1,metroID_1,lat_1,lon_1,...,json_keys_sim_1,json_keys_sim_2,json_values_sim_1,json_values_sim_2,title_ratio_1,title_ratio_2,desc_ratio_1,desc_ratio_2,dhash_comp,whash_comp
2123,1,4112648,1,1,81,300000.0,648140,-1.0,64.686946,30.815924,...,1.0,1.0,0.0,0.0,10.0,10.0,10.0,10.0,1.0,1.0
1402,3,1991275,1,1,14,300000.0,639040,-1.0,55.678037,37.256548,...,1.0,1.0,0.0,0.0,10.0,10.0,10.0,1.0,0.75,1.0
1540,4,1223296,0,1,84,3500.0,640650,-1.0,56.239398,43.460458,...,1.0,1.0,0.0,0.0,2.0,2.0,0.333333,5.0,0.0,0.0
3386,7,1058851,1,1,84,13500.0,662210,-1.0,55.77717,37.586194,...,1.0,1.0,0.0,0.0,10.0,7.0,10.0,10.0,-1.0,-1.0
