In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
import random
from operator import itemgetter
import zipfile
from sklearn.metrics import roc_auc_score
import time
import os
random.seed(2016)

path = ('~/Datasets/Avito')

In [2]:
def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()    

In [3]:
def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap = 'xgb.fmap')
    importance = sorted(importance.items(), key = itemgetter(1), reverse = True)
    return importance   

In [4]:
def intersect(a, b):
    return list(set(a) & set(b))

def print_features_importances(imp):
    for i in range(len(imp)):
        print("# " + str(imp[i][1]))
        print('output.remove(\'' + imp[i][0] + '\')')

In [12]:
def run_default_test(train, test, features, target, random_state = 0):
    eta = 0.1
    max_depth = 5
    subsample = 0.8
    colsample_bytree = 0.8
    start_time = time.time()
    
    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta,
          max_depth, subsample, colsample_bytree))
    params = {
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'eval_metric': 'auc',
        'eta': eta,
        'max_depth': max_depth,
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'silent': 1,
        'seed': random_state
    }
    num_boost_round = 260
    early_stopping_rounds = 20
    test_size = 0.1
    
    X_train, X_valid = train_test_split(train, test_size = test_size, random_state = random_state)
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)
    
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals = watchlist,
        early_stopping_rounds = early_stopping_rounds, verbose_eval = True)
    
    print('Validating...')
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit = gbm.best_ntree_limit)
    score = roc_auc_score(X_valid[target].values, check)
    print('Check error value: {:.6f}'.format(score))
    
    imp = get_importance(gbm, features)
    print('Importances array: ', imp)
    
    print('Predict test set...')
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit = gbm.best_ntree_limit)
    
    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return test_prediction.tolist(), score

In [13]:
def create_submission(score, test, prediction):
    # Make submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime('%Y-%m-%d-%H-%M')) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('id,probability\n')
    total = 0
    for id in test['id']:
        str1 = str(id) + ',' + str(prediction[total])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()

In [14]:
def get_features(train, test):
    trainval = list(train.columns.values)
    testval = list(test.columns.values)
    output = intersect(trainval, testval)
    output.remove('itemID_1')
    output.remove('itemID_2')
    return output

In [15]:
def prep_train():
    testing = 0
    start_time = time.time()
    
    types1 = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'isDuplicate': np.dtype(int),
        'generationMethod': np.dtype(int)
    }
    
    types2 = {
        'itemID': np.dtype(int),
        'categoryID': np.dtype(int),
        'title': np.dtype(str),
        'description': np.dtype(str),
        'images_array': np.dtype(str),
        'attrsJSON': np.dtype(str),
        'price': np.dtype(float),
        'locationID': np.dtype(int),
        'metroID': np.dtype(float),
        'lat': np.dtype(float),
        'lon': np.dtype(float),
    }
    
    print('Load ItemPairs_train.csv')
    pairs = pd.read_csv('~/Datasets/Avito/ItemPairs_train.csv', dtype = types1)
    # Add 'id' column for easy merge
    print('Load ItemInfo_train.csv')
    items = pd.read_csv('~/Datasets/Avito/ItemInfo_train.csv', dtype = types2)
    items.fillna(-1, inplace = True)
    location = pd.read_csv('~/Datasets/Avito/Location.csv')
    category = pd.read_csv('~/Datasets/Avito/Category.csv')
    
    train = pairs
    train = train.drop(['generationMethod'], axis = 1)
    
    print('Add text features...')
    train['len_title'] = items['title'].str.len()
    train['len_description'] = items['description'].str.len()
    train['len_attrsJSON'] = items['attrsJSON'].str.len()
    
    print('Merge items 1...')
    item1 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon']]
    item1 = pd.merge(item1, category, how = 'left', on = 'categoryID', left_index = True)
    item1 = pd.merge(item1, location, how = 'left', on = 'locationID', left_index = True)
    
    item1 = item1.rename(
        columns = {
            'itemID': 'itemID_1',
            'categoryID': 'categoryID_1',
            'parentCategoryID': 'parentCategoryID_1',
            'price': 'price_1',
            'locationID': 'locationID_1',
            'regionID': 'regionID_1',
            'metroID': 'metroID_1',
            'lat': 'lat_1',
            'lon': 'lon_1'
        })
    
     # Add item 1 data
    train = pd.merge(train, item1, how='left', on='itemID_1', left_index=True)

    print('Merge item 2...')
    item2 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon']]
    item2 = pd.merge(item2, category, how='left', on='categoryID', left_index=True)
    item2 = pd.merge(item2, location, how='left', on='locationID', left_index=True)

    item2 = item2.rename(
        columns={
            'itemID': 'itemID_2',
            'categoryID': 'categoryID_2',
            'parentCategoryID': 'parentCategoryID_2',
            'price': 'price_2',
            'locationID': 'locationID_2',
            'regionID': 'regionID_2',
            'metroID': 'metroID_2',
            'lat': 'lat_2',
            'lon': 'lon_2'
        }
    )
    
    # Add item 2 data
    train = pd.merge(train, item2, how='left', on='itemID_2', left_index=True)
    
    # Create same arrays
    print('Create same arrays')
    train['price_same'] = np.equal(train['price_1'], train['price_2']).astype(np.int32)
    train['locationID_same'] = np.equal(train['locationID_1'], train['locationID_2']).astype(np.int32)
    train['categoryID_same'] = np.equal(train['categoryID_1'], train['categoryID_2']).astype(np.int32)
    train['regionID_same'] = np.equal(train['regionID_1'], train['regionID_2']).astype(np.int32)
    train['MetroID_same'] = np.equal(train['metroID_1'], train['metroID_2']).astype(np.int32)
    train['lat_same'] = np.equal(train['lat_1'], train['lat_2']).astype(np.int32)
    train['lon_same'] = np.equal(train['lon_1'], train['lon_2']).astype(np.int32)
    
    print('Create train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return train

In [16]:
def prep_test():
    start_time = time.time()

    types1 = {
        'itemID_1': np.dtype(int),
        'itemID_2': np.dtype(int),
        'id': np.dtype(int),
    }

    types2 = {
        'itemID': np.dtype(int),
        'categoryID': np.dtype(int),
        'title': np.dtype(str),
        'description': np.dtype(str),
        'images_array': np.dtype(str),
        'attrsJSON': np.dtype(str),
        'price': np.dtype(float),
        'locationID': np.dtype(int),
        'metroID': np.dtype(float),
        'lat': np.dtype(float),
        'lon': np.dtype(float),
    }
    
    print('Load ItemPairs_test.csv')
    pairs = pd.read_csv('~/Datasets/Avito/ItemPairs_test.csv', dtype = types1)
    print('Load ItemInfo_testcsv')
    items = pd.read_csv('~/Datasets/Avito/ItemInfo_test.csv', dtype = types2)
    items.fillna(-1, inplace = True)
    location = pd.read_csv('~/Datasets/Avito/Location.csv')
    category = pd.read_csv('~/Datasets/Avito/Category.csv')
    
    test = pairs
    
    print('Add text features...')
    test['len_title'] = items['title'].str.len()
    test['len_description'] = items['description'].str.len()
    test['len_attrsJSON'] = items['attrsJSON'].str.len()
    
    print('Merge item 1...')
    item1 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon']]
    item1 = pd.merge(item1, category, how = 'left', on = 'categoryID', left_index = True)
    item1 = pd.merge(item1, location, how = 'left', on = 'locationID', left_index = True)
    
    item1 = item1.rename(
        columns = {
            'itemID': 'itemID_1',
            'categoryID': 'categoryID_1',
            'parentCategoryID': 'parentCategoryID_1',
            'price': 'price_1',
            'locationID': 'locationID_1',
            'regionID': 'regionID_1',
            'metroID': 'metroID_1',
            'lat': 'lat_1',
            'lon': 'lon_1'
        })
    
    # Add item 1 data
    test = pd.merge(test, item1, how = 'left', on = 'itemID_1', left_index = True)
    
    print('Merge item 2...')
    item2 = items[['itemID', 'categoryID', 'price', 'locationID', 'metroID', 'lat', 'lon']]
    item2 = pd.merge(item2, category, how = 'left', on = 'categoryID', left_index = True)
    item2 = pd.merge(item2, location, how = 'left', on = 'locationID', left_index = True)
    
    item2 = item2.rename(
        columns = {
            'itemID': 'itemID_2',
            'categoryID': 'categoryID_2',
            'parentCategoryID': 'parentCategoryID_2',
            'price': 'price_2',
            'locationID': 'locationID_2',
            'regionID': 'regionID_2',
            'metroID': 'metroID_2',
            'lat': 'lat_2',
            'lon': 'lon_2'
        })
    
    # Add item 2 data
    test = pd.merge(test, item2, how ='left', on = 'itemID_2', left_index = True)
    
    # Create same arrays
    print('Create same arrays')
    test['price_same'] = np.equal(test['price_1'], test['price_2']).astype(np.int32)
    test['locationID_same'] = np.equal(test['locationID_1'], test['locationID_2']).astype(np.int32)
    test['categoryID_same'] = np.equal(test['categoryID_1'], test['categoryID_2']).astype(np.int32)
    test['regionID_same'] = np.equal(test['regionID_1'], test['regionID_2']).astype(np.int32)
    test['metorID_same'] = np.equal(test['metroID_1'], test['metroID_2']).astype(np.int32)
    test['lat_same'] = np.equal(test['lat_1'], test['lat_2']).astype(np.int32)
    test['lon_same'] = np.equal(test['lat_1'], test['lat_2']).astype(np.int32)
    
    print(test.describe())
    print('Create test data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return test

In [17]:
def read_test_train():
    train = prep_train()
    test = prep_test()
    train.fillna(-1, inplace = True)
    test.fillna(-1, inplace = True)
    
    # Get only subset of data
    if 1:
        len_old = len(train.index)
        train = train.sample(frac = 0.5)
        len_new = len(train.index)
        print('Reduce train from {} to {}'.format(len_old, len_new))
    features = get_features(train, test)
    return train, test, features

In [18]:
train, test, features = read_test_train()
print('Length of train: ', len(train))
print('Length of test: ', len(test))
print('Features [{}]: {}'.format(len(features), sorted(features)))
test_prediction, score = run_default_test(train, test, features, 'isDuplicate')
print('Real score = {}'.format(score))
create_submission(score, test, test_prediction)

Load ItemPairs_train.csv
Load ItemInfo_train.csv
Add text features...
Merge items 1...
Merge item 2...
Create same arrays
Create train data time: 43.52 seconds
Load ItemPairs_test.csv
Load ItemInfo_testcsv
Add text features...
Merge item 1...
Merge item 2...
Create same arrays
                 id      itemID_1      itemID_2     len_title  \
count  1.044196e+06  1.044196e+06  1.044196e+06  1.044195e+06   
mean   5.220975e+05  2.027628e+06  4.065207e+06  2.278319e+01   
std    3.014336e+05  1.438573e+06  1.443087e+06  1.187001e+01   
min    0.000000e+00  5.000000e+00  5.847000e+03  1.000000e+00   
25%    2.610488e+05  8.107795e+05  3.040622e+06  1.400000e+01   
50%    5.220975e+05  1.780676e+06  4.309976e+06  2.100000e+01   
75%    7.831462e+05  3.037476e+06  5.285806e+06  3.000000e+01   
max    1.044195e+06  6.105866e+06  6.111991e+06  5.000000e+01   

       len_description  len_attrsJSON  categoryID_1       price_1  \
count     1.044158e+06   1.010558e+06  1.044196e+06  1.044196e+06  

Will train until eval error hasn't decreased in 20 rounds.
[0]	train-auc:0.727787	eval-auc:0.729461
[1]	train-auc:0.733909	eval-auc:0.735503
[2]	train-auc:0.748158	eval-auc:0.749490
[3]	train-auc:0.747957	eval-auc:0.749187
[4]	train-auc:0.750892	eval-auc:0.751945
[5]	train-auc:0.751542	eval-auc:0.752646
[6]	train-auc:0.757013	eval-auc:0.758105
[7]	train-auc:0.758712	eval-auc:0.759816
[8]	train-auc:0.759496	eval-auc:0.760618
[9]	train-auc:0.759137	eval-auc:0.760306
[10]	train-auc:0.762078	eval-auc:0.763310
[11]	train-auc:0.762726	eval-auc:0.763851
[12]	train-auc:0.765118	eval-auc:0.766132
[13]	train-auc:0.765850	eval-auc:0.766916
[14]	train-auc:0.766623	eval-auc:0.767655
[15]	train-auc:0.767495	eval-auc:0.768521
[16]	train-auc:0.768561	eval-auc:0.769497
[17]	train-auc:0.768841	eval-auc:0.769759
[18]	train-auc:0.769222	eval-auc:0.770082
[19]	train-auc:0.770019	eval-auc:0.770910
[20]	train-auc:0.770614	eval-auc:0.771515
[21]	train-auc:0.771027	eval-auc:0.771917
[22]	train-auc:0.772769	eva

Validating...
Check error value: 0.822297
Importances array:  [('categoryID_2', 1115), ('price_2', 1085), ('price_1', 948), ('parentCategoryID_2', 677), ('lat_1', 479), ('lon_2', 461), ('lon_1', 461), ('lat_2', 459), ('locationID_1', 310), ('locationID_2', 297), ('categoryID_1', 248), ('price_same', 183), ('metroID_2', 168), ('metroID_1', 164), ('locationID_same', 111), ('regionID_same', 90), ('regionID_1', 85), ('len_attrsJSON', 71), ('regionID_2', 68), ('parentCategoryID_1', 68), ('len_title', 67), ('lon_same', 61), ('lat_same', 57), ('len_description', 47)]
Predict test set...
Training time: 4.44 minutes
Real score = 0.8222974730546484
Writing submission:  submission_0.822297473055_2016-05-15-00-25.csv


In [14]:
import zipfile
import os
import io
from PIL import Image
import datetime
import pandas as pd
import numpy as np

In [2]:
os.chdir('/home/valesco/Datasets/Avito/')

In [15]:

def dhash(image,hash_size = 16):
    image = image.convert('LA').resize((hash_size+1, hash_size), Image.ANTIALIAS)
    mat = np.array(
        list(map(lambda x: x[0], image.getdata()))
    ).reshape(hash_size, hash_size+1)
    
    return ''.join(
        map(
            lambda x: hex(x)[2:].rjust(2,'0'),
            np.packbits(np.fliplr(np.diff(mat) < 0))
        )
    )
    

for zip_counter in [0,1,2,3,4,5,6,7,8,9]:
    imgzipfile = zipfile.ZipFile('Images_'+str(zip_counter)+'.zip')
    print ('Doing zip file ' + str(zip_counter))
    #namelist = imgzipfile.namelist()
    # Comment this line below and uncomment the above line when you do for the whole set
    namelist = imgzipfile.namelist()[:10]
    print ('Total elements ' + str(len(namelist)))

    img_id_hash = []
    counter = 1
    for name in namelist:
        #print name
        #try:
        imgdata = imgzipfile.read(name)
        if len(imgdata) >0:
            img_id = name[:-4]
            stream = io.BytesIO(imgdata)
            img = Image.open(stream)
            img_hash = dhash(img)
            img_id_hash.append([img_id,img_hash])
            counter+=1
        # Uncomment the lines below to get an idea of progress when you do for the whole set
        if counter%10000==0:
            print('Done ' + str(counter) , datetime.datetime.now())
        #except:
            #print ('Could not read ' + str(name) + ' in zip file ' + str(zip_counter))
    df = pd.DataFrame(img_id_hash,columns=['image_id','image_hash'])
    df.to_csv('image_hash_' + str(zip_counter) + '.csv')
    

Doing zip file 0
Total elements 10
Doing zip file 1
Total elements 10
Doing zip file 2
Total elements 10
Doing zip file 3
Total elements 10
Doing zip file 4
Total elements 10
Doing zip file 5
Total elements 10
Doing zip file 6
Total elements 10
Doing zip file 7
Total elements 10
Doing zip file 8
Total elements 10
Doing zip file 9
Total elements 10


In [18]:
import pickle

In [19]:
hash_file = pickle.load(open('dhash.pkl', 'rb'))

In [21]:
len(hash_file)

10824317