In [28]:
import multiprocessing
from multiprocessing import Process
from multiprocessing import Manager

import sys, os
import xgboost
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import datetime
import time
import os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from ml_metrics import mapk
from sklearn.preprocessing import LabelEncoder

pd.options.mode.chained_assignment = None  # default='warn'

In [18]:
def prepare_data(df):
    """
    Feature engineering
    """

    minute = df.time % 60
    df['hour'] = df['time'] // 60
    #df.drop(['time'], axis=1, inplace=True)
    df['weekday'] = df['hour'] // 24
    df['month'] = df['weekday'] // 30
    df['year'] = (df['weekday'] // 365 + 1) * 10.0
    df['hour'] = ((df['hour'] % 24 + 1) + minute / 60.0) * 4.0
    df['weekday'] = (df['weekday'] % 7 + 1) * 3.0
    df['month'] = (df['month'] % 12 + 1) * 2.0
    df['accuracy'] = np.log10(df['accuracy']) * 10.0

    return df

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

train = prepare_data(train)
test = prepare_data(test)

In [71]:
def xfrange(start, end, step):
    gens = [];
    end = round(end, 2)
    start = round(start, 2)
    while(start < end):
        gens.append(start)
        start = round(start + step, 2)
            
    return gens
        
def gen_ranges(start, end, step):
    return zip(xfrange(start, end, step), xfrange(start + step, end + step, step));

size = 10.0;

x_step = 0.2
y_step = 0.08

x_ranges = gen_ranges(0, size, x_step);
y_ranges = gen_ranges(0, size, y_step);

In [35]:
size_cv = 2.0;

x_cv_start = 2;
x_cv_end = x_cv_start + size_cv
y_cv_start = 2;
y_cv_end = y_cv_start + size_cv;

cv = train[(train['x'] >= x_cv_start) & 
           (train['x'] <= x_cv_end) &
           (train['y'] >= y_cv_start) &
           (train['y'] <= y_cv_end)]

cv = cv.sort_values(by='time', axis=0, ascending=True)
train_cv = cv[:cv.shape[0]//7]
test_cv = cv[cv.shape[0]//7:]

print cv.shape
print train_cv.shape
print test_cv.shape

(1208069, 10)
(172581, 10)
(1035488, 10)


In [64]:
x_step = 0.2
y_step = 0.08

x_ranges_cv = gen_ranges(x_cv_start, x_cv_end, x_step);
y_ranges_cv = gen_ranges(y_cv_start, y_cv_end, y_step);

In [73]:
def process_column(x_min, x_max, y_ranges, x_end, y_end, train, test, raw_output, th, preds_total):
    start_time_column = time.time()
    preds_total[x_min] = pd.DataFrame();
    for y_min, y_max in  y_ranges: 
        start_time_cell = time.time()
        if x_max == x_end:
            x_max = x_end + 0.001

        if y_max == y_end:
            y_max = y_end + 0.001

        train_cell = train[(train['x'] >= x_min - 0.03) &
                           (train['x'] < x_max + 0.03) &
                           (train['y'] >= y_min - 0.015) &
                           (train['y'] < y_max + 0.015)]
        shape = train_cell.shape
        add_data = train_cell[train_cell.hour<10]# add data for periodic time that hit the boundary
        add_data.hour = add_data.hour+96

        add_data2 = train_cell[train_cell.hour>90]
        add_data2.hour = add_data2.hour-96

        train_cell = train_cell.append(add_data)
        train_cell = train_cell.append(add_data2)
        del add_data,add_data2
        #print str(shape) + 'after' + str(train_cell.shape)

        train_cell = train_cell.drop(['time'], axis=1)
        train_cell = train_cell.groupby("place_id").filter(lambda x: len(x) >= th)

        test_cell = test[(test['x'] >= x_min) &
                         (test['x'] < x_max) &
                         (test['y'] >= y_min) &
                         (test['y'] < y_max)]

        row_ids = test_cell['row_id'].reset_index(drop=True);
        test_cell = test_cell.drop(['row_id', 'time', 'year'], axis=1)

        le = LabelEncoder()

        y = le.fit_transform(train_cell.place_id.values)
        X = train_cell.drop(['row_id', 'place_id', 'year'], axis=1)

        # Construct DMatrices
        dm_train = xgboost.DMatrix(X.as_matrix(), label=y)
        dm_test = xgboost.DMatrix(test_cell.as_matrix())
        # use the XGBoost built in cross validation function,
        # stopping early to prevent overfitting
        '''res = xgboost.cv(
            {'eta': 0.1, 'objective': 'multi:softprob',
             'num_class': len(le.classes_),
             'alpha': 0.1, 'lambda': 0.1, 'booster': 'gbtree'},
            dm_train, num_boost_round=200, nfold=5, seed=42,
            early_stopping_rounds=10#, verbose_eval=10
            # For some reason, verbose_eval seems to be broken on my install
        )
        print(res)'''

        # this will be the number of epochs that (approximately) prevents
        # overfitting
        N_epochs = 35#res.shape[0]

        booster = xgboost.train(
            {'eta': 0.1, 'objective': 'multi:softprob',
             'num_class': len(le.classes_),
             'alpha': 0.1, 'lambda': 0.1, 'booster': 'gbtree'},
            dm_train, num_boost_round=N_epochs, verbose_eval=10)

        y_pred = booster.predict(dm_test)

        preds = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])
        preds = pd.DataFrame.from_dict(preds)
        preds['row_id'] = row_ids;
        preds = preds.set_index('row_id')
        preds.index.name = 'row_id';
        
        preds_all = dict(zip(le.classes_, zip(*y_pred)))
        preds_all = pd.DataFrame.from_dict(preds_all)
        preds_all.to_csv("./raw/" + raw_output + str(x_min) + "_" + str(y_min) + ".csv", index = False);
        #print preds_all.shape
        preds_total[x_min] = pd.concat([preds_total[x_min], preds], axis=0);
    print("Elapsed time column: %s minutes" % ((time.time() - start_time_column)/60))

def model(x_ranges, y_ranges, x_end, y_end, train, test, raw_output, th):   
    start_time = time.time()
    jobs = []
    mgr = Manager()
    preds_total = mgr.dict();

    for x_min, x_max in  x_ranges:
        p = multiprocessing.Process(target=process_column, args=(x_min, x_max, y_ranges, \
                                                                 x_end, y_end, train, test, raw_output, 3, preds_total))
        jobs.append(p)
        p.start()
        if len(jobs) == 5:
            for proc in jobs:
                proc.join();
            jobs = [];
        
    print("Elapsed time overall: %s minutes" % ((time.time() - start_time)/60))
    
    preds_total = pd.concat(preds_total.values(), axis=0);
    print preds_total.shape
    
    return preds_total.sort_values(by='row_id', axis=0, ascending=True);

def modelq(x_ranges, y_ranges, x_end, y_end, train, test, raw_output, th):   
    start_time = time.time()
    jobs = []
    mgr = Manager()
    preds_total = mgr.dict();

    for x_min, x_max in  x_ranges:
        
        if x_max == x_end:
            x_max = x_max + 0.001
                
        train_column = train[(train['x'] >= x_min - 0.03) &
                             (train['x'] < x_max + 0.03)]
        
        test_column = test[(test['x'] >= x_min) &
                             (test['x'] < x_max)]
        
        p = multiprocessing.Process(target=process_column, args=(x_min, x_max, y_ranges, \
                                                                 x_end, y_end, train_column, test_column,
                                                                 raw_output, th, preds_total))
        jobs.append(p)
        p.start()
        if len(jobs) == 10:
            for proc in jobs:
                proc.join();
            jobs = [];
        
    print("Elapsed time overall: %s minutes" % ((time.time() - start_time)/60))
    
    preds_total = pd.concat(preds_total.values(), axis=0);
    print preds_total.shape
    
    return preds_total.sort_index();

In [66]:
predictions = modelq(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1),\
                    'cv/xgb/', 8);
actual = test_cv[['place_id']].sort();
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 8.69479350249 minutes
Elapsed time column: 9.02213106553 minutes
Elapsed time column: 9.17870934804 minutes
Elapsed time column: 9.91766578356 minutes
Elapsed time overall: 9.92024463415 minutes
(1035488, 3)


  from ipykernel import kernelapp as app


0.446849215056


In [65]:
print predictions.shape
print predictions[:5]
print actual.shape
print actual[:5]

(1035488, 3)
              0           1           2
16   3369021322  4939682720  6052372179
82   9313893105  4184942588  1785058724
91   4639582171  4464263185  1312011163
123  8724107029  1940379729  9735105400
163  4634909749  8980163153  3610237287
(1035488, 1)
       place_id
16   2123587484
82   9313893105
91   4639582171
123  8724107029
163  3869813743


In [74]:
preds_total = modelq(x_ranges, y_ranges, size, size, train, test, 
                    'xgb_150_02_008_extended_003-0015_th3_accuracy_adddata/', 3)

preds_total['place_id'] = preds_total['l1'].apply(str) + ' ' + preds_total['l2'].apply(str) + ' ' + preds_total['l3'].apply(str);
sub_file = os.path.join('xgb_150_02_008_extended_003-0015_th3_accuracy_adddata'
                        + str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.csv')
preds_total[['row_id','place_id']].to_csv(sub_file, index = False);
print sub_file

Elapsed time column: 124.139323052 minutes
Elapsed time column: 179.587287267 minutes
Elapsed time column: 189.770548499 minutes
Elapsed time column: 192.302193685 minutes
Elapsed time column: 192.742986917 minutes
Elapsed time column: 194.542433751 minutes
Elapsed time column: 201.535967366 minutes
Elapsed time column: 203.951551517 minutes
Elapsed time column: 206.031371534 minutes
Elapsed time column: 206.343225133 minutes
Elapsed time column: 235.390047598 minutes
Elapsed time column: 235.831229516 minutes
Elapsed time column: 236.753945001 minutes
Elapsed time column: 238.187316132 minutes
Elapsed time column: 238.430221732 minutes
Elapsed time column: 238.464797548 minutes
Elapsed time column: 241.935420982 minutes
Elapsed time column: 243.377277565 minutes
Elapsed time column: 243.528731918 minutes
Elapsed time column: 243.571000954 minutes
Elapsed time column: 242.315373067 minutes
Elapsed time column: 243.8879209 minutes
Elapsed time column: 247.358157949 minutes
Elapsed time 

KeyError: 'l1'

In [75]:
preds_total.shape

(8607230, 3)

In [76]:
preds_total[:10]

Unnamed: 0_level_0,0,1,2
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3831655216,6131996960,1466246422
1,2465239230,5801740503,4634106612
2,2516481553,7862615088,5946611563
3,7995458948,8643187406,8393706174
4,4764406629,8277155346,8711861736
5,8370753254,9727638738,6305916485
6,7283245557,9054319794,7282698430
7,4346049470,9175474378,2247481267
8,6470278079,9340499987,6421756522
9,3894834079,6980929490,4495746994


In [83]:
preds_total[-10:]

Unnamed: 0_level_0,l1,l2,l3
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8607220,9924871949,1023555506,8253094049
8607221,8933542378,7878855704,5378587809
8607222,4683526794,2813163259,7429659042
8607223,9244044921,2614601100,3628085189
8607224,7519528207,7336608920,7053690910
8607225,6388207576,1450752225,4798840125
8607226,9482625693,4985409672,3071064812
8607227,6422555124,9466512367,3066735997
8607228,6645978051,2649611830,6144260946
8607229,2341683842,1221929656,4358647823


In [84]:
preds_total = pd.DataFrame(preds_total, dtype=str, columns=['l1', 'l2', 'l3'])
#Concatenating the 3 predictions for each sample
ds_sub = preds_total.l1.str.cat([preds_total.l2, preds_total.l3], sep=' ')
ds_sub.name = 'place_id'
ds_sub.to_csv('xgb_150_02_008_extended_003-0015_th3_accuracy_adddata1', index=True, header=True, index_label='row_id')