In [1]:
import xgboost as xgb

from sklearn.linear_model import LogisticRegression

import pandas as pd
import numpy as np
import datetime
import time
import os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

import multiprocessing
from multiprocessing import Process
from multiprocessing import Manager

In [2]:
# Read the raw training and test tables (paths are relative to the working
# directory). A generator expression is used so no loop variable is left
# behind in the notebook namespace.
train, test = (pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv'))

In [4]:
# Upper bound of the x/y coordinate range; also used when building the grid.
size = 10.0

# Optional filter restricting both tables to the [0, size] square (disabled).
#train = train[(train['x'] >= 0) & (train['x'] <= size) & (train['y'] >= 0) & (train['y'] <= size)]
#test = test[(test['x'] >= 0) & (test['x'] <= size) & (test['y'] >= 0) & (test['y'] <= size)]
print(train.shape)
print(test.shape)


def add_time_features(df):
    """Add columns derived from ``df['time']`` to ``df`` in place and return it.

    Columns written: 'hour', 'hour_decimal', 'weekday', 'month', 'year'.
    The divisors (60, 1440, 43200, 525600) correspond to an hour, a day,
    a 30-day month and a 365-day year if 'time' is counted in minutes --
    presumably it is; confirm against the data description.
    """
    timestamp = df['time']
    df['hour'] = (timestamp // 60) % 24 + 1           # integer, 1 to 24
    # Float divisor so the fractional part is kept even where '/' on
    # integers would floor; the value lies in [1, 25).
    df['hour_decimal'] = (timestamp / 60.0) % 24 + 1
    df['weekday'] = (timestamp // 1440) % 7 + 1       # 1 to 7
    df['month'] = (timestamp // 43200) % 12 + 1       # rough estimate, month = 30 days
    df['year'] = (timestamp // 525600) + 1
    return df


print('Calculate hour, weekday, month and year for train and test')
add_time_features(train)
add_time_features(test)

print('shape after time engineering')
print(train.shape)

(29118021, 6)
(8607230, 5)
Calculate hour, weekday, month and year for train and test
shape after time engineering
(29118021, 11)


In [None]:
# Width of a grid cell along each axis.
x_step = 0.5
y_step = 0.02

# Lower and upper edges of every cell along x, paired up as (min, max).
x_lower_edges = np.arange(0, size, x_step)
x_upper_edges = np.arange(x_step, size + x_step, x_step)
x_ranges = zip(x_lower_edges, x_upper_edges)

# Same construction along y.
y_lower_edges = np.arange(0, size, y_step)
y_upper_edges = np.arange(y_step, size + y_step, y_step)
y_ranges = zip(y_lower_edges, y_upper_edges)

def _top_k_labels(classes, proba, k=3):
    """Return the ``k`` highest-scoring class labels for every row of ``proba``.

    classes -- 1-D sequence of labels, one per column of ``proba``.
    proba   -- 2-D array of scores, one row per sample.

    The result is a 2-D array with one row per sample, best label first.
    A stable sort on the negated scores is used, so equal scores are
    ranked by ascending column position.
    """
    ranking = np.argsort(-np.asarray(proba), axis=1, kind='mergesort')[:, :k]
    return np.asarray(classes)[ranking]


def worker(x_min, x_max, train, test, preds_total):
    """Fit one classifier per grid cell of an x strip and collect top-3 predictions.

    x_min, x_max -- bounds of the strip; rows with x_min <= x < x_max are used.
    train, test  -- DataFrames holding 'x', 'y', 'row_id' (test), 'place_id'
                    (train) and the feature columns listed below.
    preds_total  -- dict-like object; the strip's predictions are stored in it
                    under the rounded ``x_min`` key.

    The y cells come from the module-level ``y_ranges`` and the upper domain
    bound from the module-level ``size``. Each cell's predictions are written
    to './raw/xgb1/<x_min>_<y_min>.csv' and the strip's combined predictions
    to './tots/xgb1/<x_min>.csv'. Nothing is returned.
    """
    feature_cols = ['x', 'y', 'accuracy', 'time', 'hour', 'hour_decimal',
                    'weekday', 'month', 'year']
    start_time_worker = time.time()

    # The strip bounds do not change per cell, so normalise them once.
    x_min = round(x_min, 4)
    x_max = round(x_max, 4)
    if x_max == size:
        # Widen the last strip slightly so points lying exactly on the
        # upper bound are not lost to the strict '<' comparison.
        x_max = x_max + 0.001

    # Filter on x once instead of re-scanning the full frames for every cell.
    train_strip = train[(train['x'] >= x_min) & (train['x'] < x_max)]
    test_strip = test[(test['x'] >= x_min) & (test['x'] < x_max)]

    cell_frames = []
    for y_lo, y_hi in y_ranges:
        y_lo = round(y_lo, 4)
        y_hi = round(y_hi, 4)
        if y_hi == size:
            y_hi = y_hi + 0.001

        train_grid = train_strip[(train_strip['y'] >= y_lo) & (train_strip['y'] < y_hi)]
        test_grid = test_strip[(test_strip['y'] >= y_lo) & (test_strip['y'] < y_hi)]

        # Nothing to predict in this cell: skip it instead of fitting a model.
        if test_grid.empty:
            continue

        # Alternative classifiers previously left commented out here:
        # GradientBoostingClassifier(), LogisticRegression(multi_class='multinomial',
        # solver='lbfgs'), RandomForestClassifier(n_estimators=300, n_jobs=-1).
        clf = xgb.XGBClassifier(n_estimators=8, nthread=-1)
        clf.fit(train_grid[feature_cols], train_grid['place_id'].values)

        proba = clf.predict_proba(test_grid[feature_cols])
        preds = pd.DataFrame(_top_k_labels(clf.classes_, proba, 3),
                             columns=['0_', '1_', '2_'])
        # Positional assignment: preds rows are in the same order as test_grid.
        preds['row_id'] = test_grid['row_id'].values

        # The bounds are floats, so they must be converted before concatenation.
        preds.to_csv('./raw/xgb1/' + str(x_min) + '_' + str(y_lo) + '.csv')
        cell_frames.append(preds)

    # Concatenate once at the end rather than growing a frame inside the loop.
    if cell_frames:
        preds_total_worker = pd.concat(cell_frames, axis=0)
    else:
        preds_total_worker = pd.DataFrame()

    preds_total_worker.to_csv('./tots/xgb1/' + str(x_min) + '.csv')
    preds_total[x_min] = preds_total_worker
    print("Elapsed time row: %s minuts" % ((time.time() - start_time_worker)/60))


start_time = time.time()

# One process per x strip. Each worker stores its predictions in a
# Manager-backed dict so they can be read back in this process.
jobs = []
mgr = Manager()
shared_preds = mgr.dict()

for x_min, x_max in x_ranges:
    p = multiprocessing.Process(target=worker, args=(x_min, x_max, train, test, shared_preds))
    jobs.append(p)
    p.start()

for proc in jobs:
    proc.join()

# A worker that died leaves its strip out of the shared dict; stop here
# rather than writing a submission with rows silently missing.
failed_jobs = [proc.name for proc in jobs if proc.exitcode != 0]
if failed_jobs:
    raise RuntimeError('worker process(es) failed: ' + ', '.join(failed_jobs))

preds_total = pd.concat(list(shared_preds.values()), axis=0)

# Compare the number of predicted rows against the test table.
print(test.shape)
print(preds_total.shape)

preds_total = preds_total.sort_values(by='row_id', axis=0, ascending=True)
# Space-separated top-3 labels, best first.
preds_total['place_id'] = (preds_total['0_'].apply(str) + ' ' +
                           preds_total['1_'].apply(str) + ' ' +
                           preds_total['2_'].apply(str))

sub_file = 'submission_from_disc$$_' + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + '.csv'
preds_total[['row_id', 'place_id']].to_csv(sub_file, index=False)

print("Elapsed time overall: %s minuts" % ((time.time() - start_time)/60))
