In [1]:
import multiprocessing
from multiprocessing import Process
from multiprocessing import Manager

import math
import xgboost
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import datetime
import time
import os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from ml_metrics import mapk
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

In [10]:
def xfrange(start, end, step):
    gens = [];
    end = round(end, 5)
    start = round(start, 5)
    while(start < end):
        gens.append(start)
        start = round(start + step, 5)
            
    return gens

def xfrangeq(start, end, n_steps):
    gens = [];
    x_slice = (end - start) / n_steps
    for i in range(n_steps):
        x_min = start + x_slice * i
        gens.append(round(x_min,12))

    return gens

def gen_ranges(start, end, step):
    return zip(xfrange(start, end, step), xfrange(start + step, end + step, step));

def gen_rangesq(start, end, n_steps):
    return zip(xfrangeq(start, end, n_steps), xfrangeq(end/n_steps, end + end/n_steps, n_steps));

size = 10.0;
x_step = 10.0/12.0
y_step = 0.4

x_ranges = gen_rangesq(0, size, 12);
y_ranges = gen_rangesq(0, size, 25);

print x_ranges
print y_ranges

[(0.0, 0.833333333333), (0.833333333333, 1.666666666667), (1.666666666667, 2.5), (2.5, 3.333333333333), (3.333333333333, 4.166666666667), (4.166666666667, 5.0), (5.0, 5.833333333333), (5.833333333333, 6.666666666667), (6.666666666667, 7.5), (7.5, 8.333333333333), (8.333333333333, 9.166666666667), (9.166666666667, 10.0)]
[(0.0, 0.4), (0.4, 0.8), (0.8, 1.2), (1.2, 1.6), (1.6, 2.0), (2.0, 2.4), (2.4, 2.8), (2.8, 3.2), (3.2, 3.6), (3.6, 4.0), (4.0, 4.4), (4.4, 4.8), (4.8, 5.2), (5.2, 5.6), (5.6, 6.0), (6.0, 6.4), (6.4, 6.8), (6.8, 7.2), (7.2, 7.6), (7.6, 8.0), (8.0, 8.4), (8.4, 8.8), (8.8, 9.2), (9.2, 9.6), (9.6, 10.0)]


In [4]:
size_cv = 10.0;

x_cv_start = 0;
x_cv_end = x_cv_start + size_cv
y_cv_start = 0;
y_cv_end = y_cv_start + size_cv;

cv = train[(train['x'] >= x_cv_start) & 
           (train['x'] <= x_cv_end) &
           (train['y'] >= y_cv_start) &
           (train['y'] <= y_cv_end)]

cv = cv.sort_values(by='time', axis=0, ascending=True)
train_cv = cv[cv.shape[0]//4:]
test_cv = cv[:cv.shape[0]//4]

print cv.shape
print train_cv.shape
print test_cv.shape

(29118021, 6)
(21838516, 6)
(7279505, 6)


In [63]:
x_ranges_cv = gen_rangesq(x_cv_start, x_cv_end, 1);
y_ranges_cv = gen_rangesq(y_cv_start, y_cv_end, 25);

print x_ranges_cv
print y_ranges_cv

[(9.166666666667, 10.0)]
[(0.0, 0.4), (0.4, 0.8), (0.8, 1.2), (1.2, 1.6), (1.6, 2.0), (2.0, 2.4), (2.4, 2.8), (2.8, 3.2), (3.2, 3.6), (3.6, 4.0), (4.0, 4.4), (4.4, 4.8), (4.8, 5.2), (5.2, 5.6), (5.6, 6.0), (6.0, 6.4), (6.4, 6.8), (6.8, 7.2), (7.2, 7.6), (7.6, 8.0), (8.0, 8.4), (8.4, 8.8), (8.8, 9.2), (9.2, 9.6), (9.6, 10.0)]


In [None]:
for x_min, x_max in  x_ranges_cv:
    step_x_small = 0.2
    x_min_to_load = round(int(x_min/step_x_small)*step_x_small, 4)
    for x_min_small in xfrange(x_min_to_load, x_max, step_x_small):
        print x_min_small

In [66]:
def load_preds_from_disc(input_dir, x_min, y_min, x_end, y_end, x_max, y_max, step_x_small, step_y_small):
    
    x_min_to_load = round(int(x_min/step_x_small)*step_x_small, 4)
    y_min_to_load = round(int(y_min/step_y_small)*step_y_small, 4)
    
    chunk_total = pd.DataFrame();
    for x_min_small in xfrange(x_min_to_load, x_max, step_x_small):
        
        x_max_small = x_min_small + step_x_small
        
        if x_max_small == x_end:
            x_max_small = x_end + 0.001
        
        for y_min_small in xfrange(y_min_to_load, y_max, step_y_small):
            
            y_max_small = y_min_small + step_y_small
            
            if y_max_small == y_end:
                y_max_small = y_end + 0.001
                
            test_cell = test[(test['y'] >= y_min_small) &
                             (test['y'] < y_max_small)&
                             (test['x'] >= x_min_small) &
                             (test['x'] < x_max_small)]
            
            row_ids = test_cell['row_id'].reset_index(drop=True);
            
            chunk = pd.read_csv(input_dir + str(x_min_small) + "_" + str(y_min_small) + ".csv");
            chunk.rename(columns = lambda x: int(x), inplace=True);
            #print row_ids.shape
            #print chunk.shape
            chunk['row_id'] = row_ids;
            chunk = chunk.set_index('row_id')
            chunk_total = chunk_total.add(chunk, fill_value=0).fillna(value=0);
                
    return chunk_total;

def process_column(x_min, x_max, y_ranges, x_end, y_end, train, test, preds_total):
    start_time_column = time.time()
    preds_total[x_min] = pd.DataFrame();
    for y_min, y_max in  y_ranges: 
        start_time_cell = time.time()

        preds_all = pd.read_csv('./raw/knn1.5+rf1-lb-58369/' + str(x_min) + "_" + str(y_min) + ".csv", index_col='row_id')
        preds_all.rename(columns = lambda x: int(x), inplace=True);
        
        #chunk_rf = pd.read_csv('./raw/cached/rf_50_gini+entropy_02_008_aug_0.05-0.017_th3_accuracy_adddata/'
        #                       + str(x_min) + "_" + str(y_min) + ".csv", index_col='row_id')

        
        chunk_rf = load_preds_from_disc('./raw/rf_50_gini+entropy_02_008_aug_0.05-0.017_th3_accuracy_adddata/',
                                        x_min, y_min, 10.0, 10.0, x_max, y_max, 0.2, 0.08)
        chunk_rf.rename(columns = lambda x: int(x), inplace=True);
        
        chunk_rf = chunk_rf[chunk_rf.index.isin(preds_all.index.values)]
        chunk_rf.sort_index().to_csv('./raw/cached/rf_50_gini+entropy_02_008_aug_0.05-0.017_th3_accuracy_adddata/'
                                     + str(x_min) + "_" + str(y_min) + ".csv", index = True);
        
        #chunk_xgb = pd.read_csv('./raw/cached/xgb_150_02_008_extended_003-0015_th3_accuracy_adddata/'
        #                       + str(x_min) + "_" + str(y_min) + ".csv", index_col='row_id')
        chunk_xgb = load_preds_from_disc('./raw/xgb_150_02_008_extended_003-0015_th3_accuracy_adddata/',
                                        x_min, y_min, 10.0, 10.0, x_max, y_max, 0.2, 0.08)
        chunk_xgb.rename(columns = lambda x: int(x), inplace=True);
        
        chunk_xgb = chunk_xgb[chunk_xgb.index.isin(preds_all.index.values)]
        chunk_xgb.sort_index().to_csv('./raw/cached/xgb_150_02_008_extended_003-0015_th3_accuracy_adddata/'
                                     + str(x_min) + "_" + str(y_min) + ".csv", index = True);
        
        #chunk_rf = pd.read_csv('./raw/rf-lb-58369/' + str(x_min) + "_" + str(y_min) + ".csv",
        #                       index_col='row_id')
        
        '''for col in preds_all.columns:
            preds_all[col] *= 1.5'''
        
        for col in chunk_rf.columns:
            chunk_rf[col] *= 0.4
            
        for col in chunk_xgb.columns:
            chunk_xgb[col] *= 0.2
            
        summed = preds_all.add(chunk_rf, fill_value=0).add(chunk_xgb, fill_value=0).fillna(value=0);
        
        #summed.to_csv('./raw/knn+rf-1.5-1-lb-58369/' + str(x_min) + "_" + str(y_min) + ".csv", index_col='row_id')
        
        summed['l1'], summed['l2'], summed['l3'] = \
            zip(*summed.apply(lambda x: summed.columns[x.argsort()[::-1][:3]].tolist(), axis=1));
            
        summed = summed[['l1','l2','l3']];
        preds_total[x_min] = pd.concat([preds_total[x_min], summed], axis=0);
             
    print("Elapsed time column: %s minutes" % ((time.time() - start_time_column)/60))

def model(x_ranges, y_ranges, x_end, y_end, train, test):   
    start_time = time.time()
    jobs = []
    mgr = Manager()
    preds_total = mgr.dict();

    for x_min, x_max in  x_ranges:
        
        p = multiprocessing.Process(target=process_column, args=(x_min, x_max, y_ranges, \
                                                                 x_end, y_end, train, test, preds_total))
        jobs.append(p)
        p.start()
        if len(jobs) == 1:
            for proc in jobs:
                proc.join();
            jobs = [];
        
    print("Elapsed time overall: %s minutes" % ((time.time() - start_time)/60))
    
    preds_total = pd.concat(preds_total.values(), axis=0);
    print preds_total.shape
    
    return preds_total.sort_index();

In [None]:
model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));

In [62]:
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train, test_cv.drop(['place_id'], axis=1));

Process Process-269:
Traceback (most recent call last):
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-59-796a4db134d1>", line 48, in process_column
    + str(x_min) + "_" + str(y_min) + ".csv", index_col='row_id')
  File "/Library/Python/2.7/site-packages/pandas-0.17.1-py2.7-macosx-10.10-intel.egg/pandas/io/parsers.py", line 498, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/Library/Python/2.7/site-packages/pandas-0.17.1-py2.7-macosx-10.10-intel.egg/pandas/io/parsers.py", line 275, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/Library/Python/2.7/site-packages/pandas-0.17.1-py2.7-macosx-10.10-intel.egg/pandas/io/parsers.py", line 590, in __init__
    s

KeyboardInterrupt: 

    summed['l1'], summed['l2'], summed['l3'] =             zip(*summed.apply(lambda x: summed.columns[x.argsort()[::-1][:3]].tolist(), axis=1));
KeyboardInterrupt
  File "/Library/Python/2.7/site-packages/pandas-0.17.1-py2.7-macosx-10.10-intel.egg/pandas/core/frame.py", line 3972, in apply
    return self._apply_standard(f, axis, reduce=reduce)
  File "/Library/Python/2.7/site-packages/pandas-0.17.1-py2.7-macosx-10.10-intel.egg/pandas/core/frame.py", line 4026, in _apply_standard
    labels=labels)
  File "pandas/src/reduce.pyx", line 613, in pandas.lib.reduce (pandas/lib.c:43773)
Process Process-264:
  File "pandas/src/reduce.pyx", line 132, in pandas.lib.Reducer.get_result (pandas/lib.c:33913)
Traceback (most recent call last):
  File "<ipython-input-59-796a4db134d1>", line 87, in <lambda>
  File "/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
    summed['l1'], summed['l2'], summed['l3'] =             zip(*s

In [55]:
predictions.shape

(7911778, 3)

In [13]:
# 1:1
#predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
#predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

  app.launch_new_instance()


(7279505, 1)
0.647978239363


In [15]:
# 2:1
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 30.4403275331 minutes
Elapsed time column: 31.4096225699 minutes
Elapsed time column: 33.6390999158 minutes
Elapsed time column: 34.9108743509 minutes
Elapsed time column: 36.042342933 minutes
Elapsed time column: 36.2695594152 minutes
Elapsed time column: 36.394053634 minutes
Elapsed time column: 36.508232832 minutes
Elapsed time column: 36.658953615 minutes
Elapsed time column: 36.7901131829 minutes
Elapsed time column: 36.8177387993 minutes
Elapsed time column: 36.8746377349 minutes
Elapsed time overall: 36.8773222804 minutes
(7279505, 3)


  app.launch_new_instance()


(7279505, 1)
0.648229813245


In [18]:
# 4:1
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 38.4620311975 minutes
Elapsed time column: 39.2670983831 minutes
Elapsed time column: 41.8771187345 minutes
Elapsed time column: 43.5470228831 minutes
Elapsed time column: 44.4005404353 minutes
Elapsed time column: 44.8322546999 minutes
Elapsed time column: 45.0872049133 minutes
Elapsed time column: 45.1210696022 minutes
Elapsed time column: 45.1813807646 minutes
Elapsed time column: 45.3620578011 minutes
Elapsed time column: 45.4705918511 minutes
Elapsed time column: 45.5853589495 minutes
Elapsed time overall: 45.5880936027 minutes
(7279505, 3)


  app.launch_new_instance()


(7279505, 1)
0.647146245979


In [22]:
# 3:1
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 14.2427910328 minutes
Elapsed time column: 14.7059950868 minutes
Elapsed time column: 15.473421669 minutes
Elapsed time column: 15.8806776007 minutes
Elapsed time column: 16.2274898489 minutes
Elapsed time column: 16.4376385808 minutes
Elapsed time column: 16.4534943859 minutes
Elapsed time column: 16.4945543488 minutes
Elapsed time column: 16.5008187691 minutes
Elapsed time column: 16.5399011691 minutes
Elapsed time column: 16.5737236818 minutes
Elapsed time column: 16.6257001519 minutes
Elapsed time overall: 16.6281627297 minutes
(7279505, 3)


  app.launch_new_instance()


(7279505, 1)
0.647658139759


In [25]:
# 1.5:1
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 14.9665695985 minutes
Elapsed time column: 15.3974535982 minutes
Elapsed time column: 16.2040850163 minutes
Elapsed time column: 16.6524471521 minutes
Elapsed time column: 17.0386225184 minutes
Elapsed time column: 17.2591627479 minutes
Elapsed time column: 17.2834522645 minutes
Elapsed time column: 17.2958891988 minutes
Elapsed time column: 17.3387034973 minutes
Elapsed time column: 17.3708709478 minutes
Elapsed time column: 17.3977789998 minutes
Elapsed time column: 17.4600841165 minutes
Elapsed time overall: 17.4626104196 minutes
(7279505, 3)


  app.launch_new_instance()


(7279505, 1)
0.648358141568


In [29]:
# 1.75:1
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 15.1652602315 minutes
Elapsed time column: 15.4857278824 minutes
Elapsed time column: 16.2854601304 minutes
Elapsed time column: 16.7138969819 minutes
Elapsed time column: 17.1094597022 minutes
Elapsed time column: 17.3059388161 minutes
Elapsed time column: 17.3574807167 minutes
Elapsed time column: 17.3591024478 minutes
Elapsed time column: 17.3978994846 minutes
Elapsed time column: 17.4323835174 minutes
Elapsed time column: 17.4713435809 minutes
Elapsed time column: 17.5198141336 minutes
Elapsed time overall: 17.5222983003 minutes
(7279505, 3)


  app.launch_new_instance()


(7279505, 1)
0.648321531936


In [33]:
# 1.25:1
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 14.7934530338 minutes
Elapsed time column: 15.2159933647 minutes
Elapsed time column: 15.9233281493 minutes
Elapsed time column: 16.3699216326 minutes
Elapsed time column: 16.7171323816 minutes
Elapsed time column: 16.9315239986 minutes
Elapsed time column: 16.9447553992 minutes
Elapsed time column: 16.984264799 minutes
Elapsed time column: 16.9979028662 minutes
Elapsed time column: 17.0344791174 minutes
Elapsed time column: 17.0703659336 minutes
Elapsed time column: 17.1149411996 minutes
Elapsed time overall: 17.1172762831 minutes
(7279505, 3)


  app.launch_new_instance()


(7279505, 1)
0.64827498573


In [35]:
# 1.625:1
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 14.3525497993 minutes
Elapsed time column: 14.7467328191 minutes
Elapsed time column: 15.4248833497 minutes
Elapsed time column: 15.8553809166 minutes
Elapsed time column: 16.2372039835 minutes
Elapsed time column: 16.4154511333 minutes
Elapsed time column: 16.4494563659 minutes
Elapsed time column: 16.487549603 minutes
Elapsed time column: 16.4933715343 minutes
Elapsed time column: 16.5310536822 minutes
Elapsed time column: 16.5758609494 minutes
Elapsed time column: 16.6134159644 minutes
Elapsed time overall: 16.6159875035 minutes
(7279505, 3)


  app.launch_new_instance()


(7279505, 1)
0.648355623082


In [53]:
#predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
#actual = test_cv[['place_id']].sort();
predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

(7279505, 1)
0.64374626663


In [59]:
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 4.18145811558 minutes
Elapsed time column: 4.18293550014 minutes
Elapsed time column: 4.28711673419 minutes
Elapsed time column: 4.36312333345 minutes
Elapsed time column: 4.36676673492 minutes
Elapsed time column: 4.4031859676 minutes
Elapsed time column: 4.41110750039 minutes
Elapsed time column: 4.41602480014 minutes
Elapsed time column: 4.41773593028 minutes
Elapsed time column: 4.41846118371 minutes
Elapsed time column: 4.36977894704 minutes
Elapsed time column: 4.39419483344 minutes
Elapsed time column: 4.39500201543 minutes
Elapsed time column: 4.39964491924 minutes
Elapsed time column: 4.40976421833 minutes
Elapsed time column: 4.43874106407 minutes
Elapsed time column: 4.44669770002 minutes
Elapsed time column: 4.4484692653 minutes
Elapsed time column: 4.47444631656 minutes
Elapsed time column: 4.48419839938 minutes
Elapsed time column: 4.47482994795 minutes
Elapsed time column: 4.49865211646 minutes
Elapsed time column: 4.52512131532 minutes
Elapsed time 

  from ipykernel import kernelapp as app


(7279505, 1)
0.630077411399


In [65]:
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
predictions[['l1','l2','l3']]  = predictions[['l1','l2','l3']].astype('int64')
print actual.shape
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 2.43738391399 minutes
Elapsed time column: 2.47274431785 minutes
Elapsed time column: 2.50941873391 minutes
Elapsed time column: 2.56711030006 minutes
Elapsed time column: 2.57242245277 minutes
Elapsed time column: 2.59250754913 minutes
Elapsed time column: 2.60327646732 minutes
Elapsed time column: 2.6047546347 minutes
Elapsed time column: 2.61355179946 minutes
Elapsed time column: 2.61733216842 minutes
Elapsed time column: 2.66868231694 minutes
Elapsed time column: 2.68114528259 minutes
Elapsed time column: 2.70505626599 minutes
Elapsed time column: 2.71511448224 minutes
Elapsed time column: 2.7185640653 minutes
Elapsed time column: 2.75397520065 minutes
Elapsed time column: 2.75972094933 minutes
Elapsed time column: 2.76139810085 minutes
Elapsed time column: 2.77201389869 minutes
Elapsed time column: 2.80608005126 minutes
Elapsed time column: 2.67047816515 minutes
Elapsed time column: 2.74321651856 minutes
Elapsed time column: 2.74925008217 minutes
Elapsed time 

  from ipykernel import kernelapp as app


(7279505, 1)
0.629864026927


In [50]:
print predictions.dtypes

l1    object
l2    object
l3    object
dtype: object


In [51]:
actual.dtypes

place_id    int64
dtype: object

In [11]:
preds_total = model(x_ranges, y_ranges, size, size, train, test)

Elapsed time column: 15.6289599657 minutes
Elapsed time column: 17.8379606684 minutes
Elapsed time column: 18.6429466009 minutes
Elapsed time column: 19.3398334702 minutes
Elapsed time column: 20.1132657687 minutes
Elapsed time column: 21.3575217684 minutes
Elapsed time column: 21.5296566327 minutes
Elapsed time column: 21.5479592999 minutes
Elapsed time column: 21.5744784673 minutes
Elapsed time column: 22.1243719856 minutes
Elapsed time column: 19.1988959511 minutes
Elapsed time column: 19.4457983812 minutes
Elapsed time column: 19.6714697838 minutes
Elapsed time column: 20.130786399 minutes
Elapsed time column: 20.3939861337 minutes
Elapsed time column: 16.0720068494 minutes
Elapsed time column: 17.3344372988 minutes
Elapsed time column: 18.6908761342 minutes
Elapsed time column: 18.7616165837 minutes
Elapsed time column: 19.5174747507 minutes
Elapsed time overall: 82.1650434494 minutes
(8607230, 3)


In [12]:
preds_total1 = preds_total.applymap(str)
preds_total1.columns = ['l1', 'l2', 'l3'];
print('Writing submission file')
preds_total1 = preds_total1.l1.str.cat([preds_total1.l2, preds_total1.l3], sep=' ')
preds_total1.name = ['place_id'];
preds_total1.to_csv('submission_stack_4x_2X_1X.csv', index=True, header=True, index_label='row_id')

Writing submission file


In [22]:
preds_total1.to_csv('submission_stack_4x_2X_1X.csv', index=True, header=True, index_label='row_id')

In [21]:
train[:20]

Unnamed: 0,row_id,x,y,accuracy,time,place_id,hour,weekday,month,year
0,0,0.7941,9.0809,17.323938,470702,8523065625,88.133333,15,22,10
1,1,5.9567,4.7968,11.139434,186555,1757726713,57.0,12,10,10
2,2,8.3078,7.0407,18.692317,322648,1137537235,9.866667,3,16,10
3,3,7.3665,2.5165,18.129134,704587,6567393236,32.466667,21,10,20
4,4,4.0961,1.1307,14.913617,472130,7440663949,87.333333,18,22,10
5,5,3.8099,1.9586,18.750613,178065,6289802927,67.0,15,10,10
6,6,6.3336,4.372,11.139434,666829,9931249544,11.266667,6,8,20
7,7,5.7409,6.7697,19.294189,369002,5662813655,28.133333,15,18,10
8,8,4.3114,6.941,4.771213,166384,8471780938,56.266667,12,8,10
9,9,6.3414,0.0758,18.129134,400060,1253803156,82.666667,15,20,10


In [77]:
preds_total['place_id'] = preds_total['l1'].apply(str) + ' ' + preds_total['l2'].apply(str) + ' ' + preds_total['l3'].apply(str);

In [78]:
preds_total['place_id'].to_csv('summed_total_fix_submitable.csv', header=True);

In [4]:
preds_total = pd.read_csv('./summed_rf_knn_adddata.csv')

In [6]:
preds_total.shape

(8607230, 2)

In [40]:
print preds_total[-15:]
print test[15:]

          row_id                          place_id
8607215  8607229  2341683842 5086776815 1221929656
8607216      NaN  7276485501 1987770840 9940153795
8607217      NaN  9429226559 8516734099 4185473526
8607218      NaN  9796247868 6984803265 4864454069
8607219      NaN  6252994738 8604489285 6339939984
8607220      NaN  4314460660 3123784059 5349902730
8607221      NaN  7809383965 4777992086 7738504775
8607222      NaN  6425263733 3986169799 6602374893
8607223      NaN  4895014859 3703851555 2110925726
8607224      NaN  8444416732 8518754015 5003377269
8607225      NaN  4017621362 1643674303 1612082183
8607226      NaN  3340795021 2057449143 8971759038
8607227      NaN  5204771649 8929683468 3676075937
8607228      NaN  8827756453 5369509161 5155595138
8607229      NaN  1915818458 4538240724 6412424861
          row_id       x       y   accuracy     time       hour  weekday  \
15            15  0.2898  3.0031  17.558749   909801  81.400000        6   
16            16  5.2386  1.8262

In [39]:
print set(preds_total.row_id.values)
print len(set(test.row_id.values))

set([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 8607229.0, nan, nan])
8607230


In [15]:
set(preds_total.row_id.values)

TypeError: 'set' object has no attribute '__getitem__'

In [58]:
preds_total1  =  preds_total[preds_total['row_id'] == preds_total['row_id']]

In [59]:
preds_total1['row_id']  = preds_total1['row_id'].astype(int)

In [63]:
set1 = set(preds_total1.row_id.values)
set2 = set(test.row_id.values)

In [71]:
preds_missing  =  preds_total[preds_total['row_id'] != preds_total['row_id']]
missing = list(set2 - set1)

In [66]:
preds_total1 = preds_total1.set_index('row_id')
preds_total1.index.name = 'row_id';

In [68]:
preds_total1[-14:]

Unnamed: 0_level_0,place_id
row_id,Unnamed: 1_level_1
8607216,4314460660 6248581507 3513132876
8607217,5842923271 5052683076 7295234444
8607218,5435941405 7815053482 6653133495
8607219,8364800794 4920677861 4132439450
8607220,9924871949 3278278289 4295380287
8607221,8933542378 8792295459 5378587809
8607222,4683526794 2813163259 7429659042
8607223,2614601100 9361412597 4642612962
8607224,7519528207 7053690910 7336608920
8607225,6388207576 1450752225 8305731085


In [70]:
print test[test['row_id'].isin(missing)]

          row_id   x       y   accuracy    time       hour  weekday  month  \
187603    187603  10  0.9917  18.450980  887615  42.333333        3     18   
3105384  3105384  10  8.0616  17.708520  975746  61.733333       18     22   
3184243  3184243  10  3.8558  22.405492  799577  29.133333        9     14   
3722078  3722078  10  9.9011  23.117539  978106  27.066667        3     22   
3777156  3777156  10  7.2781  18.129134  927602  20.133333        3     20   
4669744  4669744  10  3.5732  13.010300  907455  21.000000        3     20   
5293602  5293602  10  5.2668  22.095150  944056  61.066667       15     20   
5384155  5384155  10  1.9184  18.260748  806636  19.733333        3     14   
6738507  6738507  10  4.1714  17.403627  894461  18.733333       18     18   
6949762  6949762  10  7.3085  17.781513  937723  22.866667        3     20   
6951548  6951548  10  6.4704  18.692317  952481  46.733333       12     22   
7548906  7548906  10  7.5501  17.853298  873328  49.866667      

In [73]:
preds_missing["row_id"] = missing

In [75]:
preds_missing  = preds_missing.set_index('row_id')
preds_missing.index.name = 'row_id';

In [76]:
preds_missing

Unnamed: 0_level_0,place_id
row_id,Unnamed: 1_level_1
5293602,7276485501 1987770840 9940153795
3777156,9429226559 8516734099 4185473526
3184243,9796247868 6984803265 4864454069
3105384,6252994738 8604489285 6339939984
7548906,4314460660 3123784059 5349902730
6738507,7809383965 4777992086 7738504775
6949762,6425263733 3986169799 6602374893
4669744,4895014859 3703851555 2110925726
187603,8444416732 8518754015 5003377269
8082232,4017621362 1643674303 1612082183


In [77]:
preds_total1 = pd.concat([preds_total1, preds_missing], axis=0);

In [78]:
preds_total1 = preds_total1.sort_index()

In [79]:
preds_total1['place_id'].to_csv('summed_rf_knn_adddata_missing.csv', header=True);