In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import datetime
import time
import os
from sklearn.ensemble import RandomForestClassifier
from ml_metrics import mapk
from sklearn.preprocessing import LabelEncoder

import multiprocessing
from multiprocessing import Process
from multiprocessing import Manager

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
def prepare_data(df):
    """
    Derive scaled calendar features from the raw ``time`` column.

    ``time`` is treated as a count of minutes. The frame is modified in
    place (columns ``hour``, ``weekday``, ``month`` and ``year`` are added,
    ``accuracy`` is replaced by 10 * log10 of itself) and the same object is
    returned.

    NOTE(review): another function with this name is defined further down
    in the notebook; whichever cell ran last is the one in effect.
    """
    minutes_total = df['time']
    minute_of_hour = minutes_total % 60
    hours_total = minutes_total // 60
    days_total = hours_total // 24
    months_total = days_total // 30  # 30-day "months"

    # Each feature is 1-based and multiplied by a fixed per-feature scale.
    df['hour'] = ((hours_total % 24 + 1) + minute_of_hour / 60.0) * 4.0
    df['weekday'] = (days_total % 7 + 1) * 3.0
    df['month'] = (months_total % 12 + 1) * 2.0
    df['year'] = (days_total // 365 + 1) * 10.0
    df['accuracy'] = np.log10(df['accuracy']) * 10.0

    return df

# Load both competition files and derive the time-based features right away.
train = prepare_data(pd.read_csv('./input/train.csv'))
test = prepare_data(pd.read_csv('./input/test.csv'))

In [2]:
def apply_weights(df, fw):
    """Scale the engineered feature columns of ``df`` by the weights in ``fw``.

    The frame is modified in place and returned. Index 1, 2 and 3 of ``fw``
    are each shared by a sin/cos column pair. ``fw[4]`` and ``fw[5]`` are not
    used here; ``x`` and ``y`` are left unscaled by this function.
    """
    # (column, index into fw), applied in this order.
    column_weights = (
        ('accuracy', 0),
        ('day_of_year_sin', 1),
        ('day_of_year_cos', 1),
        ('minute_sin', 2),
        ('minute_cos', 2),
        ('weekday_sin', 3),
        ('weekday_cos', 3),
        ('year', 6),
    )
    for column, weight_index in column_weights:
        df[column] *= fw[weight_index]
    return df

def prepare_data(df):
    """Add cyclic time features to ``df`` in place and return it.

    ``time`` is treated as minutes. Three periodic quantities (5-minute slot
    of the day, day of a 365-day year, day of a 7-day week) are each encoded
    as a ``sin``/``cos`` pair shifted into [0, 2] and rounded to 4 decimals.
    ``year`` is the number of whole 525600-minute years elapsed, and
    ``accuracy`` is replaced by its base-10 logarithm.

    NOTE(review): this reuses the name of an earlier ``prepare_data`` in the
    notebook and therefore replaces it once this cell has run.
    """
    def _cyclic_pair(position, period):
        # Map a position within ``period`` onto the unit circle.
        angle = 2 * np.pi * position / period
        return (np.sin(angle) + 1).round(4), (np.cos(angle) + 1).round(4)

    timestamps = df['time']
    days_elapsed = timestamps // 1440

    df['minute_sin'], df['minute_cos'] = _cyclic_pair((timestamps // 5) % 288, 288)
    df['day_of_year_sin'], df['day_of_year_cos'] = _cyclic_pair(days_elapsed % 365, 365)
    df['weekday_sin'], df['weekday_cos'] = _cyclic_pair(days_elapsed % 7, 7)

    df['year'] = (timestamps // 525600).astype(float)
    df['accuracy'] = np.log10(df['accuracy']).astype(float)
    return df

# Feature weights, by index:
#   0 accuracy, 1 day-of-year sin/cos, 2 minute sin/cos, 3 weekday sin/cos,
#   4 x and 5 y (applied per grid cell inside process_column), 6 year.
fw = [0.61,0.32435, 0.56525, 0.2670, 22, 52, 0.51885]

# Re-read the raw files so the features are derived from unmodified columns,
# then scale them.
train = apply_weights(prepare_data(pd.read_csv('./input/train.csv')), fw)
test = apply_weights(prepare_data(pd.read_csv('./input/test.csv')), fw)

In [3]:
def xfrange(start, end, step):
    """Return ``[start, start + step, ...]`` up to but excluding ``end``.

    ``start``, ``end`` and every generated value are rounded to 2 decimal
    places, so accumulated floating-point error does not leak into the grid
    boundaries.

    Raises:
        ValueError: if values remain to be generated but ``step`` does not
            move the rounded position forward (zero, negative, or smaller
            than the 2-decimal resolution). Previously this looped forever.
    """
    end = round(end, 2)
    current = round(start, 2)
    points = []
    while current < end:
        points.append(current)
        following = round(current + step, 2)
        if following <= current:
            raise ValueError(
                "step %r does not advance past %r at 2-decimal resolution"
                % (step, current))
        current = following

    return points
        
def gen_ranges(start, end, step):
    """Return a list of ``(lower, upper)`` bounds tiling ``[start, end)``.

    Consecutive pairs are ``step`` apart; the lower bounds come from
    ``xfrange(start, end, step)`` and the upper bounds from the same
    sequence shifted by one step.

    The pairs are materialised into a list because callers iterate the same
    ranges several times (once per grid column and again for later runs); a
    lazy ``zip`` object would be empty after its first pass.
    """
    lower_bounds = xfrange(start, end, step)
    upper_bounds = xfrange(start + step, end + step, step)
    return list(zip(lower_bounds, upper_bounds))

# Side length of the square map, and the width/height of one grid cell.
size = 10.0
x_step, y_step = 0.2, 0.08

# (min, max) bounds of every grid column and row over the full map.
x_ranges = gen_ranges(0, size, x_step)
y_ranges = gen_ranges(0, size, y_step)

In [4]:
# Square window of the map used for local validation.
size_cv = 10.0

x_cv_start = 0
y_cv_start = 0
x_cv_end = x_cv_start + size_cv
y_cv_end = y_cv_start + size_cv

# Keep only check-ins inside the window (both edges inclusive), ordered by time.
cv = train[
    (train['x'] >= x_cv_start) & (train['x'] <= x_cv_end) &
    (train['y'] >= y_cv_start) & (train['y'] <= y_cv_end)
].sort_values(by='time', axis=0, ascending=True)

# NOTE(review): after the ascending sort, the hold-out set is the EARLIEST
# eighth of the rows and the model is fit on the later seven eighths.
# Confirm this is intended rather than holding out the most recent rows.
test_cv = cv[:cv.shape[0] // 8]
train_cv = cv[cv.shape[0] // 8:]

print(cv.shape)
print(train_cv.shape)
print(test_cv.shape)

x_ranges_cv = gen_ranges(x_cv_start, x_cv_end, x_step)
y_ranges_cv = gen_ranges(y_cv_start, y_cv_end, y_step)

(29118021, 13)
(25478269, 13)
(3639752, 13)


In [5]:
def _cell_probabilities(criterion, X, y, test_values, le, row_ids):
    """Fit one random forest with ``criterion`` and score ``test_values``.

    Returns a DataFrame indexed by ``row_id`` with one column per place_id
    (decoded through ``le``) holding the predicted class probability.
    """
    clf = RandomForestClassifier(n_estimators=65, max_depth=None, n_jobs=-1,
                                 min_samples_split=4, random_state=0,
                                 criterion=criterion)
    clf.fit(X, y)
    proba = clf.predict_proba(test_values)

    # One column of probabilities per original place_id.
    frame = dict(zip(le.inverse_transform(clf.classes_), zip(*proba)))
    frame = pd.DataFrame.from_dict(frame)
    frame['row_id'] = row_ids
    return frame.set_index('row_id')


def process_column(x_min, x_max, y_ranges, x_end, y_end, train, test, raw_output, th, preds_total):
    """Fit and predict every grid cell of one vertical strip ``[x_min, x_max)``.

    For each ``(y_min, y_max)`` in ``y_ranges`` a gini forest and an entropy
    forest are trained on the training rows inside the cell (widened by a
    small margin), their probabilities are summed, the summed probabilities
    are written to ``./raw/<raw_output><x_min>_<y_min>.csv`` and the three
    most probable place_ids per test row are appended to
    ``preds_total[x_min]`` as columns ``l1``, ``l2``, ``l3`` indexed by
    ``row_id``.

    Args:
        x_min, x_max: bounds of this column of the grid.
        y_ranges: iterable of ``(y_min, y_max)`` cell bounds.
        x_end, y_end: outer edge of the whole area; a cell touching it is
            widened by 0.001 so rows exactly on the edge are included.
        train: frame with at least ``x``, ``y``, ``time``, ``row_id`` and
            ``place_id``; remaining columns are used as features.
        test: frame with the same feature columns plus ``row_id`` and
            ``time`` (no ``place_id``).
        raw_output: path fragment placed between ``./raw/`` and the file name.
        th: minimum number of training rows a place needs inside the cell
            to be kept as a candidate class.
        preds_total: dict-like shared result store, keyed by ``x_min``.

    Reads the module-level ``fw`` list for the ``x`` (index 4) and ``y``
    (index 5) feature weights.

    Cells with no test rows, or with no training rows left after the ``th``
    filter, are skipped instead of letting the estimator raise and abort the
    remainder of the column.

    NOTE(review): a cell with fewer than three candidate places would still
    fail at the three-way unpack into l1/l2/l3.
    """
    start_time_column = time.time()
    preds_total[x_min] = pd.DataFrame()

    # Loop-invariant: include rows lying exactly on the outer right edge.
    if x_max == x_end:
        x_max = x_end + 0.001

    for y_min, y_max in y_ranges:
        if y_max == y_end:
            y_max = y_end + 0.001

        test_cell = test[(test['x'] >= x_min) &
                         (test['x'] < x_max) &
                         (test['y'] >= y_min) &
                         (test['y'] < y_max)]
        if test_cell.empty:
            # Nothing to predict in this cell.
            continue

        # Training rows come from a slightly larger window than the cell.
        train_cell = train[(train['x'] >= x_min - 0.05) &
                           (train['x'] < x_max + 0.05) &
                           (train['y'] >= y_min - 0.017) &
                           (train['y'] < y_max + 0.017)]

        train_cell = train_cell.drop(['time'], axis=1)
        if not train_cell.empty:
            # Drop places with fewer than ``th`` samples in this window.
            train_cell = train_cell.groupby("place_id").filter(lambda grp: len(grp) >= th)

        if train_cell.empty:
            print("Skipping cell x=%s y=%s: no training rows, %d test rows left unpredicted"
                  % (x_min, y_min, len(test_cell)))
            continue

        row_ids = test_cell['row_id'].reset_index(drop=True)
        test_cell = test_cell.drop(['row_id', 'time'], axis=1)

        #Feature engineering on x and y
        train_cell.loc[:,'x'] *= fw[4]
        train_cell.loc[:,'y'] *= fw[5]
        test_cell.loc[:,'x'] *= fw[4]
        test_cell.loc[:,'y'] *= fw[5]

        le = LabelEncoder()

        y = le.fit_transform(train_cell.place_id.values)
        X = train_cell.drop(['row_id', 'place_id'], axis=1)

        # Same forest twice, differing only in split criterion; summed below.
        preds_all = _cell_probabilities('gini', X, y, test_cell.values, le, row_ids)
        preds_all2 = _cell_probabilities('entropy', X, y, test_cell.values, le, row_ids)

        preds = preds_all.add(preds_all2).fillna(value=0)

        # NOTE(review): index=False means row_id is not written to the raw file.
        preds.to_csv("./raw/" + raw_output + str(x_min) + "_" + str(y_min) + ".csv", index = False)

        # Column labels (place_ids) of the three largest summed probabilities.
        preds['l1'], preds['l2'], preds['l3'] = \
            zip(*preds.apply(lambda x: preds.columns[x.argsort()[::-1][:3]].tolist(), axis=1))

        preds = preds[['l1','l2','l3']]

        # Re-assign (rather than mutate) so the shared dict sees the update.
        preds_total[x_min] = pd.concat([preds_total[x_min], preds], axis=0)

    print("Elapsed time column: %s minutes" % ((time.time() - start_time_column)/60))

def model(x_ranges, y_ranges, x_end, y_end, train, test, raw_output, th, batch_size=5):
    """Run ``process_column`` for every grid column and collect the results.

    Columns are processed in separate processes, ``batch_size`` at a time;
    each batch is joined before the next one is started.

    Args:
        x_ranges: iterable of ``(x_min, x_max)`` column bounds.
        y_ranges: iterable of ``(y_min, y_max)`` row bounds, reused for
            every column.
        x_end, y_end: outer edge of the area, forwarded to ``process_column``.
        train, test: data frames forwarded to ``process_column``.
        raw_output: path fragment for the raw probability files.
        th: minimum samples per place, forwarded to ``process_column``
            (previously a literal 3 was passed regardless of this argument).
        batch_size: number of worker processes run concurrently.

    Returns:
        DataFrame with columns ``l1``, ``l2``, ``l3`` for all processed test
        rows, sorted by index.
    """
    start_time = time.time()
    # Materialise once: the same ranges are handed to every worker.
    y_ranges = list(y_ranges)

    mgr = Manager()
    preds_total = mgr.dict()

    jobs = []
    for x_min, x_max in x_ranges:
        p = multiprocessing.Process(target=process_column, args=(x_min, x_max, y_ranges,
                                                                 x_end, y_end, train,
                                                                 test, raw_output, th, preds_total))
        jobs.append(p)
        p.start()
        if len(jobs) == batch_size:
            for proc in jobs:
                proc.join()
            jobs = []

    # Wait for the final, partially filled batch; without this its results
    # could be missing from preds_total when it is read below.
    for proc in jobs:
        proc.join()

    print("Elapsed time overall: %s minutes" % ((time.time() - start_time)/60))

    preds_total = pd.concat(list(preds_total.values()), axis=0)
    print(preds_total.shape)

    return preds_total.sort_index()

In [28]:
# Score the hold-out window: predict without the label, then compare with MAP@3.
predictions = model(
    x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end,
    train_cv, test_cv.drop(['place_id'], axis=1),
    'cv/rf/', 3)
predictions.index.name = None

# Both frames are sorted by index so rows line up positionally.
actual = test_cv[['place_id']].sort_index()
print(mapk(actual.values.reshape(-1, 1), predictions.values, 3))

Elapsed time column: 5.18594366312 minutes
Elapsed time column: 5.85396471818 minutes
Elapsed time column: 6.07159416676 minutes
Elapsed time column: 6.09313060045 minutes
Elapsed time column: 6.10955463648 minutes
Elapsed time column: 6.05053650141 minutes
Elapsed time column: 6.18558395306 minutes
Elapsed time column: 6.19932733377 minutes
Elapsed time column: 6.28905511697 minutes
Elapsed time column: 6.29336813291 minutes
Elapsed time column: 6.11127638419 minutes
Elapsed time column: 6.21258045038 minutes
Elapsed time column: 6.22498266697 minutes
Elapsed time column: 6.24967044989 minutes
Elapsed time column: 6.26506243149 minutes
Elapsed time column: 4.88253289859 minutes
Elapsed time column: 6.12975911697 minutes
Elapsed time column: 6.16262361606 minutes
Elapsed time column: 6.18641251723 minutes
Elapsed time column: 6.22440410058 minutes
Elapsed time overall: 24.9001695673 minutes
(593608, 3)
0.678842210123


In [6]:
# Validation run: predict the hold-out window and report MAP@3.
predictions = model(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end,
                    train_cv, test_cv.drop(['place_id'], axis=1),
                    'cv/rf_50_gini+entropy_02_008_aug_0.05-0.017_th3_accuracy_adddata/', 3)

predictions.index.name = None
# Fixed: the method is sort_index(); the previous name was a typo that
# raised AttributeError before the score could be printed.
actual = test_cv[['place_id']].sort_index()
print(mapk(np.array([actual.values.flatten()]).T, predictions.values, 3))


# Full run: fit on all training data and predict the real test set.
preds_total = model(x_ranges, y_ranges, size, size, train, test,
                    'rf_50_gini+entropy_02_008_aug_0.05-0.017_th3_accuracy_adddata/', 3)


Elapsed time column: 83.1824204683 minutes
Elapsed time column: 86.018810614 minutes
Elapsed time column: 86.2520467679 minutes
Elapsed time column: 86.3308351318 minutes
Elapsed time column: 86.4458239635 minutes
Elapsed time column: 18.9313154856 minutes
Elapsed time column: 19.9345006506 minutes
Elapsed time column: 20.1228297671 minutes
Elapsed time column: 20.2524216811 minutes
Elapsed time column: 20.2821010351 minutes
Elapsed time column: 23.1021382014 minutes
Elapsed time column: 23.3752674977 minutes
Elapsed time column: 23.7705391645 minutes
Elapsed time column: 23.7899013837 minutes
Elapsed time column: 23.7918964346 minutes
Elapsed time column: 21.0342624346 minutes
Elapsed time column: 21.0405694683 minutes
Elapsed time column: 21.132999301 minutes
Elapsed time column: 21.1467328469 minutes
Elapsed time column: 21.2936470668 minutes
Elapsed time column: 21.6371132334 minutes
Elapsed time column: 21.8982223153 minutes
Elapsed time column: 21.9119090637 minutes
Elapsed time 

In [11]:
# Build the submission: one space-separated string of the three predicted
# place_ids per row. model() returns only the l1/l2/l3 columns, so the
# place_id column has to be assembled here; selecting it directly raises
# KeyError on a fresh kernel.
place_id = (preds_total['l1'].apply(str) + ' ' +
            preds_total['l2'].apply(str) + ' ' +
            preds_total['l3'].apply(str))

# File name carries the run configuration plus a minute-resolution timestamp.
sub_file = ('rf_50_gini+entropy_02_008_aug_0.05-0.017_th3_accuracy_adddata'
            + str(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")) + '.csv')

# The frame's index is written out as the row_id column.
place_id.to_frame('place_id').to_csv(sub_file, index=True, header=True, index_label='row_id')
print(sub_file)

rf_50_gini+entropy_02_008_aug_0.05-0.017_th3_accuracy_adddata2016-07-05-08-41.csv
