In [1]:
import multiprocessing
from multiprocessing import Process
from multiprocessing import Manager

import math
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np
import datetime
import time
import os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from ml_metrics import mapk
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
def apply_weights(df, fw):
    df['accuracy'] *= fw[0]
    df['day_of_year_sin'] *= fw[1]
    df['day_of_year_cos'] *= fw[1]
    df['minute_sin'] *= fw[2]
    df['minute_cos'] *= fw[2]
    df['weekday_sin'] *= fw[3]
    df['weekday_cos'] *= fw[3]
    #df.x *= fw[4]
    #df.y *= fw[5]
    df['year'] *= fw[6]
    return df

def prepare_data(df):
    minute = 2*np.pi*((df["time"]//5)%288)/288
    df['minute_sin'] = (np.sin(minute)+1).round(4)
    df['minute_cos'] = (np.cos(minute)+1).round(4)
    del minute
    day = 2*np.pi*((df['time']//1440)%365)/365
    df['day_of_year_sin'] = (np.sin(day)+1).round(4)
    df['day_of_year_cos'] = (np.cos(day)+1).round(4)
    del day
    weekday = 2*np.pi*((df['time']//1440)%7)/7
    df['weekday_sin'] = (np.sin(weekday)+1).round(4)
    df['weekday_cos'] = (np.cos(weekday)+1).round(4)
    del weekday
    df['year'] = (df['time']//525600).astype(float)
    #df.drop(['time'], axis=1, inplace=True)
    df['accuracy'] = np.log10(df['accuracy']).astype(float)
    return df

train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

train = prepare_data(train)
test = prepare_data(test)

fw = [0.61,0.32435, 0.56525, 0.2670, 22, 52, 0.51885]

train = apply_weights(train, fw)
test = apply_weights(test, fw)

In [3]:
def xfrange(start, end, step):
    gens = [];
    end = round(end, 2)
    start = round(start, 2)
    while(start < end):
        gens.append(start)
        start = round(start + step, 2)
            
    return gens
        
        
def gen_ranges(start, end, step):
    return zip(xfrange(start, end, step), xfrange(start + step, end + step, step));

size = 10.0;

x_step = 1.0
y_step = 0.4

x_ranges = gen_ranges(0, size, x_step);
y_ranges = gen_ranges(0, size, y_step);

In [4]:
size_cv = 10.0;

x_cv_start = 0;
x_cv_end = x_cv_start + size_cv
y_cv_start = 0;
y_cv_end = y_cv_start + size_cv;

x_ranges_cv = gen_ranges(x_cv_start, x_cv_end, x_step);
y_ranges_cv = gen_ranges(y_cv_start, y_cv_end, y_step);

cv = train[(train['x'] >= x_cv_start) & 
           (train['x'] <= x_cv_end) &
           (train['y'] >= y_cv_start) &
           (train['y'] <= y_cv_end)]

cv = cv.sort_values(by='time', axis=0, ascending=True)
train_cv = cv[cv.shape[0]//8:]
test_cv = cv[:cv.shape[0]//8]

In [5]:
print cv.shape
print train_cv.shape
print test_cv.shape

(29118021, 13)
(25478269, 13)
(3639752, 13)


In [8]:
def calculate_distance(distances):
    return distances ** -2.225

def process_column(x_min, x_max, y_ranges, x_end, y_end, train_column, test_column, preds_total):
    start_time_column = time.time()
    preds_total[x_min] = pd.DataFrame();
    for y_min, y_max in  y_ranges: 
        
            start_time_cell = time.time()

            if y_max == y_end:
                y_max = y_max + 0.001
            
            train_cell = train_column[(train_column['y'] >= y_min - 0.017) &
                                      (train_column['y'] < y_max + 0.017)]
            
            train_cell = train_cell.drop(['time'], axis=1)
            train_cell = train_cell.groupby("place_id").filter(lambda x: len(x) >= 8)

            test_cell = test_column[(test_column['y'] >= y_min) &
                                    (test_column['y'] < y_max)]
            
            row_ids = test_cell['row_id'].reset_index(drop=True);
            test_cell = test_cell.drop(['row_id', 'time'], axis=1)
            
            #Feature engineering on x and y
            train_cell.loc[:,'x'] *= fw[4]
            train_cell.loc[:,'y'] *= fw[5]
            test_cell.loc[:,'x'] *= fw[4]
            test_cell.loc[:,'y'] *= fw[5]
            
            le = LabelEncoder()

            y = le.fit_transform(train_cell.place_id.values)
            X = train_cell.drop(['row_id', 'place_id'], axis=1)

            #Applying the classifier
            cte = 5.8
            n_neighbors = int((y.size ** 0.5) / cte)
            clf = KNeighborsClassifier(n_neighbors=n_neighbors,
                            weights=calculate_distance, p=1, 
                            n_jobs=2, leaf_size=15)

            clf.fit(X, y)

            y_pred = clf.predict_proba(test_cell.values)

            y_pred = clf.predict_proba(test_cell.values)
            preds = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) 
            preds = pd.DataFrame.from_dict(preds)
            preds['row_id'] = row_ids;
            preds = preds.set_index('row_id')
            preds.index.name = 'row_id';

            preds_total[x_min] = pd.concat([preds_total[x_min], preds], axis=0);
            
    print("Elapsed time column: %s minutes" % ((time.time() - start_time_column)/60))

def modelq(x_ranges, y_ranges, x_end, y_end, train, test):   
    start_time = time.time()
    jobs = []
    mgr = Manager()
    preds_total = mgr.dict();

    for x_min, x_max in  x_ranges:
        
        if x_max == x_end:
            x_max = x_max + 0.001
                
        train_column = train[(train['x'] >= x_min - 0.05) &
                             (train['x'] < x_max + 0.05)]
        
        test_column = test[(test['x'] >= x_min) &
                             (test['x'] < x_max)]
        
        p = multiprocessing.Process(target=process_column, args=(x_min, x_max, y_ranges, \
                                                                 x_end, y_end, train_column, test_column, preds_total))
        jobs.append(p)
        p.start()
        if len(jobs) == 5:
            for proc in jobs:
                proc.join();
            jobs = [];
        
    print("Elapsed time overall: %s minutes" % ((time.time() - start_time)/60))
    
    preds_total = pd.concat(preds_total.values(), axis=0);
    print preds_total.shape
    
    return preds_total.sort_index();

In [9]:
predictions = modelq(x_ranges_cv, y_ranges_cv, x_cv_end, y_cv_end, train_cv, 
                    test_cv.drop(['place_id'], axis=1));
actual = test_cv[['place_id']].sort();
print mapk(np.array([actual.values.flatten()]).T, predictions.values, 3)

Elapsed time column: 3.12429431677 minutes
Elapsed time column: 3.30709615151 minutes
Elapsed time column: 3.3712097168 minutes
Elapsed time column: 3.36353240013 minutes
Elapsed time column: 3.57280368408 minutes
Elapsed time column: 2.83974711498 minutes
Elapsed time column: 3.20833914677 minutes
Elapsed time column: 3.19278701544 minutes
Elapsed time column: 3.27259593407 minutes
Elapsed time column: 3.35424630245 minutes
Elapsed time overall: 7.04358919859 minutes
(3639752, 3)


  app.launch_new_instance()


0.693093467174


In [76]:
clf = KNeighborsClassifier(n_neighbors=29, weights='distance', 
                                       metric='manhattan', n_jobs=-1)

preds_total = modelq(x_ranges, y_ranges, size, size, train, test, 5)
preds_total['place_id'] = preds_total['l1'].apply(str) + ' ' + preds_total['l2'].apply(str) + ' ' + preds_total['l3'].apply(str);
preds_total['place_id'].to_csv('summed_total_fix_submitable.csv', header=True);

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Elapsed time row: 20.984288617 minutes
Elapsed time row: 27.7590901176 minutes
Elapsed time row: 28.7786113143 minutes
Elapsed time row: 30.570629251 minutes
Elapsed time row: 34.0028994521 minutes
Elapsed time row: 37.6347986142 minutes
Elapsed time row: 37.047338899 minutes
Elapsed time row: 38.242803216 minutes
Elapsed time row: 36.5508922497 minutes
Elapsed time row: 34.3818624655 minutes
Elapsed time row: 36.3424279849 minutes
Elapsed time row: 33.8832034826 minutes
Elapsed time row: 33.35711145 minutes
Elapsed time row: 33.5121737679 minutes
Elapsed time row: 36.0588709633 minutes
Elapsed time row: 33.1081194162 minutes
Elapsed time row: 30.2188024998 minutes
Elapsed time row: 29.7511512518 minutes
Elapsed time row: 28.0056342999 minutes
Elapsed time row: 23.5284121513 minutes
Elapsed time overall: 643.719177636 minutes


In [77]:
preds_total['place_id'] = preds_total['l1'].apply(str) + ' ' + preds_total['l2'].apply(str) + ' ' + preds_total['l3'].apply(str);

In [78]:
preds_total['place_id'].to_csv('summed_total_fix_submitable.csv', header=True);