In [None]:
import os
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Margin added on every side of a cell's x / y bounds when rows are selected
# for that cell (used by process_one_cell).
x_threshold = 0.025
y_threshold = 0.0125

# area 10km by 10 km is divided into grids of size 20x40
# (grid_size / x_step = 20 steps along x, grid_size / y_step = 40 steps along y)
grid_size = 10.0
x_step = 0.5
y_step = 0.25

In [104]:
def prepare_data(df):
    """
    Feature engineering

    Derives scaled calendar features from the ``time`` column (treated as a
    running minute counter) and rescales ``accuracy`` to ``10 * log10``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``time`` and ``accuracy``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns (``accuracy`` rescaled)
        followed by ``hour``, ``weekday``, ``month`` and ``year``, in that
        order.  The frame passed in is not modified, so calling this twice on
        the same input gives the same result instead of applying ``log10``
        to ``accuracy`` a second time.
    """
    # Work on a copy: the callers pass slices of larger frames, and mutating
    # the argument makes re-running a notebook cell change the data again.
    # This briefly holds two copies of the frame in memory.
    df = df.copy()

    minute = df['time'] % 60
    total_hours = df['time'].div(60).map(int)
    total_days = total_hours.div(24).map(int)
    total_months = total_days.div(30).map(int)

    # The multipliers (4.0, 3.1, 2.1, 10.0) are per-feature scale factors.
    # Columns are assigned in the original order so that positional feature
    # matrices built from the frame keep the same layout.
    df['hour'] = ((total_hours % 24 + 1) + minute.div(60.0)) * 4.0
    df['weekday'] = (total_days % 7 + 1) * 3.1
    df['month'] = (total_months % 12 + 1) * 2.1
    df['year'] = (total_days.div(365).map(int) + 1) * 10.0
    df['accuracy'] = np.log10(df['accuracy']) * 10.0

    return df


def process_one_cell(df_train, df_test, th, x_min, y_min, x_max, y_max, method='rf', gridNum=0, cpuCores=-1):
    """
    Classification inside one grid cell.

    Selects the train and test rows whose x / y fall inside the cell bounds
    widened by ``x_threshold`` / ``y_threshold``, fits the requested
    classifier on the train rows and returns the six most probable
    ``place_id`` values for every selected test row.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Output of prepare_data with ``x``, ``y``, ``time`` and ``place_id``.
    df_test : pandas.DataFrame
        Output of prepare_data with ``x``, ``y`` and ``time``.  If it also
        has a ``place_id`` column it is treated as ground truth: the column
        is excluded from the features and per-cell scores are printed and
        appended to the module-level lists ``accuracies`` and ``FBscores``.
    th : int
        Places with fewer than ``th`` train rows in this cell are dropped.
    x_min, y_min, x_max, y_max : float
        Un-padded bounds of the cell.
    method : {'rf', 'xgb', 'knn'}
        Classifier to fit.
    gridNum : int
        Cell number, only used in the printed report.
    cpuCores : int
        ``n_jobs`` for the random forest / KNN classifiers.

    Returns
    -------
    pred_labels : numpy.ndarray
        ``place_id`` values, one row per test row, most probable first.
    pred_confs : list of numpy.ndarray
        Predicted probabilities matching ``pred_labels`` row by row.
    row_ids : pandas.Index
        Index labels of the selected test rows.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """

    x_min_th = x_min - x_threshold
    y_min_th = y_min - y_threshold
    x_max_th = x_max + x_threshold
    y_max_th = y_max + y_threshold

    # Working on df_train, getting few extra points outside this grid
    df_cell_train = df_train[(df_train['x'] >= x_min_th)
                             & (df_train['x'] <= x_max_th)
                             & (df_train['y'] >= y_min_th)
                             & (df_train['y'] <= y_max_th)]

    # Keep only places with at least th rows in this padded cell.  The final
    # .copy() gives an independent frame, so the scaling below never touches
    # (or warns about touching) df_train.
    place_counts = df_cell_train['place_id'].value_counts()
    keep = (df_cell_train['place_id'].map(place_counts) >= th).values
    df_cell_train = df_cell_train[keep].copy()

    # Feature engineering on x and y for train
    df_cell_train['x'] *= 490.0
    df_cell_train['y'] *= 980.0

    # Working on df_test
    # NOTE(review): the test rows are selected with the padded bounds as
    # well, so rows inside the margin are returned by several neighbouring
    # cells -- confirm this overlap is intended.
    df_cell_test = df_test[(df_test['x'] >= x_min_th) & (df_test['x'] <= x_max_th) &
                           (df_test['y'] >= y_min_th) & (df_test['y'] <= y_max_th)].copy()
    row_ids = df_cell_test.index

    if len(df_cell_test) == 0:
        # Nothing to predict in this cell.
        return np.empty((0, 6), dtype=np.int64), [], row_ids

    # Feature engineering on x and y for test
    df_cell_test['x'] *= 490.0
    df_cell_test['y'] *= 980.0

    # Sample weights (only passed to the random forest) are derived from the
    # time column, so compute them before that column is dropped.
    rf_sample_weight = np.log10(20 + df_cell_train['time'].div(1440.0).values / 30) ** 3

    df_cell_train = df_cell_train.drop(['time'], axis=1)
    df_cell_test = df_cell_test.drop(['time'], axis=1)

    # Decide from the data itself whether ground truth is available instead
    # of reading a module-level `mode` variable.
    has_truth = 'place_id' in df_cell_test.columns

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train['place_id'].values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    if has_truth:
        X_test = df_cell_test.drop(['place_id'], axis=1).values
    else:
        X_test = df_cell_test.values

    # Applying the classifier
    if method == 'rf':
        clf = RandomForestClassifier(n_estimators=330, max_depth=None, n_jobs=cpuCores, min_samples_split=4,
                                     random_state=0)
        clf.fit(X, y, sample_weight=rf_sample_weight)

    elif method == 'xgb':
        clf = XGBClassifier(learning_rate=0.04, n_estimators=150, objective='multi:softprob', max_depth=3, seed=0)
        clf.fit(X, y)

    elif method == 'knn':
        def calculate_distance(distances):
            # Inverse-square distance weighting.
            # NOTE(review): a distance of exactly 0 yields an infinite weight.
            return distances ** -2

        # Neighbour count grows with the square root of the cell size; never
        # let it fall to 0, which the classifier rejects.
        numNeighbors = max(1, int(np.floor(np.sqrt(len(df_cell_train)) / 5.1282)))
        clf = KNeighborsClassifier(n_neighbors=numNeighbors, weights=calculate_distance, metric='manhattan',
                                   n_jobs=cpuCores)
        clf.fit(X, y)

    else:
        raise ValueError("unknown method {!r}; expected 'rf', 'xgb' or 'knn'".format(method))

    y_pred = clf.predict_proba(X_test)

    # Column indices of the six largest probabilities per row, best first.
    le_labels = np.argsort(y_pred, axis=1)[:, ::-1][:, :6]
    # Index classes_ directly: it accepts the 2-D index array on every
    # scikit-learn version, unlike inverse_transform.
    pred_labels = le.classes_[le_labels]
    row_positions = np.arange(len(y_pred))[:, None]
    pred_confs = list(y_pred[row_positions, le_labels])

    if has_truth:
        n_test = len(df_cell_test)
        truth = df_cell_test['place_id'].values
        # hits[i, k] is True when the k-th guess for row i is the true place.
        hits = pred_labels == truth[:, None]

        regionalAccuracy = 1.0 * int(hits.any(axis=1).sum()) / n_test
        regionalConfidence = [sum(z) / len(df_cell_test) for z in zip(*pred_confs)]
        regionalConfidence3 = sum(regionalConfidence)

        # Rank-weighted score over the first three guesses (1, 1/2, 1/3).
        fbAccuracy = 0
        fbAccuracy += 1.0 * int(hits[:, 0:1].sum()) / n_test
        fbAccuracy += 1.0 / 2 * int(hits[:, 1:2].sum()) / n_test
        fbAccuracy += 1.0 / 3 * int(hits[:, 2:3].sum()) / n_test
        FBscores.append(fbAccuracy)
        print('region {}: {},{} accuracy: {},  fbAccu: {}, confidence: {}:'.format(
            gridNum, x_min, y_min, regionalAccuracy, fbAccuracy, regionalConfidence3))
        accuracies.append(regionalAccuracy)

    return pred_labels, pred_confs, row_ids


def process_grid(df_train, df_test, th, mode='valid', method='rf', note='', cpuCores=-1, startGrid=0):
    """
    Iterates over all grid cells, aggregates the results and makes the
    submission.

    Cells are numbered from 0, walking all y steps for each x step.  For
    every cell numbered ``startGrid`` or higher, process_one_cell is called
    and its top-6 predictions and confidences are stored at the positions
    given by the returned row ids.  A per-cell CSV (``.rst``) is written
    into a new ``./<method>-<timestamp>/`` folder.  Rows that belong only to
    skipped cells keep all-zero predictions.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames produced by prepare_data.  The index of ``df_test`` is used
        directly as row position in the result arrays, so it has to be
        ``0 .. len(df_test) - 1``.
    th : int
        Minimum number of rows a place needs inside a cell to be kept.
    mode : {'valid', 'test'}
        'test' writes ``<timestamp>-<method>-<note>submit.csv`` (top-3
        predictions) and ``...confidence.csv``; 'valid' writes only the
        per-cell files.
    method : str
        Classifier name passed on to process_one_cell.
    note : str
        Free text appended to the submission file names.
    cpuCores : int
        Passed on to process_one_cell.
    startGrid : int
        Number of leading cells to skip.

    Notes
    -----
    Reads the module-level list ``FBscores``, which process_one_cell
    appends to whenever ``df_test`` carries a ``place_id`` column.
    """
    base = './'
    folderName = method + '-' + time.strftime('%c')
    folderPath = base + folderName + '/'
    if not os.path.exists(folderPath):
        os.makedirs(folderPath)

    has_truth = 'place_id' in df_test.columns
    n_rows = df_test.shape[0]

    # int64 explicitly: place ids do not fit in a 32-bit platform int.
    preds = np.zeros((n_rows, 6), dtype=np.int64)
    confs = np.zeros((n_rows, 6), dtype=float)
    grids = np.zeros(n_rows, dtype=int)

    iterations_x = int(grid_size / x_step)  # 20
    iterations_y = int(grid_size / y_step)  # 40

    sTime = time.time()
    gridNum = 0

    for i in range(iterations_x):
        print(i)
        x_min = round(x_step * i, 4)
        x_max = round(x_step * i + x_step, 4)
        if x_max == grid_size:
            x_max += 0.001

        for j in range(iterations_y):
            if gridNum < startGrid:
                gridNum += 1
                continue

            y_min = round(y_step * j, 4)
            y_max = round(y_step * j + y_step, 4)
            if y_max == grid_size:
                y_max += 0.001

            # Applying classifier to one grid cell
            pred_labels, pred_confs, row_ids = process_one_cell(
                df_train, df_test, th, x_min, y_min, x_max, y_max,
                method=method, gridNum=gridNum, cpuCores=cpuCores)

            if len(row_ids) == 0:
                # No test rows fall into this cell: nothing to store or save.
                gridNum += 1
                continue

            # Plain 2-D arrays so that columns can be taken by index on both
            # Python 2 and Python 3 (zip(...) is not subscriptable on 3).
            pred_labels = np.asarray(pred_labels)
            conf_matrix = np.asarray(pred_confs)

            # Updating predictions
            preds[row_ids] = pred_labels
            confs[row_ids] = conf_matrix
            grids[row_ids] = gridNum

            # save one file for each grid
            resultFileName = '{:04d}-{}-{}-{}-{}.rst'.format(gridNum, x_min, x_max, y_min, y_max)
            resultFilePath = folderPath + resultFileName

            n_cell = len(row_ids)
            columns = {'originalIndex': row_ids,
                       'x': df_test.loc[row_ids, 'x'].tolist(),
                       'y': df_test.loc[row_ids, 'y'].tolist(),
                       'accuracy': df_test.loc[row_ids, 'accuracy'].tolist()}
            for k in range(6):
                columns['pred{}'.format(k)] = pred_labels[:, k]
            for k in range(6):
                columns['conf{}'.format(k)] = conf_matrix[:, k]

            if has_truth:
                real = df_test.loc[row_ids, 'place_id']
                truth = real.values
                # Per-row score: 1 for a correct first guess, 1/2 for the
                # second, 1/3 for the third, computed for the whole cell at
                # once instead of one .loc lookup per row.
                fb_per_row = ((pred_labels[:, 0] == truth)
                              + (pred_labels[:, 1] == truth) / 2.0
                              + (pred_labels[:, 2] == truth) / 3.0).tolist()
                # Kept as a Series so the saved file is indexed by row id,
                # as before.
                columns['real'] = real
                columns['FBscore'] = fb_per_row
            else:
                columns['real'] = [0] * n_cell
                columns['FBscore'] = [0] * n_cell
            columns['regionalFBScore'] = [FBscores[-1]] * n_cell if FBscores else [0] * n_cell

            results = pd.DataFrame(columns)
            results.to_csv(resultFilePath)
            # file saving is done

            gridNum += 1

        # Time spent on this x step and the running mean score over all cells.
        print(time.time() - sTime)
        if FBscores:
            print(sum(FBscores) / len(FBscores))
        sTime = time.time()

    print('Generating submission files')

    if mode == 'test':
        # Auxiliary dataframe with the 6 best predictions for each sample
        df_aux = pd.DataFrame(preds, dtype=str, columns=['pred0', 'pred1', 'pred2', 'pred3', 'pred4', 'pred5'])

        df_confs = pd.DataFrame(confs, columns=['conf0', 'conf1', 'conf2', 'conf3', 'conf4', 'conf5'])
        df_confs = pd.concat([df_aux, df_confs], axis=1)
        df_confs['grid'] = grids

        # Concatenating the 3 predictions for each sample
        ds_sub = df_aux.pred0.str.cat([df_aux.pred1, df_aux.pred2], sep=' ')
        ds_sub.name = 'place_id'
        resultFile = time.strftime('%c') + '-' + method + '-' + note
        ds_sub.to_csv(resultFile + 'submit.csv', index=True, header=True, index_label='row_id')
        df_confs.to_csv(resultFile + 'confidence.csv', index=True, index_label='row_id')

In [78]:
print('Loading data')
df_train = pd.read_csv('../input/train.csv',
                       usecols=['row_id', 'x', 'y', 'accuracy', 'time', 'place_id'],
                       index_col=0)
df_test = pd.read_csv('../input/test.csv',
                      usecols=['row_id', 'x', 'y', 'accuracy', 'time'],
                      index_col=0)

# Time-ordered hold-out: the earliest 70% of the train rows are used for
# fitting, the latest 30% for validation.
TRAIN_FRACTION = 0.7
div = int(TRAIN_FRACTION * len(df_train))

df_train = df_train.sort_values(by=['time'])
# Independent copies, so later feature engineering on the validation frames
# cannot write into df_train.
df_validation_train = df_train[:div].copy()
# process_grid uses the test index as array positions, so renumber from 0.
df_validation_test = df_train[div:].reset_index(drop=True)


Loading data


In [145]:
# df_validation_test.head()
# df_validation_train.head()
# df_train.head()

In [None]:
def _wrap_hour_boundaries(df, low=10, high=90, period=96):
    """
    Return df with extra copies of the rows whose scaled ``hour`` is close
    to either end of the daily cycle, shifted by one full period.

    ``hour`` is scaled by 4 in prepare_data, so one day spans 96 units.
    Rows with hour < low are repeated with hour + period and rows with
    hour > high with hour - period; the original rows come first.
    """
    early = df[df.hour < low].copy()
    early['hour'] += period
    late = df[df.hour > high].copy()
    late['hour'] -= period
    # pd.concat instead of the removed DataFrame.append; row order and index
    # labels are the same as appending `early` and then `late`.
    return pd.concat([df, early, late])


print('Preparing train data')
df_train = prepare_data(df_train)
print(df_train.shape)
# add data for periodic time that hit the boundary
# Warning suppression kept because other cells still assign into slices.
pd.options.mode.chained_assignment = None
df_train = _wrap_hour_boundaries(df_train)

print(df_train.shape)
print('Preparing test data')
df_test = prepare_data(df_test)
#########################################################################
# Same treatment for the validation split; only the training part gets the
# wrapped-hour copies.
df_validation_train = prepare_data(df_validation_train)
df_validation_test = prepare_data(df_validation_test)

df_validation_train = _wrap_hour_boundaries(df_validation_train)


Preparing train data
(29118021, 9)
(33946497, 9)
Preparing test data


In [None]:
# Solving classification problems inside each grid cell
# Run configuration.
# th: a place_id needs at least th training rows inside a cell to be kept.
th = 5
mode = 'valid'
method = 'xgb'
note = ''

cpuCores = -1
startGrid = 18

# Per-cell scores; filled by process_one_cell while the grid is processed.
accuracies = []
FBscores = []

run_options = dict(method=method, note=note, mode=mode, cpuCores=cpuCores, startGrid=startGrid)

# 'test' fits on the full training data and predicts the real test set,
# 'valid' uses the time-based hold-out split; any other value does nothing.
if mode == 'test':
    process_grid(df_train, df_test, th, **run_options)
elif mode == 'valid':
    process_grid(df_validation_train, df_validation_test, th, **run_options)

0
region 18: 0.0,4.5 accuracy: 0.715815261773,  fbAccu: 0.567809938602, confidence: 0.798944080996:
region 19: 0.0,4.75 accuracy: 0.787852112676,  fbAccu: 0.587177230047, confidence: 0.787964989442:
region 20: 0.0,5.0 accuracy: 0.700611325611,  fbAccu: 0.520431145431, confidence: 0.789001085951:
region 21: 0.0,5.25 accuracy: 0.752794692357,  fbAccu: 0.560119967282, confidence: 0.794981188467:
region 22: 0.0,5.5 accuracy: 0.79068805822,  fbAccu: 0.631285147205, confidence: 0.823260846312:
region 23: 0.0,5.75 accuracy: 0.802544014826,  fbAccu: 0.60072164659, confidence: 0.808740840407:
region 24: 0.0,6.0 accuracy: 0.766492970209,  fbAccu: 0.610641365975, confidence: 0.814669967926:
region 25: 0.0,6.25 accuracy: 0.741526818032,  fbAccu: 0.552059339695, confidence: 0.78574449712:
region 26: 0.0,6.5 accuracy: 0.707951856805,  fbAccu: 0.544062682166, confidence: 0.784895708162:
region 27: 0.0,6.75 accuracy: 0.738275698721,  fbAccu: 0.549371019527, confidence: 0.798236606972:
region 28: 0.0,7

In [192]:
# Load a confidence table saved earlier by process_grid in 'test' mode with
# method 'knn' (file name pattern: <timestamp>-<method>-<note>confidence.csv).
confs=pd.read_csv('Wed Jun 29 12:03:11 2016-knn-confidence.csv')

In [269]:
# For one saved cell: how often is the first guess right when its confidence
# is above a cutoff, and what share of the rows passes that cutoff?
r = pd.read_csv('xgb-Wed Jun 29 23:34:00 2016//0000-0.0-0.5-0.0-0.25.rst')

# Own name for the cutoff, so the global `th` (minimum rows per place used
# by process_grid) is not overwritten by this cell.
conf_cutoff = 0.3
confident = r[r.conf0 > conf_cutoff]

n_rows = len(r)
n_confident = len(confident)
n_confident_wrong = len(confident[confident.pred0 != confident.real])

# Both expressions are displayed; NaN instead of a ZeroDivisionError when
# there is nothing to divide by.
1.0 * (n_confident - n_confident_wrong) / n_confident if n_confident else float('nan')
1.0 * n_confident / n_rows if n_rows else float('nan')

0.5990068280571074

0.7493604155360881

#### FBscores
# Show the first 200 rows of the loaded confidence table.
confs.head(200)