In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

# Load the raw check-ins; `row_id` (first selected column) becomes the index.
df_train = pd.read_csv('../input/train.csv',
                       usecols=['row_id', 'x', 'y', 'time', 'place_id', 'accuracy'],
                       index_col=0)
df_test = pd.read_csv('../input/test.csv',
                      usecols=['row_id', 'x', 'y', 'time', 'accuracy'],
                      index_col=0)

print('Feature Augmentation')


def _add_time_features(df):
    """Return a copy of `df` with `time` replaced by hour/weekday/month/year.

    The new columns are appended in the order hour, weekday, month, year:

    * hour    -- ((hour of day + 1) + minute / 60) * 4, i.e. a value in [4, 100)
    * weekday -- running day number (time // 60 // 24), not yet folded to a week
    * month   -- running day number // 30, not yet folded to a year
    * year    -- (running day number // 365 + 1) * 10

    NOTE(review): floor division replaces the original `.div(n).map(int)`;
    the two agree for non-negative whole-number timestamps, which is what
    `time` is assumed to hold -- confirm against the data.
    """
    total_minutes = df['time']
    total_hours = total_minutes // 60
    total_days = total_hours // 24

    out = df.drop(['time'], axis=1)
    out['hour'] = ((total_hours % 24 + 1) + (total_minutes % 60).div(60.0)).mul(4.0)
    out['weekday'] = total_days
    out['month'] = total_days // 30
    out['year'] = (total_days // 365 + 1).mul(10.0)
    return out


def _scale_features(df):
    """Fold weekday/month to their cycle, rescale them and log-scale accuracy (in place)."""
    df['weekday'] = (df['weekday'] % 7 + 1).mul(3.1)
    df['month'] = (df['month'] % 12 + 1).mul(2.1)
    df['accuracy'] = np.log10(df['accuracy']).mul(10.0)
    return df


df_train = _add_time_features(df_train)

# Kept from the original: silences SettingWithCopyWarning for the rest of the session.
pd.options.mode.chained_assignment = None

# The scaled hour is periodic with period 96 (24 h * 4). Rows close to either
# end of the range are duplicated one period away so that neighbours across
# the midnight boundary end up close to each other.
add_data = df_train[df_train.hour < 10].copy()
add_data['hour'] = add_data['hour'] + 96

add_data2 = df_train[df_train.hour > 90].copy()
add_data2['hour'] = add_data2['hour'] - 96

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# The duplicated rows keep their original row_id, so the index is not unique.
df_train = pd.concat([df_train, add_data, add_data2])

# Scaling happens after the concat, so add_data/add_data2 keep the raw
# weekday/month/accuracy values while df_train holds the scaled ones.
df_train = _scale_features(df_train)

print('done processing training')

df_test = _scale_features(_add_time_features(df_test))

print('features done')

Feature Augmentation
done processing training
features done


In [22]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline
print len(add_data)
print len(add_data2)
df_train.sort_index().head()
add_data.head(10)


1818434
3010042


Unnamed: 0_level_0,x,y,accuracy,place_id,hour,weekday,month,year
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0.7941,9.0809,17.323938,8523065625,88.133333,15.5,23.1,10.0
1,5.9567,4.7968,11.139434,1757726713,57.0,12.4,10.5,10.0
2,8.3078,7.0407,18.692317,1137537235,105.866667,3.1,16.8,10.0
2,8.3078,7.0407,18.692317,1137537235,9.866667,3.1,16.8,10.0
3,7.3665,2.5165,18.129134,6567393236,32.466667,21.7,10.5,20.0


Unnamed: 0_level_0,x,y,accuracy,place_id,hour,weekday,month,year
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,8.3078,7.0407,74,1137537235,105.866667,224,7,10.0
58,4.5698,4.1206,101,5158095731,103.666667,479,15,20.0
62,6.2019,5.4881,303,8673603270,105.533333,399,13,20.0
74,1.8616,2.8019,133,4530587605,102.8,534,17,20.0
139,7.6932,2.3443,19,5250105119,105.4,82,2,10.0
144,4.7891,1.2985,48,4490687859,101.533333,88,2,10.0
158,4.4052,2.7474,65,7124468757,101.866667,336,11,10.0
170,0.3764,6.3198,116,9646456529,103.666667,394,13,20.0
201,5.4298,2.9131,67,4108047704,101.666667,51,1,10.0
231,3.7208,8.8431,38,8907221664,102.733333,464,15,20.0


In [1]:
def calculate_distance(distances):
    """Turn neighbour distances into inverse-square weights (``d ** -2``).

    Used as the ``weights`` callable of ``KNeighborsClassifier``.

    Parameters
    ----------
    distances : array-like
        Distances with the neighbours of one query point along the last axis.
        Integer input is accepted and converted to float.

    Returns
    -------
    numpy.ndarray
        Array of the same shape holding ``distances ** -2``. If a row contains
        one or more zero distances (the query coincides with a training
        point), those neighbours get weight 1 and all others in that row get
        weight 0. A plain ``d ** -2`` would yield ``inf`` there, which turns
        into NaN once the weights are normalised.
    """
    dist = np.asarray(distances, dtype=float)
    with np.errstate(divide='ignore'):
        weights = dist ** -2

    if np.ndim(weights) == 0:
        # A lone scalar has no sibling neighbours to rebalance against.
        return weights

    exact = np.isinf(weights)
    row_has_exact = exact.any(axis=-1, keepdims=True)
    return np.where(row_has_exact, exact.astype(float), weights)

def process_one_cell(df_cell_train, df_cell_test):
    """Predict the three most likely place ids for every test row of one grid cell.

    Parameters
    ----------
    df_cell_train : pandas.DataFrame
        Training rows of the cell. Must contain ``place_id``, ``x`` and ``y``;
        every column except ``place_id`` is used as a feature.
    df_cell_test : pandas.DataFrame
        Test rows of the cell, with the same feature columns in the same
        order as the training frame (minus ``place_id``). Not modified.

    Returns
    -------
    pred_labels : numpy.ndarray, shape (n_test, 3), dtype int64
        Place ids ordered from most to least probable. Slots that cannot be
        filled (no usable training data, or fewer than three candidate
        places) are left at 0. Place ids are assumed to be integers.
    row_ids : pandas.Index
        Index of ``df_cell_test``, in the same row order as ``pred_labels``.
    """
    row_ids = df_cell_test.index
    n_test = len(row_ids)
    if n_test == 0:
        # Nothing to predict; the classifier would reject an empty query set.
        return np.zeros((0, 3), dtype=np.int64), row_ids

    # Keep only places that occur at least 5 times inside this cell.
    place_counts = df_cell_train['place_id'].value_counts()
    frequent = (df_cell_train['place_id'].map(place_counts) >= 5).values
    train = df_cell_train.loc[frequent]
    if train.empty:
        return np.zeros((n_test, 3), dtype=np.int64), row_ids

    # Feature engineering on x and y: rescale them relative to the other
    # features. `assign` works on copies, so the caller's frames stay untouched.
    train = train.assign(x=train['x'] * 490.0, y=train['y'] * 980.0)
    test = df_cell_test.assign(x=df_cell_test['x'] * 490.0,
                               y=df_cell_test['y'] * 980.0)

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(train['place_id'].values)
    X = train.drop(['place_id'], axis=1).values

    # Applying the classifier. The neighbour count grows with sqrt(n); it is
    # clamped to 1 because the formula floors to 0 for small cells, which the
    # classifier rejects.
    n_neighbors = max(1, int(np.floor(np.sqrt(y.size) / 5.1282)))
    clf = KNeighborsClassifier(n_neighbors=n_neighbors,
                               weights=calculate_distance,
                               metric='manhattan', n_jobs=-1)
    clf.fit(X, y)
    y_pred = clf.predict_proba(test.values)

    # Column indices of the (up to) three highest probabilities per row.
    top = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]

    # Index classes_ directly: inverse_transform only accepts 1-D input in
    # current scikit-learn. Cells with fewer than three classes leave the
    # remaining columns at 0.
    pred_labels = np.zeros((n_test, 3), dtype=np.int64)
    pred_labels[:, :top.shape[1]] = le.classes_[top]

    return pred_labels, row_ids
   
def process_grid(df_train, df_test, size=10.0, x_step=0.5, y_step=0.25,
                 x_border_augment=0.03, y_border_augment=0.015):
    """
    Iterates over all grid cells, aggregates the results

    The square [0, size] x [0, size] is cut into cells of x_step by y_step.
    For each cell a classifier is trained on the training rows inside the
    cell enlarged by the border margins, and applied to the test rows that
    fall strictly inside the cell.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Feature frames with at least `x` and `y` columns.
    size : float
        Upper bound of both coordinates.
    x_step, y_step : float
        Cell width and height.
    x_border_augment, y_border_augment : float
        Extra margin added on each side of a cell when selecting training rows.

    Returns
    -------
    numpy.ndarray, shape (len(df_test), 3), dtype int64
        Top-3 place ids per test row; rows that were never predicted stay 0.

    NOTE(review): predictions are written with `preds[row_ids]`, i.e. the
    index labels of df_test are used as row positions. This presumes the test
    index is 0..n-1 -- confirm against the input file.
    """
    # int64 explicitly: plain `int` can be 32-bit on some platforms, which is
    # too narrow for 10-digit place ids.
    preds = np.zeros((df_test.shape[0], 3), dtype=np.int64)

    for i in range(int(size / x_step)):

        x_min = round(x_step * i, 4)
        x_max = round(x_step * (i + 1), 4)
        if x_max == size:
            # Widen the last column so points lying exactly on the border are kept.
            x_max = x_max + 0.001

        df_col_train = df_train[(df_train['x'] >= x_min - x_border_augment) &
                                (df_train['x'] < x_max + x_border_augment)]
        df_col_test = df_test[(df_test['x'] >= x_min) & (df_test['x'] < x_max)]

        for j in range(int(size / y_step)):
            y_min = round(y_step * j, 4)
            y_max = round(y_step * (j + 1), 4)
            if y_max == size:
                y_max = y_max + 0.001

            df_cell_test = df_col_test[(df_col_test['y'] >= y_min) &
                                       (df_col_test['y'] < y_max)]
            if df_cell_test.empty:
                # No test rows here: nothing to predict, skip the fit entirely.
                continue

            df_cell_train = df_col_train[(df_col_train['y'] >= y_min - y_border_augment) &
                                         (df_col_train['y'] < y_max + y_border_augment)]

            #Applying classifier to one grid cell
            pred_labels, row_ids = process_one_cell(df_cell_train, df_cell_test)

            #Updating predictions
            preds[row_ids] = pred_labels

        # Single formatted string prints the same text on Python 2 and 3.
        print('%d th column is done' % i)

    return preds

def generate_sub(preds, filename='submission_v01.csv'):
    """Write the top-3 predictions to a CSV submission file.

    Parameters
    ----------
    preds : array-like, shape (n_rows, 3)
        The three predicted place ids for each row, best first.
    filename : str
        Destination path; defaults to the original hard-coded file name.

    The file has two columns: `row_id` (the 0-based row position in `preds`)
    and `place_id` (the three ids joined by single spaces).
    """
    print('Writing submission file')
    #Auxiliary dataframe with the 3 best predictions for each sample
    df_aux = pd.DataFrame(preds, columns=['l1', 'l2', 'l3']).astype(str)
    #Concatenating the 3 predictions for each sample
    ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')

    #Writing to csv
    ds_sub.name = 'place_id'
    ds_sub.to_csv(filename, index=True, header=True, index_label='row_id')

# Run the per-cell classifier over the whole grid.
preds = process_grid(df_train, df_test)

# Release the large frames before building the submission.
# After this line `df_train` and `df_test` no longer exist, so this cell
# cannot be re-run (and later cells cannot use them) without re-executing
# the loading / feature cell first.
del df_train, df_test

generate_sub(preds)
print('done')

NameError: name 'df_train' is not defined

In [3]:
# NOTE(review): the preceding cell ends with `del df_train, df_test`, so on a
# top-to-bottom run this lookup raises NameError; re-run the loading / feature
# cell first if the frame is still needed here.
df_train.head()

NameError: name 'df_train' is not defined