In [1]:
from operator import itemgetter

import numpy as np
import pandas as pd

import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib

In [2]:
# Read the comma-separated, UTF-8 encoded data set that the models will score,
# then preview its first rows.
df = pd.read_csv(
    "data/sets/test_set_final.csv",
    encoding='utf-8',
    sep=',',
)
df.head()

Unnamed: 0,amount,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,terminal_id,work_add_lat,...,is_holiday,is_short,is_weekend,month,timestamp,week_of_year,pct_term_counts,pct_mcc_g_counts,is_abroad,amount_true
0,2.211818,70,0,643,00fd410f5c580c8351cafa88d82b60f3,,,5411,ff0476dae4b098a7b16aabe93d4268df,,...,0,0,0,8,1503522000,34,0.117647,0.5,0,407.153551
1,1.331379,70,0,643,00fd410f5c580c8351cafa88d82b60f3,,,4111,7cfd9a60282459d4692ecc85b856072e,,...,0,0,1,8,1502485200,32,0.029412,0.088235,0,53.619044
2,1.331379,70,0,643,00fd410f5c580c8351cafa88d82b60f3,,,4111,7cfd9a60282459d4692ecc85b856072e,,...,0,0,1,8,1502485200,32,0.029412,0.088235,0,53.619044
3,2.608004,50,0,643,00fd410f5c580c8351cafa88d82b60f3,,,5411,7e5a532f0029861d8a9c4f0479b9450b,,...,0,0,1,6,1497646800,24,0.029412,0.5,0,1013.77978
4,1.916752,70,0,643,00fd410f5c580c8351cafa88d82b60f3,,,5411,2afe7d1bc61b86c449f413bdf2119032,,...,0,0,1,8,1502485200,32,0.029412,0.5,0,206.391756


In [3]:
def is_inside_area(lat_pred, lon_pred, lat_true, lon_true, radius=0.02):
    """Tell whether a predicted point lies within `radius` of the true point.

    The distance is the plain Euclidean distance on the raw (lat, lon)
    values, compared in squared form so no square root is taken. The
    arithmetic is element-wise, so scalars and array-likes both work.

    Returns True (or an array-like of booleans) where
    (lat_pred - lat_true)^2 + (lon_pred - lon_true)^2 <= radius^2.
    """
    d_lat = lat_pred - lat_true
    d_lon = lon_pred - lon_true
    squared_distance = np.power(d_lat, 2) + np.power(d_lon, 2)
    return squared_distance <= radius ** 2

In [4]:
# Columns of `df` that are left out of the model input.
non_feature_columns = (
    'home_add_lon', 'home_add_lat',
    'work_add_lon', 'work_add_lat',
    'customer_id', 'terminal_id',
)

# Start from every column and drop the excluded ones one by one;
# list.index raises ValueError if an expected column is absent.
features = list(df.columns)
for f in non_feature_columns:
    del features[features.index(f)]
features

[u'amount',
 u'city',
 u'country',
 u'currency',
 u'mcc',
 u'access24h',
 u'is_office',
 u'lat',
 u'lon',
 u'n_points',
 u'is_atm',
 u'is_raiff',
 u'is_partner',
 u'pct_same',
 u'neg_home',
 u'neg_work',
 u'pos_home',
 u'pos_work',
 u'mcc_group',
 u'has_car',
 u'has_children',
 u'has_pet',
 u'day',
 u'day_of_week',
 u'days_after_holiday',
 u'days_before_holiday',
 u'is_dayoff',
 u'is_holiday',
 u'is_short',
 u'is_weekend',
 u'month',
 u'timestamp',
 u'week_of_year',
 u'pct_term_counts',
 u'pct_mcc_g_counts',
 u'is_abroad',
 u'amount_true']

In [5]:
# One persisted classifier per target, keyed by target name.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
model = {}
model['is_home'] = joblib.load('data/models/clf_home.model')
model['is_work'] = joblib.load('data/models/clf_work.model')

In [6]:
def choose_best(group, pred):
    """Return the 'lat' and 'lon' values of the row with the largest `pred`.

    group: DataFrame holding at least the columns 'lat', 'lon' and `pred`.
    pred:  name of the column holding the score to maximise.

    The row is located by index label (idxmax), so on ties the first
    occurrence wins.
    """
    return group.loc[group[pred].idxmax(), ['lat', 'lon']]


def modify_prediction(df, y_pred, col):
    """Pick, for every customer, the row with the highest predicted score.

    df:     DataFrame with at least 'customer_id', 'lat' and 'lon' columns.
    y_pred: one score per row of `df` (array-like; a Series is aligned on
            `df`'s index, exactly as a column assignment would).
    col:    target name, used only to label the temporary score column.

    Returns a DataFrame with columns ['customer_id', 'lat', 'lon'], one row
    per customer, ordered by customer_id, with a fresh 0..n-1 index. On ties
    the first row of the customer wins. The caller's `df` is not modified.
    """
    cols = ['customer_id', 'lat', 'lon']
    pred = 'pred:{}'.format(col)

    # Work on an explicit copy so the score column is never written into a
    # selection of the caller's frame.
    scored = df.loc[:, cols].copy()
    scored[pred] = y_pred
    # Unique positional labels make the label-based lookup below unambiguous
    # even when the caller's index contains repeated labels.
    scored = scored.reset_index(drop=True)

    # Vectorized arg-max per customer instead of a Python-level apply per group.
    best_rows = scored.groupby('customer_id')[pred].idxmax()
    return scored.loc[best_rows.values, cols].reset_index(drop=True)


def score(df, y_pred, col):
    """Share of customers whose chosen point falls inside the reference area.

    df:     DataFrame with 'customer_id', 'lat', 'lon' and the reference
            coordinate columns for `col`.
    y_pred: one score per row of `df`, forwarded to modify_prediction.
    col:    'is_home' or 'is_work'; any other value raises KeyError.

    The reference point of a customer is the per-customer median of the
    matching *_add_lat / *_add_lon columns. Returns hits / customers as a
    float; raises ZeroDivisionError when there are no customers.
    """
    coordinate_columns = {
        'is_home': ('home_add_lon', 'home_add_lat'),
        'is_work': ('work_add_lon', 'work_add_lat')
    }
    lon, lat = coordinate_columns[col]

    # One reference row per customer.
    reference = df.loc[:, ['customer_id', lat, lon]]
    df_true = reference.groupby('customer_id', as_index=False).median()
    df_pred = modify_prediction(df, y_pred, col)

    # Left join: every customer with a reference row is kept.
    df_result = pd.merge(df_true, df_pred, how='left', on='customer_id')
    # The reference point is passed in the "pred" slots; the squared distance
    # is symmetric, so the argument order does not affect the outcome.
    result = is_inside_area(
        df_result[lat], df_result[lon],
        df_result['lat'], df_result['lon'])
    hits = sum(result)
    total = float(len(result))
    return hits / total

In [7]:
# Best (lat, lon) per customer for each target, keyed by target name.
df_preds = {}

for col in ['is_home', 'is_work']:
    clf = model[col]
    y_pred = clf.predict_proba(df.loc[:, features])
    # Column 1 of predict_proba is used as the row score
    # (presumably the positive class -- confirm against clf.classes_).
    df_preds[col] = modify_prediction(df, y_pred[:, 1], col)
    # Single-argument print call: same output on Python 2 and Python 3.
    print("Preticted {}".format(col))

Preticted is_home
Preticted is_work


In [8]:
# Preview the first five per-customer picks for the 'is_home' target.
df_preds['is_home'].head(n=5)

Unnamed: 0,customer_id,lat,lon
0,00021683ccb416637fe9a4cd35e4606e,55.041771,82.984329
1,0002d0f8a642272b41c292c12ab6e602,53.199818,50.173374
2,0004d182d9fede3ba2534b2d5e5ad27e,43.586273,39.724274
3,0008c2445518c9392cb356c5c3db3392,51.535133,46.046375
4,000b373cc4969c0be8e0933c08da67e1,56.232037,43.458107


In [9]:
# Preview the first five per-customer picks for the 'is_work' target.
df_preds['is_work'].head(n=5)

Unnamed: 0,customer_id,lat,lon
0,00021683ccb416637fe9a4cd35e4606e,55.023354,82.914726
1,0002d0f8a642272b41c292c12ab6e602,53.199818,50.173374
2,0004d182d9fede3ba2534b2d5e5ad27e,43.585316,39.723358
3,0008c2445518c9392cb356c5c3db3392,51.529232,46.019422
4,000b373cc4969c0be8e0933c08da67e1,55.830116,49.116356


In [10]:
# Combine the per-customer home and work picks into one row per customer.
# The outer join keeps customers present in only one of the two frames.
df_pred = pd.merge(
    df_preds['is_home'], df_preds['is_work'],
    on='customer_id', how='outer', suffixes=('_home', '_work'))

# Fill any gaps with the column median. The filled column is assigned back
# rather than calling fillna(..., inplace=True) on a column selection: that
# chained in-place form is deprecated and does not modify df_pred under
# pandas Copy-on-Write.
for coord in ['lat_work', 'lon_work', 'lat_home', 'lon_home']:
    df_pred[coord] = df_pred[coord].fillna(df_pred[coord].median())

# Rename to the output column names and fix the column order.
df_pred = df_pred.rename(columns={
    'customer_id': '_ID_',
    'lat_work': '_WORK_LAT_',
    'lon_work': '_WORK_LON_',
    'lat_home': '_HOME_LAT_',
    'lon_home': '_HOME_LON_'
})
df_pred = df_pred[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]
# Single-argument print call: same output on Python 2 and Python 3.
print(df_pred.shape[0])
df_pred.head()

9997


Unnamed: 0,_ID_,_WORK_LAT_,_WORK_LON_,_HOME_LAT_,_HOME_LON_
0,00021683ccb416637fe9a4cd35e4606e,55.023354,82.914726,55.041771,82.984329
1,0002d0f8a642272b41c292c12ab6e602,53.199818,50.173374,53.199818,50.173374
2,0004d182d9fede3ba2534b2d5e5ad27e,43.585316,39.723358,43.586273,39.724274
3,0008c2445518c9392cb356c5c3db3392,51.529232,46.019422,51.535133,46.046375
4,000b373cc4969c0be8e0933c08da67e1,55.830116,49.116356,56.232037,43.458107


In [11]:
# Save the per-customer predictions as comma-separated text, without the row index.
df_pred.to_csv(
    'prediction_01.csv',
    index=False,
    sep=',',
)