In [1]:
import codecs
from operator import itemgetter

import numpy as np
import pandas as pd

import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib

In [2]:
def is_inside_area(lat_pred, lon_pred, lat_true, lon_true, radius=0.02):
    return np.power(lat_pred - lat_true, 2) + np.power(lon_pred - lon_true, 2) <= radius ** 2

In [3]:
time_features = {u'day', u'day_of_week', u'days_after_holiday',
       u'days_before_holiday', u'is_dayoff', u'is_holiday', u'is_short',
       u'is_weekend', u'month', u'timestamp', u'week_of_year'}

In [4]:
df = pd.read_csv("data/sets/test_set_final.csv", sep=',', encoding='utf-8')
df.drop(labels=time_features, axis=1, inplace=True)
df.drop_duplicates(inplace=True)
print df.shape
df.head()

(1123852, 36)


Unnamed: 0,amount,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,terminal_id,work_add_lat,...,has_children,has_pet,amount_mean,amount_sub,amount_div,pct_term_counts,pct_mcc_g_counts,is_abroad,amount_true,is_native
0,2.211818,70,0,643,00fd410f5c580c8351cafa88d82b60f3,,,5411,ff0476dae4b098a7b16aabe93d4268df,,...,0,0,2.736013,-0.524195,0.808409,0.117647,0.5,0,407.153551,1
1,1.331379,70,0,643,00fd410f5c580c8351cafa88d82b60f3,,,4111,7cfd9a60282459d4692ecc85b856072e,,...,0,0,2.736013,-1.404634,0.486613,0.029412,0.088235,0,53.619044,1
3,2.608004,50,0,643,00fd410f5c580c8351cafa88d82b60f3,,,5411,7e5a532f0029861d8a9c4f0479b9450b,,...,0,0,2.736013,-0.12801,0.953213,0.029412,0.5,0,1013.77978,0
4,1.916752,70,0,643,00fd410f5c580c8351cafa88d82b60f3,,,5411,2afe7d1bc61b86c449f413bdf2119032,,...,0,0,2.736013,-0.819261,0.700564,0.029412,0.5,0,206.391756,1
6,1.981067,0,0,643,00fd410f5c580c8351cafa88d82b60f3,,,5814,ab4f00601ff1d949afc59ee3f804c79c,,...,0,0,2.736013,-0.754946,0.724071,0.029412,0.058824,0,239.335406,0


In [5]:
df["lat_pred"] = df["lon_pred"] = np.nan

In [6]:
features = [f for f in df.columns
    if f not in {'home_add_lon', 'home_add_lat',
                 'work_add_lon', 'work_add_lat',
                 'is_work', 'is_home',
                 'customer_id', 'terminal_id'}
            and f not in time_features]
features

[u'amount',
 u'city',
 u'country',
 u'currency',
 u'mcc',
 u'access24h',
 u'is_office',
 u'lat',
 u'lon',
 u'n_points',
 u'is_atm',
 u'is_raiff',
 u'is_partner',
 u'pct_same',
 u'neg_home',
 u'neg_work',
 u'pos_home',
 u'pos_work',
 u'mcc_group',
 u'has_car',
 u'has_children',
 u'has_pet',
 u'amount_mean',
 u'amount_sub',
 u'amount_div',
 u'pct_term_counts',
 u'pct_mcc_g_counts',
 u'is_abroad',
 u'amount_true',
 u'is_native',
 'lat_pred',
 'lon_pred']

In [7]:
model = {
    'is_home': joblib.load('data/models/clf_home_04.model'),
    'is_work': joblib.load('data/models/clf_work_04.model')
}

In [8]:
def choose_best(group):
    y_pred = group.loc[:, pred_columns].values
    index = np.unravel_index(np.argmax(y_pred, axis=None), y_pred.shape)
    lat, lon = group.iloc[index[0]].loc[["lat", "lon"]]
    lat_mul, lon_mul = multip[index[1]]
    lat += lat_mul * 0.02
    lon += lon_mul * 0.02
    return pd.Series([lat, lon], index=['lat', 'lon'])

In [9]:
df_pred = dict()

multip = [(lat_mul, lon_mul) for lat_mul in range(-2, 3) for lon_mul in range(-2, 3)]

for col in ['is_home', 'is_work']:
    for i, (lat_mul, lon_mul) in enumerate(multip):
        df.loc[:, "lat_pred"] = df.loc[:, "lat"] * lat_mul
        df.loc[:, "lon_pred"] = df.loc[:, "lon"] * lon_mul
        
        y_pred = model[col].predict(xgb.DMatrix(df.loc[:, features].values))
        df["pred@{}".format(i)] = y_pred
        
        print col, i, "predicted..."
        
    pred_columns = filter(lambda c: '@' in c, df.columns)
    df_pred[col] = df[['customer_id', 'lat', 'lon'] + pred_columns].groupby('customer_id').apply(choose_best)
    df_pred[col].reset_index(drop=False, inplace=True)

is_home 0 predicted...
is_home 1 predicted...
is_home 2 predicted...
is_home 3 predicted...
is_home 4 predicted...
is_home 5 predicted...
is_home 6 predicted...
is_home 7 predicted...
is_home 8 predicted...
is_home 9 predicted...
is_home 10 predicted...
is_home 11 predicted...
is_home 12 predicted...
is_home 13 predicted...
is_home 14 predicted...
is_home 15 predicted...
is_home 16 predicted...
is_home 17 predicted...
is_home 18 predicted...
is_home 19 predicted...
is_home 20 predicted...
is_home 21 predicted...
is_home 22 predicted...
is_home 23 predicted...
is_home 24 predicted...
is_work 0 predicted...
is_work 1 predicted...
is_work 2 predicted...
is_work 3 predicted...
is_work 4 predicted...
is_work 5 predicted...
is_work 6 predicted...
is_work 7 predicted...
is_work 8 predicted...
is_work 9 predicted...
is_work 10 predicted...
is_work 11 predicted...
is_work 12 predicted...
is_work 13 predicted...
is_work 14 predicted...
is_work 15 predicted...
is_work 16 predicted...
is_work 17 p

In [10]:
df_pred = pd.merge(df_pred['is_home'], df_pred['is_work'], on='customer_id', suffixes=('_home', '_work'))

df_pred['lat_work'].fillna(df_pred['lat_work'].median(), inplace=True)
df_pred['lon_work'].fillna(df_pred['lon_work'].median(), inplace=True)
df_pred['lat_home'].fillna(df_pred['lat_home'].median(), inplace=True)
df_pred['lon_home'].fillna(df_pred['lon_home'].median(), inplace=True)

df_pred.rename(columns={
        'customer_id': '_ID_',
        'lat_work': '_WORK_LAT_',
        'lon_work': '_WORK_LON_',
        'lat_home': '_HOME_LAT_',
        'lon_home': '_HOME_LON_'
    }, inplace=True)
df_pred = df_pred[['_ID_', '_WORK_LAT_', '_WORK_LON_', '_HOME_LAT_', '_HOME_LON_']]

print df_pred.shape[0]
df_pred.head()

9997


Unnamed: 0,_ID_,_WORK_LAT_,_WORK_LON_,_HOME_LAT_,_HOME_LON_
0,00021683ccb416637fe9a4cd35e4606e,54.896865,83.070207,54.385966,84.128541
1,0002d0f8a642272b41c292c12ab6e602,44.086955,42.848298,44.086846,42.832
2,0004d182d9fede3ba2534b2d5e5ad27e,43.61243,39.696073,43.632545,39.688855
3,0008c2445518c9392cb356c5c3db3392,51.573936,45.98549,51.577788,45.978105
4,000b373cc4969c0be8e0933c08da67e1,56.277917,43.885426,59.894464,30.36464


In [11]:
df_pred.to_csv('prediction_04.csv', sep=',', index=False)