In [1]:
from operator import itemgetter

import numpy as np
import pandas as pd

import xgboost as xgb
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib

In [2]:
# Explicit column dtypes for the training CSV, grouped by NumPy type.
# Columns that are not listed here get whatever dtype pandas infers.
dtype = {
    column: np_type
    for np_type, columns in [
        (np.float32, [
            'amount',
            'home_add_lat', 'home_add_lon',
            'work_add_lat', 'work_add_lon',
            'lat', 'lon',
            'pct_same',
            'amount_mean', 'amount_sub', 'amount_div',
            'pct_term_counts', 'pct_mcc_g_counts',
            'amount_true',
        ]),
        (np.int32, [
            'city', 'country', 'currency', 'mcc', 'n_points', 'mcc_group',
        ]),
        (np.int8, [
            'access24h', 'is_office', 'is_atm', 'is_raiff', 'is_partner',
            'neg_home', 'neg_work', 'pos_home', 'pos_work',
            'has_car', 'has_children', 'has_pet',
            'day', 'day_of_week',
            'days_after_holiday', 'days_before_holiday',
            'is_dayoff', 'is_holiday', 'is_short', 'is_weekend',
            'month', 'week_of_year',
            'is_abroad',
        ]),
        (np.int64, ['timestamp']),
        # Identifiers are kept as raw Python objects (strings in the preview).
        (object, ['customer_id', 'terminal_id']),
    ]
    for column in columns
}

df = pd.read_csv(
    "data/sets/train_set_final.csv",
    sep=',',
    encoding='utf-8',
    dtype=dtype,
)
df.head()

Unnamed: 0,amount,city,country,currency,customer_id,home_add_lat,home_add_lon,mcc,terminal_id,work_add_lat,...,is_short,is_weekend,month,timestamp,week_of_year,pct_term_counts,pct_mcc_g_counts,is_abroad,amount_true,is_native
0,2.884034,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,11606fde0c814ce78e0d726e39a0a5ee,59.847,...,0,1,7,1500066000,28,0.013889,0.138889,0,1914.140991,1
1,2.775633,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,e9647a5e1eacfb06713b6af755ccc595,59.847,...,0,0,10,1509051600,43,0.013889,0.138889,0,1491.327759,1
2,3.708368,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,df06c1fcd3718a514535ae822785f716,59.847,...,0,0,10,1506978000,40,0.013889,0.013889,0,12773.453125,1
3,3.708368,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5992,df06c1fcd3718a514535ae822785f716,59.847,...,0,0,10,1506978000,40,0.013889,0.013889,0,12773.453125,1
4,2.787498,1,0,643,0dc0137d280a2a82d2dc89282450ff1b,59.851002,30.232,5261,6c5e5793ebc984fb72875feffff62854,59.847,...,0,1,9,1504904400,36,0.013889,0.138889,0,1532.631592,1


In [3]:
def is_inside_area(lat_pred, lon_pred, lat_true, lon_true, radius=0.02):
    """Tell whether predicted points lie within `radius` of the true points.

    The distance is the plain Euclidean distance on the raw coordinate
    values (no geodesic correction); the boundary itself counts as inside.
    Works element-wise on scalars, NumPy arrays or pandas Series.

    Parameters
    ----------
    lat_pred, lon_pred : predicted coordinates.
    lat_true, lon_true : reference coordinates.
    radius : maximum allowed distance, in the same units as the coordinates.

    Returns
    -------
    Boolean (or boolean array/Series) that is True where the squared
    distance is <= radius ** 2.  NaN inputs compare as False.
    """
    d_lat = lat_pred - lat_true
    d_lon = lon_pred - lon_true
    squared_distance = np.power(d_lat, 2) + np.power(d_lon, 2)
    return squared_distance <= radius ** 2

In [4]:
# Build the two targets: does the transaction point fall inside the default
# radius around the customer's home / work address?  Only rows where both
# address coordinates are present are evaluated; the assignment aligns on the
# index, so every other row ends up as NaN in the target column.
for target, addr_lat, addr_lon in (
    ("is_home", "home_add_lat", "home_add_lon"),
    ("is_work", "work_add_lat", "work_add_lon"),
):
    mask = df.loc[:, [addr_lat, addr_lon]].notnull().all(axis=1)
    df[target] = is_inside_area(
        df.loc[mask, "lat"], df.loc[mask, "lon"],
        df.loc[mask, addr_lat], df.loc[mask, addr_lon]
    )

In [5]:
# Every column is a model input except the address coordinates the targets
# are derived from, the targets themselves and the raw identifiers.
non_feature_columns = (
    'home_add_lon', 'home_add_lat',
    'work_add_lon', 'work_add_lat',
    'is_work', 'is_home',
    'customer_id', 'terminal_id',
)

features = list(df.columns)
for name in non_feature_columns:
    # list.remove raises ValueError if an expected column is missing.
    features.remove(name)
features

[u'amount',
 u'city',
 u'country',
 u'currency',
 u'mcc',
 u'access24h',
 u'is_office',
 u'lat',
 u'lon',
 u'n_points',
 u'is_atm',
 u'is_raiff',
 u'is_partner',
 u'pct_same',
 u'neg_home',
 u'neg_work',
 u'pos_home',
 u'pos_work',
 u'mcc_group',
 u'has_car',
 u'has_children',
 u'has_pet',
 u'amount_mean',
 u'amount_sub',
 u'amount_div',
 u'day',
 u'day_of_week',
 u'days_after_holiday',
 u'days_before_holiday',
 u'is_dayoff',
 u'is_holiday',
 u'is_short',
 u'is_weekend',
 u'month',
 u'timestamp',
 u'week_of_year',
 u'pct_term_counts',
 u'pct_mcc_g_counts',
 u'is_abroad',
 u'amount_true',
 u'is_native']

In [6]:
# Unfitted template classifiers, one independent instance per target.
model_0 = {
    target: xgb.XGBClassifier(n_estimators=200, n_jobs=3, max_depth=7)
    for target in ('is_home', 'is_work')
}

In [7]:
def choose_best(group, pred):
    """Return the coordinates of the row with the highest value in `pred`.

    Parameters
    ----------
    group : DataFrame holding at least the columns 'lat', 'lon' and `pred`.
    pred : name of the column to maximise.

    Returns
    -------
    The 'lat' and 'lon' entries of the row whose `pred` value is largest
    (first such row on ties, as given by `idxmax`).
    """
    coordinate_columns = ['lat', 'lon']
    best_label = group[pred].idxmax()
    return group.loc[best_label, coordinate_columns]


def modify_prediction(df, y_pred, col):
    """Pick, for every customer, the point with the highest predicted score.

    Parameters
    ----------
    df : DataFrame with at least the columns 'customer_id', 'lat', 'lon'.
        It is not modified.
    y_pred : one score per row of `df`, in row order.
    col : target name; only used to name the temporary score column.

    Returns
    -------
    DataFrame with columns 'customer_id', 'lat', 'lon': one row per
    customer, ordered by customer_id, holding the coordinates of that
    customer's top-scored row (first row on ties).
    """
    cols = ['customer_id', 'lat', 'lon']
    pred = 'pred:{}'.format(col)

    # reset_index returns a fresh frame, so adding the score column neither
    # touches the caller's data nor triggers SettingWithCopyWarning; the
    # positional index also keeps the lookup below unambiguous when the
    # incoming index contains duplicate labels.
    scored = df.loc[:, cols].reset_index(drop=True)
    scored[pred] = y_pred

    # One grouped idxmax instead of a Python-level apply per customer.
    best_rows = scored.groupby('customer_id')[pred].idxmax()
    return scored.loc[best_rows.values, cols].reset_index(drop=True)


def score(df, y_pred, col):
    """Share of customers whose chosen point falls near their true address.

    Parameters
    ----------
    df : DataFrame with 'customer_id', 'lat', 'lon' and the address
        columns that belong to `col`.
    y_pred : one score per row of `df`, in row order.
    col : 'is_home' or 'is_work'; selects which address columns are the
        ground truth.  Any other value raises KeyError.

    Returns
    -------
    Fraction of customers for which the point picked by
    `modify_prediction` is inside the default radius of `is_inside_area`
    around the per-customer median address.
    """
    address_columns = {
        'is_home': ('home_add_lon', 'home_add_lat'),
        'is_work': ('work_add_lon', 'work_add_lat')
    }
    true_lon, true_lat = address_columns[col]

    # Ground truth: one (median) address per customer.
    truth = df.loc[:, ['customer_id', true_lat, true_lon]]
    truth = truth.groupby('customer_id', as_index=False).median()

    # Prediction: the single best-scored point per customer.
    chosen = modify_prediction(df, y_pred, col)

    merged = pd.merge(truth, chosen, how='left', on='customer_id')
    hits = is_inside_area(
        merged[true_lat], merged[true_lon],
        merged['lat'], merged['lon']
    )
    return sum(hits) / float(len(hits))

In [8]:
# Fitted classifiers, keyed by target name.
model = dict()

# Seed NumPy's global RNG so the customer shuffles below are repeatable.
np.random.seed(8888)

# Both targets are trained: later cells read model['is_home'] as well as
# model['is_work'], so the loop must not stop after the first target.
for col in ['is_home', 'is_work']:
    # Rows without a label (NaN target) cannot be used for this target.
    mask = df.loc[:, col].notnull()

    # Split by customer rather than by row: 90% of the customers go to
    # training, the remaining 10% to validation.
    customers = df.loc[mask, 'customer_id'].unique()
    np.random.shuffle(customers)

    border = int(len(customers) * 0.9)
    cust_train, cust_valid = set(customers[:border]), set(customers[border:])

    # Vectorised membership test instead of a Python lambda per row.
    mask_train = mask & df["customer_id"].isin(cust_train)
    mask_valid = mask & df["customer_id"].isin(cust_valid)

    # Series.sum() counts the True entries without iterating in Python.
    nb_train = int(mask_train.sum())
    nb_valid = int(mask_valid.sum())

    print("Training: {}".format(col))
    print("nb_train = {}; nb_valid = {}; nb_total = {}".format(
        nb_train, nb_valid, nb_train + nb_valid))

    # Clone the template so model_0 itself stays unfitted.
    clf = sklearn.base.clone(model_0[col])
    clf.fit(
        df.loc[mask_train, features], df.loc[mask_train, col],
        eval_metric='logloss',
        # The validation set is listed last; per the training log that is
        # the one used for early stopping.
        eval_set=[
            (df.loc[mask_train, features], df.loc[mask_train, col]),
            (df.loc[mask_valid, features], df.loc[mask_valid, col])
        ],
        early_stopping_rounds=10,
        verbose=1
    )
    model[col] = clf

    print("Fitted...")

    # Column 1 of predict_proba is passed on as the per-row score.
    y_pred = clf.predict_proba(df.loc[mask_train, features])
    print("Train score: {}".format(score(df.loc[mask_train], y_pred[:, 1], col)))

    y_pred = clf.predict_proba(df.loc[mask_valid, features])
    print("Valid score: {}".format(score(df.loc[mask_valid], y_pred[:, 1], col)))

Training: is_home
nb_train = 1943287; nb_valid = 209724; nb_total = 2153011
[0]	validation_0-logloss:0.655647	validation_1-logloss:0.658514
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[1]	validation_0-logloss:0.624969	validation_1-logloss:0.630598
[2]	validation_0-logloss:0.599442	validation_1-logloss:0.607496
[3]	validation_0-logloss:0.578076	validation_1-logloss:0.588096
[4]	validation_0-logloss:0.559989	validation_1-logloss:0.571949
[5]	validation_0-logloss:0.544498	validation_1-logloss:0.558207
[6]	validation_0-logloss:0.531377	validation_1-logloss:0.546768
[7]	validation_0-logloss:0.520027	validation_1-logloss:0.536661
[8]	validation_0-logloss:0.510374	validation_1-logloss:0.528331
[9]	validation_0-logloss:0.502031	validation_1-logloss:0.521332
[10]	validation_0-logloss:0.494543	validation_1-logloss:0.515341
[11]	validation_0-logloss:0.488096	validation_1-loglos

In [13]:
# Feature importances of the home classifier, most important first.
home_importances = zip(features, model['is_home'].feature_importances_)
for name, weight in sorted(home_importances, key=itemgetter(1), reverse=True):
    print("{}\t{}".format(name, weight))

lat	0.157819390297
amount_mean	0.152092516422
lon	0.150220260024
n_points	0.0915198251605
pct_mcc_g_counts	0.0812775343657
city	0.0807268694043
pct_term_counts	0.0796255543828
pct_same	0.0470264330506
amount	0.0183920711279
mcc	0.0183920711279
is_native	0.0174008812755
has_car	0.0134361237288
has_children	0.010462555103
has_pet	0.010462555103
timestamp	0.00914096925408
day_of_week	0.00792951509356
mcc_group	0.00704845832661
pos_home	0.00671806186438
amount_sub	0.00561673985794
days_after_holiday	0.00528634339571
day	0.00429515400901
days_before_holiday	0.00396475754678
amount_div	0.00319383270107
is_office	0.0024229073897
is_atm	0.0024229073897
neg_home	0.00231277523562
pos_work	0.00231277523562
is_raiff	0.00220264308155
access24h	0.00198237877339
is_partner	0.00176211458165
currency	0.000770925136749
month	0.000770925136749
is_dayoff	0.000660792924464
neg_work	0.000330396462232
country	0.0
is_holiday	0.0
is_short	0.0
is_weekend	0.0
week_of_year	0.0
is_abroad	0.0
amount_true	0.0


In [10]:
# Feature importances of the work classifier, most important first.
work_importances = zip(features, model['is_work'].feature_importances_)
for name, weight in sorted(work_importances, key=itemgetter(1), reverse=True):
    print("{}\t{}".format(name, weight))

lon	0.156432896852
lat	0.155687987804
amount_mean	0.146642550826
n_points	0.0968394204974
pct_term_counts	0.0940725728869
pct_mcc_g_counts	0.0933276563883
city	0.0477812066674
pct_same	0.0405448563397
mcc	0.0203256364912
amount	0.0190486330539
has_car	0.0139406193048
is_native	0.0123443650082
day_of_week	0.0111737791449
has_children	0.0108545282856
has_pet	0.00936469063163
amount_sub	0.00904543977231
timestamp	0.00744918594137
days_before_holiday	0.00638501672074
access24h	0.00595934875309
is_raiff	0.00585293164477
mcc_group	0.00585293164477
days_after_holiday	0.0057465150021
day	0.00489517953247
amount_div	0.00436309445649
pos_work	0.00361817609519
pos_home	0.00329892523587
is_office	0.00212833890691
is_atm	0.00202192179859
neg_work	0.00148983718827
month	0.00106416945346
is_dayoff	0.000957752461545
is_partner	0.000638501660433
neg_home	0.000425667763921
currency	0.000319250830216
is_abroad	0.00010641694098
country	0.0
is_holiday	0.0
is_short	0.0
is_weekend	0.0
week_of_year	0.0
amount

In [11]:
# Persist the fitted home classifier; the cell output is joblib's return value.
joblib.dump(value=model['is_home'], filename='data/models/clf_home_01.model')

['data/models/clf_home_01.model']

In [12]:
# Persist the fitted work classifier; the cell output is joblib's return value.
joblib.dump(value=model['is_work'], filename='data/models/clf_work_01.model')

['data/models/clf_work_01.model']