In [164]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
# Silence every warning for the rest of the session. NOTE(review): this is a
# blanket filter, so warnings raised by any later cell are hidden as well.
warnings.filterwarnings('ignore')

# Render floats with three decimal places in pandas output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Apply seaborn's default plot styling.
sns.set()

%matplotlib inline

from src.metrics import mapk, transform_y

from sklearn.multiclass import OneVsRestClassifier
import xgboost

In [175]:
# Load the pre-generated train/test frames. The *_X frames are used below as model
# inputs and the *_Y frames as targets / evaluation labels.
# NOTE(review): unpickling can execute arbitrary code -- only load files that were
# produced by this project.
train_X = pd.read_pickle('generated_files/120K/train_X.pkl')
train_Y = pd.read_pickle('generated_files/120K/train_Y.pkl')

test_X = pd.read_pickle('generated_files/120K/test_X.pkl')
test_Y = pd.read_pickle('generated_files/120K/test_Y.pkl')

In [176]:
# Names of the 'FT_' columns of train_X; the helpers below read this list.
# The derived 'HAD_NOT_NOW_FT_*' columns that enhance_with_timeseries_features adds
# to train_X also contain 'FT_', so they are excluded explicitly: otherwise
# re-running this cell after the enhancement cell would silently pull the derived
# columns into the list and change every downstream result.
prediction_features = [
    col for col in train_X.columns
    if 'FT_' in col and not col.startswith('HAD_NOT_NOW_')
]

In [177]:
def enhance_with_timeseries_features(X):
    """Add one ``HAD_NOT_NOW_<feature>`` column per prediction feature to ``X``.

    For every row the new column is 1 when the running per-customer sum of the
    feature (up to and including that row) is positive while the feature's value
    in that row is falsy, and 0 otherwise.

    The running sum follows the existing row order of ``X`` within each
    ``Customer_Code`` group -- presumably rows are chronological per customer;
    TODO confirm against the data-generation step.

    Reads the module-level ``prediction_features`` list. ``X`` is modified in
    place and nothing is returned.
    """
    current = X[prediction_features]

    # Running total of each feature within a customer, in row order.
    running_total = current.groupby(X['Customer_Code']).cumsum()
    lapsed = (running_total > 0) & ~(current.astype('bool'))

    for feature in prediction_features:
        X['HAD_NOT_NOW_' + feature] = lapsed[feature].astype('int64')

In [178]:
# Adds the HAD_NOT_NOW_* columns to both frames in place (the function returns None).
enhance_with_timeseries_features(train_X)
enhance_with_timeseries_features(test_X)

In [179]:
def reduce_train(train_X, train_Y):
    """Keep only the rows whose label row sums to a positive value.

    The row mask is computed from ``train_Y`` (row-wise sum > 0) and applied to
    both frames, which are then re-indexed from 0.

    Returns
    -------
    tuple
        ``(X_reduced, Y_reduced)``, the filtered frames with a fresh index.
    """
    keep = train_Y.sum(axis=1).gt(0)
    return tuple(
        frame[keep].reset_index(drop=True)
        for frame in (train_X, train_Y)
    )

# Keep only the rows with a positive label sum, for the train and the test split alike.
X_tr_reduced, Y_tr_reduced = reduce_train(train_X, train_Y)
X_ts_reduced, Y_ts_reduced = reduce_train(test_X, test_Y)

In [185]:
# Province data provided by the project's src.province module.
# NOTE(review): no active code in the cells shown here reads province_df.
from src.province import get_province_data

province_df = get_province_data()

def prepare_xgb_data(X):
    """Build the numeric feature frame that is fed to XGBoost.

    Drops ``Row_Date`` and ``Customer_Code`` and replaces three string columns
    with float64 codes:

    * ``Province_Name``: 1 for ``'MADRID'``, otherwise 0
    * ``Sex``: 1 for ``'V'``, otherwise 0
    * ``Segmentation``: 1 if the value contains ``'TOP'``, 0 if it contains
      ``'PARTICULARES'``, otherwise -1

    ``X`` itself is left untouched; a new frame is returned.
    """
    def _equals(expected):
        # Element-wise 1/0 indicator for equality with `expected`.
        return lambda value: 1 if value == expected else 0

    def _segment_code(label):
        if 'TOP' in label:
            return 1
        if 'PARTICULARES' in label:
            return 0
        return -1

    # Disabled experiment (kept as a note): strip everything after the first comma
    # in Province_Name and look up 'gdp' / 'density' in province_df to build
    # Province_GDP / Province_Density columns.
    encoders = (
        ('Province_Name', _equals('MADRID')),
        ('Sex', _equals('V')),
        ('Segmentation', _segment_code),
    )

    xgb_X = X.drop(columns=['Row_Date', 'Customer_Code'])
    for column, encode in encoders:
        xgb_X[column] = xgb_X[column].apply(encode).astype('float64')

    return xgb_X

In [186]:
def predict_row(y_row, x_row, n_labels, thresh):
    """Return the indices of the best-scoring labels for a single row.

    Scores at positions where ``x_row`` is truthy are zeroed out. The remaining
    scores are ranked in descending order, cut to the first ``n_labels``
    positions, and positions whose masked score is below ``thresh`` are dropped.

    Parameters
    ----------
    y_row : array of scores, one per label
    x_row : array of the same length; truthy entries mask the matching score
    n_labels : maximum number of indices to return
    thresh : minimum masked score for an index to be kept

    Returns
    -------
    Array of label indices, highest masked score first.
    """
    allowed = np.logical_not(x_row.astype('bool'))
    masked = y_row * allowed
    ranked = np.argsort(-masked)[:n_labels]
    return ranked[masked[ranked] >= thresh]

def predict_order(probas, X, n_labels=7, thresh=0.0001):
    """Rank the candidate labels for every row of ``probas``.

    Each score row is paired with the matching row of ``X[prediction_features]``
    and passed to ``predict_row``, which masks the positions that are truthy in
    the feature row before ranking.

    Parameters
    ----------
    probas : iterable of 1-D score arrays, one per row of ``X``. The score
        positions are assumed to follow the order of ``prediction_features``
        -- TODO confirm against how the model targets are built.
    X : DataFrame containing the ``prediction_features`` columns.
    n_labels : maximum number of labels returned per row. Defaults to 7, the
        value that used to be hard-coded.
    thresh : minimum masked score for a label to be kept. Defaults to 0.0001,
        the value that used to be hard-coded.

    Returns
    -------
    list
        One array of label indices per row, best first.
    """
    feature_rows = X[prediction_features].values
    return [
        predict_row(y_row, x_row, n_labels, thresh)
        for y_row, x_row in zip(probas, feature_rows)
    ]

In [187]:
# Numeric model inputs for the reduced train and test splits.
xgb_X_tr = prepare_xgb_data(X_tr_reduced)
xgb_X_ts = prepare_xgb_data(X_ts_reduced)

In [188]:
# One XGBoost classifier per target column, trained in parallel on all cores.
model = OneVsRestClassifier(n_jobs=-1, estimator=xgboost.XGBClassifier(tree_method='hist', max_depth=6))
# Targets are the current feature values plus the label frame.
# NOTE(review): pandas aligns this addition on column labels, so any label column
# whose name is not in prediction_features becomes NaN -- confirm the names match.
model.fit(xgb_X_tr, X_tr_reduced[prediction_features] + Y_tr_reduced)

OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1, gamma=0,
                                            learning_rate=0.1, max_delta_step=0,
                                            max_depth=6, min_child_weight=1,
                                            missing=None, n_estimators=100,
                                            n_jobs=1, nthread=None,
                                            objective='binary:logistic',
                                            random_state=0, reg_alpha=0,
                                            reg_lambda=1, scale_pos_weight=1,
                                            seed=None, silent=None, subsample=1,
                                            tree_method='hist', verbosity=1),
                    n_jobs=-1)

In [189]:
# Score the whole reduced test split and turn the scores into ranked label lists.
probas = model.predict_proba(xgb_X_ts)
try_order = predict_order(probas, xgb_X_ts)
# mapk / transform_y come from src.metrics -- presumably mean average precision
# at k=7 against the thresholded labels; confirm in that module.
mapk(transform_y(Y_ts_reduced, thresh=0.01), try_order, k=7)

0.8473047617906742

In [196]:
# Repeat the evaluation on the rows dated 2016-04-28 only (named "last month" --
# presumably the most recent Row_Date in the test split; TODO confirm).
test_last_month = X_ts_reduced['Row_Date'] == '2016-04-28'
xgb_X_ts_last = prepare_xgb_data(X_ts_reduced[test_last_month])
Y_ts_last = Y_ts_reduced[test_last_month]

# NOTE: rebinds `probas` and `try_order` from the full-test-set evaluation cell.
probas = model.predict_proba(xgb_X_ts_last)
try_order = predict_order(probas, xgb_X_ts_last)
mapk(transform_y(Y_ts_last, thresh=0.01), try_order, k=7)

0.8748325819479665