In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Render floats with three decimal places in pandas output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

%matplotlib inline

In [3]:
# Load the pre-generated train/test feature (X) and target (Y) objects from disk.
# NOTE(review): unpickling can execute arbitrary code; only load files that come
# from a trusted source.
train_X = pd.read_pickle('generated_files/train_X.pkl')
train_Y = pd.read_pickle('generated_files/train_Y.pkl')

test_X = pd.read_pickle('generated_files/test_X.pkl')
test_Y = pd.read_pickle('generated_files/test_Y.pkl')

In [4]:
def reduce_train(train_X, train_Y):
    """Keep only the rows whose targets in ``train_Y`` sum to more than zero.

    The row mask is computed from ``train_Y`` alone and then applied to both
    frames, so the two results stay row-aligned.

    Parameters
    ----------
    train_X : pandas.DataFrame
        Feature frame.
    train_Y : pandas.DataFrame
        Target frame; its row-wise sum decides which rows are kept.

    Returns
    -------
    tuple
        ``(X_reduced, Y_reduced)``, each re-indexed from 0 with the old index
        discarded.
    """
    has_positive_target = train_Y.sum(axis=1).gt(0)
    kept_X, kept_Y = (
        frame[has_positive_target].reset_index(drop=True)
        for frame in (train_X, train_Y)
    )
    return kept_X, kept_Y

# Apply the same row filter to both splits (reduce_train is reused for the test split).
X_tr_reduced, Y_tr_reduced = reduce_train(train_X, train_Y)
X_ts_reduced, Y_ts_reduced = reduce_train(test_X, test_Y)

In [5]:
from src.metrics import mapk, transform_y

In [6]:
from sklearn.multiclass import OneVsRestClassifier
import xgboost

In [7]:
# Columns of train_X whose name contains 'FT_'; used below as the per-label targets.
prediction_features = [col for col in train_X.columns if 'FT_' in col]

def prepare_xgb_data(X):
    """Build the numeric feature frame that is fed to XGBoost.

    Drops the ``Row_Date`` and ``Customer_Code`` columns and replaces three
    text columns with integer codes:

    * ``Province_Name``: 1 for ``'MADRID'``, 0 for anything else.
    * ``Sex``: 1 for ``'V'``, 0 for anything else.
    * ``Segmentation``: 1 if the value contains ``'TOP'``, 0 if it contains
      ``'PARTICULARES'``, -1 otherwise. Missing or non-string values are also
      encoded as -1 instead of raising ``TypeError``.

    Every other column is passed through unchanged.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``Row_Date``, ``Customer_Code``, ``Province_Name``,
        ``Sex`` and ``Segmentation``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``X`` itself is not modified because ``drop`` is called
        without ``inplace``.
    """
    def encode_segmentation(value):
        # NaN/None are not strings; `'TOP' in value` would raise on them.
        if not isinstance(value, str):
            return -1
        if 'TOP' in value:
            return 1
        if 'PARTICULARES' in value:
            return 0
        return -1

    xgb_X = X.drop(['Row_Date', 'Customer_Code'], axis=1)

    # Vectorized equality gives the same 0/1 codes as a per-row lambda
    # (missing values compare unequal and therefore become 0).
    xgb_X['Province_Name'] = (xgb_X['Province_Name'] == 'MADRID').astype('int64')
    xgb_X['Sex'] = (xgb_X['Sex'] == 'V').astype('int64')
    xgb_X['Segmentation'] = xgb_X['Segmentation'].apply(encode_segmentation).astype('int64')

    return xgb_X

In [8]:
# Wrap a default XGBClassifier in one-vs-rest and fit it on the reduced training split.
# The target is the element-wise sum of the 'FT_' feature columns and Y_tr_reduced.
# NOTE(review): presumably that sum is the product state after the change — confirm.
model = OneVsRestClassifier(xgboost.XGBClassifier())
model.fit(prepare_xgb_data(X_tr_reduced), X_tr_reduced[prediction_features] + Y_tr_reduced)

  str(classes[c]))
  str(classes[c]))


OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsample_bynode=1,
                                            colsample_bytree=1, gamma=0,
                                            learning_rate=0.1, max_delta_step=0,
                                            max_depth=3, min_child_weight=1,
                                            missing=None, n_estimators=100,
                                            n_jobs=1, nthread=None,
                                            objective='binary:logistic',
                                            random_state=0, reg_alpha=0,
                                            reg_lambda=1, scale_pos_weight=1,
                                            seed=None, silent=None, subsample=1,
                                            verbosity=1),
                    n_jobs=None)

In [9]:
# Hard label predictions of the one-vs-rest model on the reduced test split.
try_y = model.predict(prepare_xgb_data(X_ts_reduced))

In [10]:
# Score the hard predictions with mapk at k=7 (presumably mean average precision at k — confirm in src.metrics).
# NOTE(review): the model was fit on X[prediction_features] + Y, but the reference
# here is Y_ts_reduced alone — confirm this comparison is intended.
mapk(transform_y(Y_ts_reduced, thresh=0.01), transform_y(pd.DataFrame(try_y), thresh=0.01), k=7)

0.44690843732717783

In [11]:
def predict_row(y_row, x_row, n_labels, thresh):
    """Rank the label positions of one row, ignoring labels already set in ``x_row``.

    Parameters
    ----------
    y_row : numpy.ndarray
        Per-label scores for the row.
    x_row : numpy.ndarray
        Current per-label values; any position that is truthy after a cast to
        bool has its score zeroed before ranking.
    n_labels : int
        Maximum number of positions to return.
    thresh : float
        Minimum masked score a position needs to be kept.

    Returns
    -------
    numpy.ndarray
        Label positions ordered by descending masked score, at most
        ``n_labels`` long, restricted to scores ``>= thresh``.
    """
    not_set = np.logical_not(x_row.astype('bool'))
    masked_scores = y_row * not_set
    # Negate so that ascending argsort yields a descending ranking.
    top_positions = np.argsort(-masked_scores)[:n_labels]
    passes_threshold = masked_scores[top_positions] >= thresh
    return top_positions[passes_threshold]

# Per-label probabilities for the reduced test split; then, for each row, keep up to
# 7 label positions whose masked probability is at least 0.0001 (see predict_row).
probas = model.predict_proba(prepare_xgb_data(X_ts_reduced))
try_y2 = [predict_row(y_row, x_row, 7, 0.0001) for y_row, x_row in zip(probas, X_ts_reduced[prediction_features].values)]

In [12]:
# Score the ranked label positions from predict_row against the test targets (k=7).
mapk(transform_y(Y_ts_reduced, thresh=0.01), try_y2, k=7)

0.7915140593494846

In [13]:
def fit_model(train_X, train_Y):
    """Train one default ``XGBClassifier`` per entry of ``prediction_features``.

    For each target column, the column is removed from the prepared feature
    frame and the classifier is fit on the remaining columns. The label is the
    sum of the prepared feature column and the matching ``train_Y`` column.

    Parameters
    ----------
    train_X : pandas.DataFrame
        Raw features; passed through ``prepare_xgb_data`` once.
    train_Y : pandas.DataFrame
        Targets, indexed by the names in ``prediction_features``.

    Returns
    -------
    list
        Fitted classifiers, in the order of ``prediction_features``.
    """
    prepared = prepare_xgb_data(train_X)

    def train_one(target_col):
        # The target column must not appear among its own model's inputs.
        inputs = prepared.drop(target_col, axis=1)
        labels = prepared[target_col] + train_Y[target_col]

        classifier = xgboost.XGBClassifier()
        classifier.fit(inputs, labels)
        return classifier

    return [train_one(target_col) for target_col in prediction_features]

def predict_proba(models, X):
    """Collect ``predict_proba`` outputs from the per-target models.

    Each model is paired with its entry in ``prediction_features``; that column
    is dropped from the prepared features before the model is queried.

    Parameters
    ----------
    models : list
        Fitted classifiers, in the order of ``prediction_features``.
    X : pandas.DataFrame
        Raw features; passed through ``prepare_xgb_data`` once.

    Returns
    -------
    numpy.ndarray
        The stacked per-model outputs with all axes reversed.
        NOTE(review): assuming each model returns (n_rows, n_classes), the
        result is (n_classes, n_rows, n_models) — confirm.
    """
    prepared = prepare_xgb_data(X)

    stacked = np.array([
        classifier.predict_proba(prepared.drop(target_col, axis=1))
        for target_col, classifier in zip(prediction_features, models)
    ])

    return stacked.T

In [14]:
# Fit one independent classifier per prediction feature on the reduced training split.
models = fit_model(X_tr_reduced, Y_tr_reduced)

In [15]:
# predict_proba returns the stacked per-model outputs with axes reversed; index 1 on
# the first axis is presumably the positive-class probability per row and label — confirm.
probas2 = predict_proba(models, X_ts_reduced)[1]
# Same ranking and scoring as for the one-vs-rest model, with a lower threshold (0.000001).
try_y3 = [predict_row(y_row, x_row, 7, 0.000001) for y_row, x_row in zip(probas2, X_ts_reduced[prediction_features].values)]
mapk(transform_y(Y_ts_reduced, thresh=0.01), try_y3, k=7)

0.7741923173366584