In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and dtype warnings from pandas/xgboost.
warnings.filterwarnings(action='ignore')


def _three_decimal_float(value):
    """Format a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Use the formatter above whenever pandas renders a float.
pd.set_option('display.float_format', _three_decimal_float)

# Apply seaborn's default theme to the matplotlib figures created after this point.
sns.set()

# IPython magic: draw matplotlib figures inline in the notebook output area.
%matplotlib inline

from src.metrics import mapk, transform_y

from sklearn.multiclass import OneVsRestClassifier
import xgboost
from joblib import dump, load

In [2]:
# Load the feature frame, the target frame and the latest-month rows from the
# pickles under generated_files/entire/.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
train_X = pd.read_pickle('generated_files/entire/train_X.pkl')
train_Y = pd.read_pickle('generated_files/entire/train_Y.pkl')
last_df = pd.read_pickle('generated_files/entire/last_month_data.pkl')

# The row mask below is computed from train_X and then applied to train_Y as
# well. If the two frames did not share the same index, features and targets
# could end up paired with the wrong rows after the filter, so fail loudly.
if not train_X.index.equals(train_Y.index):
    raise ValueError('train_X and train_Y do not share the same index')

# Training window on Row_Date, inclusive on both ends.
TRAIN_START = '2015-05-28'
TRAIN_END = '2015-12-28'
is_train = train_X['Row_Date'].between(TRAIN_START, TRAIN_END)

train_X = train_X[is_train].reset_index(drop=True)
train_Y = train_Y[is_train].reset_index(drop=True)

# Columns removed from both the training rows and the rows to be scored.
DROPPED_COLUMNS = ['month', 'is_2015']
train_X = train_X.drop(columns=DROPPED_COLUMNS)
last_df = last_df.drop(columns=DROPPED_COLUMNS)

# lagged_features = [ft for ft in train_X.columns if 'LAG2' in ft]
# train_X.drop(lagged_features, axis=1, inplace=True)
# last_df.drop(lagged_features, axis=1, inplace=True)

# Disabled hold-out split, kept for reference: holds out the 2015-06-28 rows
# as a test set and trains on the remaining rows.
# is_test = train_X['Row_Date'] == '2015-06-28'
# is_train = ~is_test  # alternative tried: train_X['Row_Date'] == '2015-03-28'

# test_X = train_X[is_test].reset_index(drop=True)
# test_Y = train_Y[is_test].reset_index(drop=True)

# train_X = train_X[is_train].reset_index(drop=True)
# train_Y = train_Y[is_train].reset_index(drop=True)

In [3]:
from src.clean_dataset import get_prediction_features
from src.model import get_xgb_model, fit_model, predict_proba, predict_ordered_list, evaluate_result

In [4]:
# Create the model object through the project helper from src.model.
# NOTE(review): the model type and hyper-parameters are defined in get_xgb_model — confirm there.
model = get_xgb_model()

In [5]:
# Names of the train_X columns that contain the 'HAD_NOT_NOW_' marker.
# NOTE(review): had_fts is not referenced again in the visible notebook.
had_fts = list(filter(lambda column_name: 'HAD_NOT_NOW_' in column_name, train_X.columns))

# Feature list later handed to predict_proba and used to map predicted indices
# back to names; how it is derived is defined in src.clean_dataset.
prediction_features = get_prediction_features(train_X)

In [6]:
# Fit the model on the date-filtered feature and target frames.
# NOTE(review): the whole train_X is passed here, while prediction uses
# prediction_features; presumably fit_model selects its own columns — confirm in src.model.
fit_model(model, train_X, train_Y)

In [7]:
# probas = predict_proba(model, test_X, prediction_features)
# ordered_lists = predict_ordered_list(probas)
# evaluate_result(ordered_lists, test_Y)

In [8]:
# Persist the fitted model with joblib so it can be reloaded without retraining.
dump(model, 'generated_files/final_model.joblib')

['generated_files/final_model.joblib']

In [9]:
# Reload the model from the file written above; the predictions below use this
# reloaded copy rather than the in-memory `model`.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
final_model = load('generated_files/final_model.joblib')

In [10]:
# Score the latest-month rows in consecutive slices of at most `batch_size`
# rows, then concatenate the per-slice outputs in their original order.
batch_size = 200000
probas_batches = []

for start in range(0, len(last_df), batch_size):
    chunk = last_df.iloc[start:start + batch_size]
    probas_batches.append(predict_proba(final_model, chunk, prediction_features))

last_month_probas = pd.concat(probas_batches)

In [11]:
# Convert the predicted probabilities into one ordered list per row.
# NOTE(review): the elements are used below as positions into prediction_features;
# the ordering and list length are defined by predict_ordered_list — confirm in src.model.
last_month_ordered_lists = predict_ordered_list(last_month_probas)

In [12]:
from src.load_dataset import get_feature_translation_dict
# Load the forward and reverse feature-name mappings from the translation CSV.
# reverse_trans_dict is used below to turn entries of prediction_features into
# the names written to the result table.
trans_dict, reverse_trans_dict = get_feature_translation_dict("static/feature_translation.csv")

In [13]:
def _translated_names(idx_lst):
    """Map one row's ordered feature positions to their translated names."""
    return [reverse_trans_dict[prediction_features[idx]] for idx in idx_lst]


# One row per latest-month customer, indexed by Customer_Code; column k holds
# the k-th entry of that customer's ordered list.
final_result = pd.DataFrame(
    list(map(_translated_names, last_month_ordered_lists)),
    index=last_df['Customer_Code'],
)

In [14]:
# Save the per-customer result table, then display it as the cell output.
final_result.to_pickle('generated_files/final_result.pkl')
final_result

Unnamed: 0_level_0,0,1,2,3,4,5,6
Customer_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
15889,ind_recibo_ult1,ind_dela_fin_ult1,ind_ecue_fin_ult1,ind_reca_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_ctop_fin_ult1
15890,ind_ctma_fin_ult1,ind_cco_fin_ult1,ind_dela_fin_ult1,ind_reca_fin_ult1,ind_valo_fin_ult1,ind_fond_fin_ult1,ind_ctop_fin_ult1
15892,ind_cno_fin_ult1,ind_fond_fin_ult1,ind_nom_pens_ult1,ind_nomina_ult1,ind_plan_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1
15893,ind_cco_fin_ult1,ind_dela_fin_ult1,ind_tjcr_fin_ult1,ind_recibo_ult1,ind_ecue_fin_ult1,ind_cno_fin_ult1,ind_fond_fin_ult1
15894,ind_dela_fin_ult1,ind_cno_fin_ult1,ind_plan_fin_ult1,ind_fond_fin_ult1,ind_ctop_fin_ult1,ind_ctma_fin_ult1,ind_ctpp_fin_ult1
...,...,...,...,...,...,...,...
1553685,ind_cco_fin_ult1,ind_ctma_fin_ult1,ind_cno_fin_ult1,ind_deco_fin_ult1,ind_nomina_ult1,ind_dela_fin_ult1,ind_recibo_ult1
1553686,ind_cco_fin_ult1,ind_cno_fin_ult1,ind_ctma_fin_ult1,ind_recibo_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_deco_fin_ult1
1553687,ind_cco_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_cno_fin_ult1,ind_recibo_ult1,ind_ctma_fin_ult1,ind_ecue_fin_ult1
1553688,ind_cco_fin_ult1,ind_ctma_fin_ult1,ind_cno_fin_ult1,ind_recibo_ult1,ind_nomina_ult1,ind_deco_fin_ult1,ind_nom_pens_ult1


In [15]:
# Load the test set; the cell that builds test_result reads only its
# 'ncodpers' column, to select which customers need a prediction.
test = pd.read_csv('dataset/test_ver2.csv')

In [16]:
# Pick the result rows for the customers listed in the test file, turn the
# Customer_Code index back into a column, and order the rows by customer code.
test_customer_codes = test['ncodpers']
test_result = (
    final_result.loc[test_customer_codes]
    .reset_index()
    .sort_values('Customer_Code')
    .reset_index(drop=True)
)

In [17]:
# Number of result columns (labelled 0 .. TOP_K - 1) joined into the submission string.
TOP_K = 7
top_k_columns = list(range(TOP_K))

# Build one space-separated string per customer, skipping falsy cells
# (None or empty string). Iterating over plain row tuples avoids constructing a
# pandas Series for every row, which DataFrame.apply(axis=1) does and which is
# slow on a large test set.
added_products = pd.Series(
    [
        ' '.join(product for product in row if product)
        for row in test_result[top_k_columns].itertuples(index=False, name=None)
    ],
    index=test_result.index,
)

In [18]:
# Customer codes in the same row order as test_result, used as the submission id column.
ncodpers = test_result['Customer_Code']

In [19]:
# Assemble the two-column submission table: customer id and its joined
# recommendation string.
submission_columns = {
    'ncodpers': ncodpers,
    'added_products': added_products,
}
kaggle_result = pd.DataFrame(submission_columns)

In [20]:
# Write the submission file without the DataFrame index column.
kaggle_result.to_csv('generated_files/kaggle_result6.csv', index=False)