In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
# Suppress every Python warning for the rest of the session. Note that this
# also hides pandas dtype warnings and library deprecation notices.
warnings.filterwarnings('ignore')

# Show floats with exactly three decimal places in pandas output.
pd.set_option('display.float_format', '{:.3f}'.format)

# Apply seaborn's default styling to all matplotlib figures drawn afterwards.
sns.set()

%matplotlib inline

from src.metrics import mapk, transform_y

from sklearn.multiclass import OneVsRestClassifier
import xgboost

In [27]:
# Load the three pickled frames produced by an earlier pipeline step:
# the inputs later passed to fit_model (train_X, train_Y) and the frame the
# final predictions are made on (last_df).
# NOTE: pickle files execute code on load; only read files this project wrote.
train_X, train_Y, last_df = map(
    pd.read_pickle,
    (
        'generated_files/entire/train_X.pkl',
        'generated_files/entire/train_Y.pkl',
        'generated_files/entire/last_month_data.pkl',
    ),
)

In [34]:
from src.clean_dataset import get_prediction_features
from src.model import get_xgb_model, fit_model, predict_proba, predict_ordered_list, evaluate_result

In [32]:
# Create the model object with the project helper from src.model. The call
# takes no arguments, so any configuration lives inside that helper.
# NOTE(review): presumably an XGBoost-based estimator, judging by the helper
# name; confirm in src/model.py.
model = get_xgb_model()

In [33]:
# Fit the model on the loaded training data.
# NOTE(review): the return value is discarded, so this assumes fit_model
# trains `model` in place; confirm in src/model.py.
fit_model(model, train_X, train_Y)

In [36]:
from joblib import dump, load
# Serialize `model` to disk with joblib. The value shown as the cell output is
# the list of file names that dump() wrote.
dump(model, 'generated_files/final_model.joblib')

['generated_files/final_model.joblib']

In [37]:
# Read the model back from the joblib file written by the preceding dump()
# call; all predictions below use this reloaded object rather than `model`.
final_model = load('generated_files/final_model.joblib')

In [38]:
# Derive the prediction feature collection from train_X via the project helper.
# Further down it is passed to predict_proba and indexed by position, with each
# entry translated through reverse_trans_dict.
prediction_features = get_prediction_features(train_X)

In [39]:
# Score the last-month frame with the reloaded model.
# NOTE(review): presumably yields one probability per customer and per entry of
# prediction_features; confirm the shape in src/model.py.
last_month_probas = predict_proba(final_model, last_df, prediction_features)

In [42]:
# Convert the probabilities into per-customer lists of positions into
# prediction_features (that is how the result is consumed when final_result is
# built). NOTE(review): ordering is assumed to be most-likely-first; confirm
# in src/model.py.
last_month_ordered_lists = predict_ordered_list(last_month_probas)

In [52]:
from src.load_dataset import get_feature_translation_dict
# Load the two translation mappings from the CSV. Only reverse_trans_dict is
# used in the rest of this notebook, to map prediction_features entries to the
# product column names shown in final_result.
trans_dict, reverse_trans_dict = get_feature_translation_dict("static/feature_translation.csv")

In [53]:
# Translate every list of feature positions returned by predict_ordered_list
# into product names: position -> prediction_features entry ->
# reverse_trans_dict value.
recommended_rows = []
for ranked_positions in last_month_ordered_lists:
    recommended_rows.append(
        [reverse_trans_dict[prediction_features[pos]] for pos in ranked_positions]
    )

# Rows are labelled by Customer_Code; the columns are the list positions
# (0, 1, 2, ...).
final_result = pd.DataFrame(recommended_rows, index=last_df['Customer_Code'])

In [54]:
# Save the recommendation table to disk, then show it as the cell output.
final_result.to_pickle('generated_files/final_result.pkl')
final_result

Unnamed: 0_level_0,0,1,2,3,4,5,6
Customer_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
15889,ind_recibo_ult1,ind_ecue_fin_ult1,ind_nom_pens_ult1,ind_nomina_ult1,ind_dela_fin_ult1,ind_ctop_fin_ult1,ind_cno_fin_ult1
15890,ind_cco_fin_ult1,ind_reca_fin_ult1,ind_ctma_fin_ult1,ind_valo_fin_ult1,ind_dela_fin_ult1,ind_ctop_fin_ult1,ind_fond_fin_ult1
15892,ind_nom_pens_ult1,ind_cno_fin_ult1,ind_nomina_ult1,ind_ctop_fin_ult1,ind_fond_fin_ult1,ind_ctpp_fin_ult1,ind_ctma_fin_ult1
15893,ind_cco_fin_ult1,ind_tjcr_fin_ult1,ind_recibo_ult1,ind_ecue_fin_ult1,ind_cno_fin_ult1,ind_dela_fin_ult1,ind_nom_pens_ult1
15894,ind_cno_fin_ult1,ind_dela_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_fond_fin_ult1,ind_ctma_fin_ult1,ind_hip_fin_ult1
...,...,...,...,...,...,...,...
1553685,ind_cco_fin_ult1,ind_ctma_fin_ult1,ind_cno_fin_ult1,ind_recibo_ult1,ind_nom_pens_ult1,ind_nomina_ult1,ind_dela_fin_ult1
1553686,ind_cco_fin_ult1,ind_ctma_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_cno_fin_ult1,ind_recibo_ult1,ind_ecue_fin_ult1
1553687,ind_cco_fin_ult1,ind_nom_pens_ult1,ind_cno_fin_ult1,ind_recibo_ult1,ind_nomina_ult1,ind_ctma_fin_ult1,ind_ecue_fin_ult1
1553688,ind_cco_fin_ult1,ind_ctma_fin_ult1,ind_recibo_ult1,ind_cno_fin_ult1,ind_nom_pens_ult1,ind_nomina_ult1,ind_ecue_fin_ult1


In [55]:
# Load the test CSV. Only its `ncodpers` column is used further down, to select
# which customers go into the result file.
test = pd.read_csv('dataset/test_ver2.csv')

In [59]:
# Display the loaded test frame for a visual check (pandas truncates the view).
test

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,indext,conyuemp,canal_entrada,indfall,tipodom,cod_prov,nomprov,ind_actividad_cliente,renta,segmento
0,2016-06-28,15889,F,ES,V,56,1995-01-16,0,256,1,...,N,N,KAT,N,1,28.000,MADRID,1,326124.90,01 - TOP
1,2016-06-28,1170544,N,ES,H,36,2013-08-28,0,34,1,...,N,,KAT,N,1,3.000,ALICANTE,0,,02 - PARTICULARES
2,2016-06-28,1170545,N,ES,V,22,2013-08-28,0,34,1,...,N,,KHE,N,1,15.000,"CORUÑA, A",1,,03 - UNIVERSITARIO
3,2016-06-28,1170547,N,ES,H,22,2013-08-28,0,34,1,...,N,,KHE,N,1,8.000,BARCELONA,0,148402.98,03 - UNIVERSITARIO
4,2016-06-28,1170548,N,ES,H,22,2013-08-28,0,34,1,...,N,,KHE,N,1,7.000,"BALEARS, ILLES",0,106885.80,03 - UNIVERSITARIO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929610,2016-06-28,660237,N,ES,V,55,1999-04-21,0,206,1,...,N,,KAT,N,1,28.000,MADRID,1,128643.57,01 - TOP
929611,2016-06-28,660238,N,ES,V,30,2006-11-29,0,115,1,...,N,,KFC,N,1,26.000,"RIOJA, LA",0,,02 - PARTICULARES
929612,2016-06-28,660240,N,ES,V,52,2006-11-29,0,115,1,...,N,,KBZ,N,1,33.000,ASTURIAS,1,72765.27,02 - PARTICULARES
929613,2016-06-28,660243,N,ES,V,32,2006-11-29,0,115,1,...,N,,KFC,N,1,33.000,ASTURIAS,0,147488.88,02 - PARTICULARES


In [66]:
# Pick the recommendation rows for the customers listed in the test file,
# then order them by customer code on a fresh 0..n-1 index.
# .loc raises KeyError if a test customer has no row in final_result.
test_customer_codes = test['ncodpers']
test_result = (
    final_result
    .loc[test_customer_codes]
    .reset_index()                  # index -> 'Customer_Code' column
    .sort_values('Customer_Code')
    .reset_index(drop=True)
)

In [81]:
# Build one space-separated product string per row from the seven ranked
# product columns (labelled 0..6 in test_result).
#
# Iterating over the underlying array replaces the row-wise DataFrame.apply,
# which is slow on large frames and returns a DataFrame instead of a Series
# when the frame is empty. The pd.notna guard skips NaN cells: NaN is truthy,
# so a plain truthiness check would pass it to str.join and raise TypeError.
# None and empty strings are skipped as before.
ranked_columns = list(range(7))
added_products = pd.Series(
    [
        ' '.join(product for product in row if pd.notna(product) and product)
        for row in test_result[ranked_columns].to_numpy()
    ],
    index=test_result.index,
    dtype=object,
)

In [83]:
# Customer codes taken from test_result, so they share its row order and index
# with added_products.
ncodpers = test_result['Customer_Code']

In [86]:
# Assemble the two-column result table: customer code first, then the
# space-separated product string. The Series are aligned on their index.
kaggle_result = pd.concat(
    [
        ncodpers.rename('ncodpers'),
        added_products.rename('added_products'),
    ],
    axis=1,
)

In [89]:
# Write the result table to CSV; index=False keeps the row index out of the
# file so it contains only the `ncodpers` and `added_products` columns.
kaggle_result.to_csv('generated_files/kaggle_result.csv', index=False)