# Santander Product Recommendation

### Part 4. Predict

Based on what we observed during evaluation, we now build a LightGBM model, use it to predict on the test data, and submit the result to Kaggle.<br>

In [1]:
import numpy as np
import pandas as pd
import pickle
import lightgbm as lgbm

In [3]:
# Load the metadata: feature column names, the target selector and the product list.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open('input/meta_data.pkl', 'rb') as meta_file:
    meta = pickle.load(meta_file)

features, target, prods = meta['features'], meta['target'], meta['prods']

with open('input/processed_data.pkl', 'rb') as data_file:
    data = pickle.load(data_file)

# 'trn_all' / 'tst_all' entries: the train and test frames used for the final fit
# (presumably the full data rather than a validation split -- confirm against the
# notebook that wrote this pickle).
trn, tst = data['trn_all'], data['tst_all']

# The containers are no longer needed once the pieces have been pulled out.
del meta, data

### 4-1.  Training with all data

In [8]:
# Increase the tree number as the number of all data also increased:
# scale the stored iteration count by (all rows) / (rows dated before 2016-05-28).
# Counting the True values of the mask avoids materialising a filtered copy of `trn`.
len_vld = int((trn['fecha_dato'] < '2016-05-28').sum())
len_all = trn.shape[0]

# Use a context manager so the file handle is closed deterministically.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open("model/lgbm.model.meta", "rb") as meta_file:
    best_iteration = pickle.load(meta_file)
best_iteration = int(best_iteration * len_all / len_vld)

In [11]:
# Wrap the feature frames in LightGBM Dataset objects.
dtrn = lgbm.Dataset(
    data=trn[features],
    label=trn['target'],
    feature_name=features,
)
# NOTE: the prediction cell below calls model.predict on tst[features] directly,
# so this Dataset is not consumed by the predict call.
dtst = lgbm.Dataset(data=tst[features], feature_name=features)

In [12]:
# Hyper-parameters handed to lgbm.train for the final model.
params_lgb = dict(
    # Task / objective
    task='train',
    boosting_type='gbdt',
    objective='multiclass',
    num_class=17,  # the prediction cell drops column index 16 of the 17 outputs
    # Evaluation metric
    metric={'multi_logloss'},
    is_training_metric=True,
    # Tree shape and learning settings
    max_bin=255,
    num_leaves=64,
    learning_rate=0.1,
    feature_fraction=0.8,
    min_data_in_leaf=10,
    min_sum_hessian_in_leaf=5,
)

In [None]:
# Fit the booster on the training Dataset for the scaled number of rounds,
# then write the fitted model to a text file in the working directory.
model = lgbm.train(
    params=params_lgb,
    train_set=dtrn,
    num_boost_round=best_iteration,
)
model.save_model("lgbm.all.model.txt")

In [15]:
# Report feature importance from the LightGBM model, once per importance type,
# as (feature, score) pairs ordered from highest to lowest score.
for importance_type in ("split", "gain"):
    print("Feature importance by {}:".format(importance_type))
    scores = model.feature_importance(importance_type)
    ranked = sorted(zip(features, scores), key=lambda pair: pair[1], reverse=True)
    for pair in ranked:
        print(pair)
    # Keep the blank line that separated the two reports in the original cell layout
    # out of the output: nothing is printed between the sections.

Feature importance by split:
('renta', 148843)
('age', 123217)
('antiguedad', 115031)
('fecha_alta_month', 86955)
('cod_prov', 71336)
('renta_prev', 67276)
('antiguedad_prev', 65075)
('age_prev', 64294)
('fecha_alta_year', 61312)
('canal_entrada', 55138)
('cod_prov_prev', 29179)
('sexo', 20993)
('canal_entrada_prev', 20213)
('segmento', 16269)
('ind_recibo_ult1_prev', 11540)
('ind_ecue_fin_ult1_prev', 11381)
('ind_cco_fin_ult1_prev', 11257)
('tiprel_1mes', 9494)
('ind_cno_fin_ult1_prev', 9432)
('segmento_prev', 8594)
('ind_reca_fin_ult1_prev', 8546)
('ind_tjcr_fin_ult1_prev', 8476)
('ind_actividad_cliente', 8229)
('ind_ctop_fin_ult1_prev', 7317)
('ind_dela_fin_ult1_prev', 6901)
('ind_ctpp_fin_ult1_prev', 6019)
('ind_nom_pens_ult1_prev', 5798)
('ind_valo_fin_ult1_prev', 5352)
('ind_nomina_ult1_prev', 5014)
('indext', 4858)
('sexo_prev', 4624)
('ind_fond_fin_ult1_prev', 3907)
('pais_residencia', 3897)
('tiprel_1mes_prev', 3789)
('ind_actividad_cliente_prev', 3322)
('ind_plan_fin_ult1_pre

### 4-2. Submission

In [56]:
# Prediction: class probabilities for every test row.
preds_tst = model.predict(tst[features], num_iteration=best_iteration)

# Previous-month columns for the target products, in the same order as prods[target].
prev_cols = [prod + '_prev' for prod in prods[target]]

# Remove output column 16 (presumably the extra "no product" class -- confirm),
# then subtract the '<product>_prev' values from the remaining scores.
# Subtracting a DataFrame from the array yields a DataFrame.
preds_tst_16 = np.delete(preds_tst, 16, axis=1) - tst[prev_cols]

In [68]:
# Create the submission file: one line per customer holding the customer id and
# the 7 highest-scoring product names, separated by spaces.
ncodpers_tst = tst['ncodpers'].values
prod_names = prods[target]

# The context manager guarantees the file is flushed and closed even if a row fails.
# NOTE(review): the output path (including its spelling) is kept unchanged so that
# anything reading this file keeps working.
with open('model/lgbm_ensemble_predicton', 'w') as submit_file:
    submit_file.write('ncodpers,added_products\n')

    for ncodper, pred in zip(ncodpers_tst, preds_tst_16.values):
        # Pair every score with its product name and keep the 7 best scores;
        # sorted() is stable, so ties keep their original product order.
        ranked = sorted(zip(pred, prod_names), key=lambda a: a[0], reverse=True)[:7]
        top_prods = [p for _, p in ranked]

        submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(top_prods)))

### Result

The submission scored 0.02552 on the private leaderboard and 0.02521 on the public leaderboard.