# Decision trees tryouts on SPR data, inspired by Kaggle Forum "When less is more"

Load training and validation data as:

    month : [ Features | Targets | Difference | Last Choice Targets ]
    

In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.INFO)

import matplotlib.pylab as plt
%matplotlib inline

In [3]:
import sys
sys.path.append("../common")

from dataset import load_trainval, LC_TARGET_LABELS, TARGET_LABELS_FRQ, TARGET_LABELS_DIFF
from utils import to_yearmonth, TARGET_LABELS, TARGET_LABELS2
from utils import target_str_to_labels, decimal_to_dummies, targets_str_to_indices, targets_dec_to_indices

In [4]:
# Client-profile columns of the SPR data. get_XY() copies them into the feature
# matrix X, and the feature masks defined further down select subsets of them.
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

In [5]:
# Months to load. The loader log printed below shows that the month preceding
# each requested month is loaded as well.
# train_yearmonths_list = [201504, 201505, 201604]
train_yearmonths_list = [201505, 201602, 201605]
# train_yearmonths_list = [201505]
#val_yearmonth = [201605]
# Number of clients to keep (the loader logs "Select 150000 clients").
train_nb_clients = 150000
# train_nb_clients = 1500
# Validation loading is disabled: only `train_df` is produced, `val_df` is never assigned.
#train_df, val_df = load_trainval(train_yearmonths_list, val_yearmonth, train_nb_clients, val_nb_clients=1500)
train_df = load_trainval(train_yearmonths_list, train_nb_clients=train_nb_clients)

INFO:root:- Load training data : 
INFO:root:- Load data : [201504, 201505, 201601, 201602, 201604, 201605]
INFO:root:-- Select 150000 clients
INFO:root:- Number of lines with unknown data : 38
INFO:root:- Number of columns with nan : 9
INFO:root:-- Process date : 201505
INFO:root:-- Process date : 201602
INFO:root:-- Process date : 201605
INFO:root:-- Add logCount columns
INFO:root:-- Process month : 2015-04-28
INFO:root:-- Process month : 2015-05-28
INFO:root:-- Process month : 2016-01-28
INFO:root:-- Process month : 2016-02-28
INFO:root:-- Process month : 2016-04-28
INFO:root:-- Process month : 2016-05-28
INFO:root:-- Add logDecimal columns
INFO:root:-- Transform age/renta/logdiff
INFO:root:-- Add target values frequencies
INFO:root:-- Add target diff


Display loaded data

In [6]:
train_df[['fecha_dato', 'ncodpers'] + TARGET_LABELS_FRQ.tolist()].head(10)

Unnamed: 0,fecha_dato,ncodpers,ind_ahor_fin_ult1_frq,ind_aval_fin_ult1_frq,ind_cco_fin_ult1_frq,ind_cder_fin_ult1_frq,ind_cno_fin_ult1_frq,ind_ctju_fin_ult1_frq,ind_ctma_fin_ult1_frq,ind_ctop_fin_ult1_frq,...,ind_hip_fin_ult1_frq,ind_plan_fin_ult1_frq,ind_pres_fin_ult1_frq,ind_reca_fin_ult1_frq,ind_tjcr_fin_ult1_frq,ind_valo_fin_ult1_frq,ind_viv_fin_ult1_frq,ind_nomina_ult1_frq,ind_nom_pens_ult1_frq,ind_recibo_ult1_frq
210121,2015-04-28,15893,0.999866,0.999993,0.228525,0.999582,0.896008,0.988999,0.989548,0.834539,...,0.992401,0.988205,0.996991,0.932021,0.943835,0.033589,0.995103,0.931998,0.926746,0.839723
1051663,2015-05-28,15893,0.999866,0.999993,0.228525,0.999582,0.896008,0.988999,0.989548,0.834539,...,0.992401,0.988205,0.996991,0.932021,0.943835,0.033589,0.995103,0.931998,0.926746,0.839723
1638558,2016-01-28,15893,0.999866,0.999993,0.228525,0.999582,0.896008,0.988999,0.989548,0.834539,...,0.992401,0.988205,0.996991,0.932021,0.943835,0.033589,0.995103,0.931998,0.926746,0.839723
2663080,2016-02-28,15893,0.999866,0.999993,0.228525,0.999582,0.896008,0.988999,0.989548,0.834539,...,0.992401,0.988205,0.996991,0.932021,0.943835,0.033589,0.995103,0.931998,0.926746,0.839723
3532959,2016-04-28,15893,0.999866,0.999993,0.228525,0.999582,0.896008,0.988999,0.989548,0.834539,...,0.992401,0.988205,0.996991,0.932021,0.943835,0.033589,0.995103,0.931998,0.926746,0.839723
4338255,2016-05-28,15893,0.999866,0.999993,0.228525,0.999582,0.896008,0.988999,0.989548,0.834539,...,0.992401,0.988205,0.996991,0.932021,0.943835,0.033589,0.995103,0.931998,0.926746,0.839723
210119,2015-04-28,15895,0.999866,0.999993,0.771475,0.999582,0.103992,0.988999,0.989548,0.834539,...,0.992401,0.011795,0.996991,0.067979,0.056165,0.033589,0.995103,0.931998,0.926746,0.160277
1051665,2015-05-28,15895,0.999866,0.999993,0.771475,0.999582,0.103992,0.988999,0.989548,0.834539,...,0.992401,0.011795,0.996991,0.067979,0.056165,0.033589,0.995103,0.931998,0.926746,0.160277
1638556,2016-01-28,15895,0.999866,0.999993,0.771475,0.999582,0.896008,0.988999,0.989548,0.834539,...,0.992401,0.011795,0.996991,0.067979,0.056165,0.033589,0.995103,0.931998,0.926746,0.160277
2663078,2016-02-28,15895,0.999866,0.999993,0.771475,0.999582,0.896008,0.988999,0.989548,0.834539,...,0.992401,0.011795,0.996991,0.067979,0.056165,0.033589,0.995103,0.931998,0.926746,0.160277


Useful structures

In [7]:
def get_common_clients(df1, mask1, mask2, df2=None):
    """
    Flag the rows belonging to clients found in both of two row selections.

    The first selection is `df1[mask1]`. The second one is `df2[mask2]` when
    `df2` is given and `df1[mask2]` otherwise. Clients are identified by the
    'ncodpers' column.

    :param df1: DataFrame with a 'ncodpers' column
    :param mask1: boolean mask over the rows of `df1`
    :param mask2: boolean mask over the rows of `df2` (or of `df1` if `df2` is None)
    :param df2: optional second DataFrame with a 'ncodpers' column
    :return: a boolean Series over `df1` when `df2` is None, otherwise a pair of
        boolean Series `(over df1, over df2)`; True marks rows of a shared client
    """
    second_frame = df1 if df2 is None else df2

    first_ids = set(df1[mask1]['ncodpers'].unique())
    second_ids = set(second_frame[mask2]['ncodpers'].unique())
    shared_ids = list(first_ids & second_ids)

    in_first = df1['ncodpers'].isin(shared_ids)
    if df2 is None:
        return in_first
    return in_first, df2['ncodpers'].isin(shared_ids)

In [8]:
# Distinct 'fecha_dato' values present in the training frame.
# (With a validation frame loaded, the union of both frames' months would be used:
#  months = list(set(train_df['fecha_dato'].unique()) | set(val_df['fecha_dato'].unique())) )
months = train_df['fecha_dato'].unique()

# Lookup table: yearmonth key (as returned by to_yearmonth) -> 'fecha_dato' value
months_ym_map = dict((to_yearmonth(month), month) for month in months)

train_months = train_df['fecha_dato'].unique()
# val_months = val_df['fecha_dato'].unique()
    

### Train a model

In [9]:
from utils import get_added_products, remove_last_choice, apk, map7_score
from visualization import visualize_train_test, visualize_folds, compare_two_datasets, compare_folds, compare_folds2

In [10]:
target_features = ['targets_diff', 'targets_logdiff', 'targets_logcount2_diff', 'targets_logcount2', 'targets_logcount1', 'targets_logDec']

In [140]:
def get_XY(current_month, df1, next_year_month, df2, months_ym_map):
    """
    Build row-aligned feature / target / last-choice frames for one pair of months.

    Features are read from `df1` at `current_month`. Targets and last-choice columns
    are read from `df2` at `next_year_month`. The `TARGET_LABELS_FRQ` columns of the
    month preceding `next_year_month` in `df2` are appended to the features with a
    '_prev' suffix. Only clients present in all three row selections are kept.

    :param current_month: yearmonth key (e.g. 201505) of the feature rows in `df1`
    :param df1: DataFrame providing the feature rows
    :param next_year_month: yearmonth key (e.g. 201605) of the target rows in `df2`
    :param df2: DataFrame providing targets, last choices and previous-month frequencies
    :param months_ym_map: dict mapping yearmonth keys to 'fecha_dato' values
    :return: tuple `(X, Y, clients_last_choice)`. `Y` is None when `df2` has no usable
        target columns at `next_year_month`; `clients_last_choice` is None when `df2`
        has no `LC_TARGET_LABELS` columns.
    :raises KeyError: when one of the three months is missing from `months_ym_map`
    :raises AssertionError: when the selected rows are not aligned client by client
    """
    # Month preceding `next_year_month`. January has to roll back to December of the
    # previous year: a plain `- 1` would produce the non-existent key YYYY00.
    if next_year_month % 100 == 1:
        prev_year_month = next_year_month - 100 + 11
    else:
        prev_year_month = next_year_month - 1

    month_mask = df1['fecha_dato'] == months_ym_map[current_month]
    next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
    next_year_prev_month_mask = df2['fecha_dato'] == months_ym_map[prev_year_month]

    # get common clients from df1 at this month and df2 at next year month
    common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)
    # ... then keep only those that also have a row in df2 at the previous month
    common_clients_mask2, common_clients_mask3 = get_common_clients(df2, common_clients_mask2 & next_year_month_mask, next_year_prev_month_mask, df2)
    # Apply the same restriction to df1, so the three selections hold the same clients
    # (otherwise a client without a previous-month row breaks the alignment below).
    kept_clients = df2[common_clients_mask2]['ncodpers'].unique()
    common_clients_mask1 = common_clients_mask1 & df1['ncodpers'].isin(kept_clients)

    rows1 = common_clients_mask1 & month_mask
    rows2 = common_clients_mask2 & next_year_month_mask
    rows3 = common_clients_mask3 & next_year_prev_month_mask

    c1 = df1[rows1]['ncodpers'].values
    c2 = df2[rows2]['ncodpers'].values
    c3 = df2[rows3]['ncodpers'].values
    # array_equal also copes with arrays of different lengths
    assert np.array_equal(c1, c2) and np.array_equal(c2, c3), "Problem with common clients"

    X = df1[rows1][['ncodpers', 'fecha_dato'] + target_features + features + TARGET_LABELS_FRQ.tolist()]

    # Targets exist only if df2 carries the label columns and they are not all null at that month
    if TARGET_LABELS[0] in df2.columns and TARGET_LABELS_DIFF[0] in df2.columns and not df2[next_year_month_mask][TARGET_LABELS].isnull().all().all():
        Y = df2[rows2][['ncodpers', 'fecha_dato', 'targets_str', 'lc_targets_str', 'targets_diff'] + TARGET_LABELS + TARGET_LABELS_DIFF.tolist()]
        assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "There is a problem in alignment"
        # Share X's index so that X and Y can be filtered with the same masks
        Y.index = X.index
    else:
        Y = None

    if TARGET_LABELS_FRQ[0] in df2.columns and not df2[next_year_prev_month_mask][TARGET_LABELS].isnull().all().all():
        # Add TARGET_LABELS_FRQ from previous month to X:
        target_labels_frq = df2[rows3][['ncodpers'] + TARGET_LABELS_FRQ.tolist()]
        assert (X['ncodpers'].values == target_labels_frq['ncodpers'].values).all(), "There is a problem in alignment"
        target_labels_frq = target_labels_frq[TARGET_LABELS_FRQ]
        target_labels_frq.columns = [c + '_prev' for c in TARGET_LABELS_FRQ]
        target_labels_frq.index = X.index
        X = pd.concat([X, target_labels_frq], axis=1)

    if LC_TARGET_LABELS[0] in df2.columns:
        clients_last_choice = df2[rows2][['ncodpers', 'fecha_dato', 'targets_str'] + LC_TARGET_LABELS.tolist()]
    else:
        clients_last_choice = None

    return X, Y, clients_last_choice


In [141]:
# Features are taken at `current_month`, targets at the same month one year later.
current_month = 201505
next_year_month = current_month + 100

# NOTE(review): `val_df` is never assigned in this notebook (its loading is commented
# out), so these fallbacks raise NameError if a month is absent from `train_months`.
df1 = train_df if months_ym_map[current_month] in train_months else val_df
#df1 = train_df
df2 = train_df if months_ym_map[next_year_month] in train_months else val_df
#df2 = train_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [142]:
assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "WTF"
assert (X['ncodpers'].values == clients_last_choice['ncodpers'].values).all(), "WTF"

In [143]:
print X.shape
X.head(10)

(149981, 75)


Unnamed: 0,ncodpers,fecha_dato,targets_diff,targets_logdiff,targets_logcount2_diff,targets_logcount2,targets_logcount1,targets_logDec,ind_empleado,pais_residencia,...,ind_hip_fin_ult1_frq_prev,ind_plan_fin_ult1_frq_prev,ind_pres_fin_ult1_frq_prev,ind_reca_fin_ult1_frq_prev,ind_tjcr_fin_ult1_frq_prev,ind_valo_fin_ult1_frq_prev,ind_viv_fin_ult1_frq_prev,ind_nomina_ult1_frq_prev,ind_nom_pens_ult1_frq_prev,ind_recibo_ult1_frq_prev
1051663,15893,2015-05-28,0.0,0.0,0.0,0.001256,0.001273,2.833213,0,0,...,0.992401,0.988205,0.996991,0.932021,0.943835,0.033589,0.995103,0.931998,0.926746,0.839723
1051665,15895,2015-05-28,0.0,0.0,0.0,4e-06,1.3e-05,14.781716,2,0,...,0.992401,0.011795,0.996991,0.067979,0.056165,0.033589,0.995103,0.931998,0.926746,0.160277
1051667,15899,2015-05-28,0.0,0.0,0.0,7e-06,7e-06,14.602025,4,0,...,0.992401,0.011795,0.996991,0.932021,0.943835,0.033589,0.995103,0.931998,0.926746,0.160277
1051668,15900,2015-05-28,0.0,0.0,0.0,0.000239,0.000293,13.287691,4,0,...,0.992401,0.988205,0.996991,0.067979,0.943835,0.966411,0.995103,0.931998,0.926746,0.160277
1051673,15907,2015-05-28,0.0,0.0,0.0,1.1e-05,7e-06,14.587507,1,0,...,0.992401,0.011795,0.996991,0.067979,0.056165,0.033589,0.995103,0.931998,0.926746,0.160277
1051660,15917,2015-05-28,0.0,0.0,0.0,0.00492,0.004841,14.586863,1,0,...,0.992401,0.988205,0.996991,0.932021,0.943835,0.966411,0.995103,0.931998,0.926746,0.160277
1051651,15927,2015-05-28,0.0,0.0,0.0,2e-06,7e-06,10.424244,4,0,...,0.992401,0.011795,0.996991,0.932021,0.943835,0.966411,0.995103,0.931998,0.926746,0.160277
1051654,15930,2015-05-28,0.0,0.0,0.0,3.4e-05,3.3e-05,14.556709,0,0,...,0.992401,0.011795,0.996991,0.932021,0.943835,0.033589,0.995103,0.931998,0.926746,0.839723
1051704,15944,2015-05-28,0.0,0.0,0.0,1e-05,1.3e-05,14.602877,4,0,...,0.992401,0.988205,0.996991,0.067979,0.056165,0.966411,0.995103,0.931998,0.926746,0.160277
1051715,15959,2015-05-28,0.0,0.0,0.0,7e-06,7e-06,13.342841,4,0,...,0.007599,0.988205,0.003009,0.067979,0.056165,0.966411,0.995103,0.068002,0.073254,0.160277


In [146]:
print Y.shape
Y[Y['targets_diff'] > 0][['fecha_dato', 'ncodpers', 'targets_str', 'lc_targets_str'] + TARGET_LABELS_DIFF.tolist() ].head(10)

(149981, 53)


Unnamed: 0,fecha_dato,ncodpers,targets_str,lc_targets_str,ind_ahor_fin_ult1_diff,ind_aval_fin_ult1_diff,ind_cco_fin_ult1_diff,ind_cder_fin_ult1_diff,ind_cno_fin_ult1_diff,ind_ctju_fin_ult1_diff,...,ind_hip_fin_ult1_diff,ind_plan_fin_ult1_diff,ind_pres_fin_ult1_diff,ind_reca_fin_ult1_diff,ind_tjcr_fin_ult1_diff,ind_valo_fin_ult1_diff,ind_viv_fin_ult1_diff,ind_nomina_ult1_diff,ind_nom_pens_ult1_diff,ind_recibo_ult1_diff
1051585,2016-05-28,15993,000010000001100001100001,000010000001100001000001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1051627,2016-05-28,16056,001010001000000000000110,001010001000000000000001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1051739,2016-05-28,16294,000010000000100000000111,000010000000000000000111,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051354,2016-05-28,16576,000010010001100000110000,000000010001100000110000,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051334,2016-05-28,16731,001000000001100000000001,001000000001100000000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1051325,2016-05-28,16857,001000011000000001010000,000000011000000001010001,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051523,2016-05-28,16988,000010001000000001100011,000010001000000001000011,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1051438,2016-05-28,17151,000010010000001001110111,000010010000001001010111,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1051478,2016-05-28,17236,000010000000100101110111,000010000000100101110011,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1052264,2016-05-28,17364,000010000000100000100111,000010000000100000000111,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [94]:
print clients_last_choice.shape
clients_last_choice.head(10)

(149981, 26)


Unnamed: 0,ncodpers,fecha_dato,lc_ind_ahor_fin_ult1,lc_ind_aval_fin_ult1,lc_ind_cco_fin_ult1,lc_ind_cder_fin_ult1,lc_ind_cno_fin_ult1,lc_ind_ctju_fin_ult1,lc_ind_ctma_fin_ult1,lc_ind_ctop_fin_ult1,...,lc_ind_hip_fin_ult1,lc_ind_plan_fin_ult1,lc_ind_pres_fin_ult1,lc_ind_reca_fin_ult1,lc_ind_tjcr_fin_ult1,lc_ind_valo_fin_ult1,lc_ind_viv_fin_ult1,lc_ind_nomina_ult1,lc_ind_nom_pens_ult1,lc_ind_recibo_ult1
4338255,15893,2016-05-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4338253,15895,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
4338249,15899,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4338248,15900,2016-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4338243,15907,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
4338235,15917,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4338282,15927,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4338279,15930,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4338266,15944,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
4338197,15959,2016-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0


## Another train/predict + CV implementation

### Input

- `X` : `[nb_samples, nb_features]` shaped pd.DataFrame
    - `features_masks_dict` : `{fm1_name: features_mask_1, fm2_name: features_mask_2, ...}` where each `features_mask_i` is a list of feature column names. Masks may overlap.
    
- `Y` : `[nb_samples, nb_labels]` shaped pd.DataFrame
    - `labels_masks_dict` : `{lm1_name: labels_mask_1, lm2_name: labels_mask_2, ...}` where each `labels_mask_i` is a list of label column names. Masks may overlap.

- `samples_masks_list` : `[samples_mask_1, samples_mask_2, ...]` where each `samples_mask_i` is a function of `(X, Y)` that returns a boolean row mask. Used only for training.


- Set of models `models` : list of functions to create a model, e.g. `[create_RF, create_NN, create_GBT]`


### Training phase




In [147]:
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

In [236]:
# Row filters for the training data: each entry takes (x, y) and returns a boolean row mask.
# Active filter: keep rows where 'targets_diff' is positive in x or in y.
samples_masks_list = [
#    lambda x:  ~(x['targets_diff'].isin([0])), 
#     lambda x, y:  x['targets_diff'] > 0, 
#     lambda x, y:  x['targets_diff'] < 0, 
    lambda x, y:  (x['targets_diff'] > 0) | (y['targets_diff'] > 0), 
#     lambda x, y:  y['targets_diff'] < 0, 
]

# Names of the previous-month frequency columns (the '_prev' columns that get_XY adds to X)
TARGET_LABELS_FRQ_PREV = [c + '_prev' for c in TARGET_LABELS_FRQ]

# Named feature subsets: mask name -> list of X column names
features_masks_dict = {
#     'fm_all': None,
    'fm0': features + target_features + TARGET_LABELS_FRQ.tolist() + TARGET_LABELS_FRQ_PREV,
    'fm1': ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
    'fm2': target_features,
    'fm3': ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
#     'fm4': ['pais_residencia', 'sexo', 'age', 'renta', 'targets_logdiff', 'targets_logcount2_diff','targets_logcount2','targets_logcount1'],
    'fm5': ['nomprov', 'ind_nuevo', 'renta', 'ind_actividad_cliente', 'canal_entrada'],
#     'fm6': TARGET_LABELS_FRQ,
}

In [237]:
# https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

def create_RF(input_shape, output_shape):
    """
    Create an unfitted random forest classifier.

    Settings: 100 trees, `max_features=1.0`, bootstrap sampling with out-of-bag
    scoring enabled, `n_jobs=-1`.
    `input_shape` and `output_shape` are accepted but not used.

    :return: a new RandomForestClassifier instance
    """
    # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
    # Options tried and left disabled: min_samples_split=100, min_samples_leaf=25, max_depth=10
    forest_params = dict(
        n_estimators=100,
        max_features=1.0,
        oob_score=True,
        bootstrap=True,
        n_jobs=-1,
    )
    return RandomForestClassifier(**forest_params)

def create_ET(input_shape, output_shape):
    """
    Create an unfitted extra-trees classifier.

    Settings: 100 trees, `max_features=1.0`, bootstrap sampling with out-of-bag
    scoring enabled, `n_jobs=-1`.
    `input_shape` and `output_shape` are accepted but not used.

    :return: a new ExtraTreesClassifier instance
    """
    # Options tried and left disabled: min_samples_leaf=25, max_depth=10
    trees_params = dict(
        n_estimators=100,
        max_features=1.0,
        oob_score=True,
        bootstrap=True,
        n_jobs=-1,
    )
    return ExtraTreesClassifier(**trees_params)

def create_GB(input_shape, output_shape):
    """
    Create an unfitted gradient boosting classifier with 75 boosting stages.

    `input_shape` and `output_shape` are accepted but not used.

    :return: a new GradientBoostingClassifier instance
    """
    booster = GradientBoostingClassifier(n_estimators=75)
    return booster

models_dict = {
    'rf': create_RF,
    'et': create_ET,
    'gb': create_GB,
}

In [238]:
NP_TARGET_LABELS = np.array(TARGET_LABELS)
# Labels the models are trained on: the '_diff' label columns
target_labels = TARGET_LABELS_DIFF

# Groups of label indices (positions in `target_labels`); each group becomes one labels mask.
# Groups may share indices.
# NOTE(review): [2, 18, 23] is listed twice (it becomes both lm_2 and lm_6) - confirm this is intended.
common_groups = [
#     [2, ],
    [2, 3, 4, 5],
    [2, 6, 7, 8, 12],
    [2, 18, 23], 
    [21, 22],
    [2, 12, 18],
    [2, 12, 23],
    [2, 18, 23],
    [18, 23, 21, 22],
    [21, 23, 22, 4],
#     [18, ],
#     [12, ],
#     [21, ],
#     [22, ],
#     [23, ],
#     [3, 4], 
    [22, 7, 8],
#     [17, ],
#     [i] for i in range(24)
]



def flatten(array):
    """
    Concatenate the items of `array` into a single flat list.

    :param array: iterable whose items are themselves iterables (e.g. a list of lists)
    :return: a new list with the elements of every item, in order
    """
    return [element for group in array for element in group]

others = list(set(range(24)) - set(flatten(common_groups)))


# for i, a in enumerate(zip(TARGET_LABELS2, TARGET_LABELS)):
#     print i, a
    
s = set({})
labels_masks_dict = {}
for i, g in enumerate(common_groups):
    print 'lm_%i' % i, " <=> ", g
    labels_masks_dict['lm_%i' % i] = target_labels[g]
    s |= set(g)
labels_masks_dict['lm_others'] = target_labels[others]
s |= set(others)

assert len(s) == len(target_labels), "Sum is not equal 24, s=%i" % s
print labels_masks_dict

lm_0  <=>  [2, 3, 4, 5]
lm_1  <=>  [2, 6, 7, 8, 12]
lm_2  <=>  [2, 18, 23]
lm_3  <=>  [21, 22]
lm_4  <=>  [2, 12, 18]
lm_5  <=>  [2, 12, 23]
lm_6  <=>  [2, 18, 23]
lm_7  <=>  [18, 23, 21, 22]
lm_8  <=>  [21, 23, 22, 4]
lm_9  <=>  [22, 7, 8]
{'lm_8': array(['ind_nomina_ult1_diff', 'ind_recibo_ult1_diff',
       'ind_nom_pens_ult1_diff', 'ind_cno_fin_ult1_diff'], 
      dtype='|S22'), 'lm_9': array(['ind_nom_pens_ult1_diff', 'ind_ctop_fin_ult1_diff',
       'ind_ctpp_fin_ult1_diff'], 
      dtype='|S22'), 'lm_others': array(['ind_ahor_fin_ult1_diff', 'ind_aval_fin_ult1_diff',
       'ind_deco_fin_ult1_diff', 'ind_deme_fin_ult1_diff',
       'ind_dela_fin_ult1_diff', 'ind_fond_fin_ult1_diff',
       'ind_hip_fin_ult1_diff', 'ind_plan_fin_ult1_diff',
       'ind_pres_fin_ult1_diff', 'ind_reca_fin_ult1_diff',
       'ind_valo_fin_ult1_diff', 'ind_viv_fin_ult1_diff'], 
      dtype='|S22'), 'lm_0': array(['ind_cco_fin_ult1_diff', 'ind_cder_fin_ult1_diff',
       'ind_cno_fin_ult1_diff', 'ind_

In [239]:
# {model_name: [(samples_mask_code, features_mask_name, labels_mask_name), ...]}
# Single-label masks are assigned to 'gb'; multi-label masks to both 'rf' and 'et'.
# With the current groups every mask holds more than one label, so the 'gb' list
# comes out empty (see the output below).
models_pipelines = {
    'gb' : [('all', None, key) for key in labels_masks_dict if len(labels_masks_dict[key]) == 1],
    'rf' : [(None, None, key) for key in labels_masks_dict if len(labels_masks_dict[key]) > 1],
    'et' : [(None, None, key) for key in labels_masks_dict if len(labels_masks_dict[key]) > 1],
}
models_pipelines

{'et': [(None, None, 'lm_8'),
  (None, None, 'lm_9'),
  (None, None, 'lm_others'),
  (None, None, 'lm_0'),
  (None, None, 'lm_1'),
  (None, None, 'lm_2'),
  (None, None, 'lm_3'),
  (None, None, 'lm_4'),
  (None, None, 'lm_5'),
  (None, None, 'lm_6'),
  (None, None, 'lm_7')],
 'gb': [],
 'rf': [(None, None, 'lm_8'),
  (None, None, 'lm_9'),
  (None, None, 'lm_others'),
  (None, None, 'lm_0'),
  (None, None, 'lm_1'),
  (None, None, 'lm_2'),
  (None, None, 'lm_3'),
  (None, None, 'lm_4'),
  (None, None, 'lm_5'),
  (None, None, 'lm_6'),
  (None, None, 'lm_7')]}

In [240]:
from trainval import train_all, predict_all, probas_to_indices, score_estimators
from utils import map7_score0

In [241]:
# Sequential hold-out split (no shuffling): the first `ll` rows are used to fit the
# models (X1, Y1, clc), the rows after them to score and predict (X2, Y2, clc2).
ll = 110000

# Rows are selected by index label, not by position
# (assumes the index labels of X are unique - TODO confirm)
mask = X.index.isin(X.index[:ll])

X1 = X[mask]
Y1 = Y[mask]
clc = clients_last_choice[mask]
print X1.shape, Y1.shape, clc.shape

# The slice end ll + ll//2 lies beyond the number of rows of X (see the shapes
# printed below), so this selects all remaining rows
mask = X.index.isin(X.index[ll:ll+ll//2])
X2 = X[mask]
Y2 = Y[mask]
clc2 = clients_last_choice[mask]
print X2.shape, Y2.shape, clc2.shape

(110000, 75) (110000, 53) (110000, 27)
(39981, 75) (39981, 53) (39981, 27)


In [256]:
_kwargs = {'samples_masks_list': samples_masks_list, 
            'features_masks_dict': features_masks_dict, 
            'labels_masks_dict': labels_masks_dict, 
            'models_dict': models_dict,
            'labels': target_labels,
            'transform_proba_func': probas_to_indices,
            'threshold': 0.1,
            'n_highest': 7,
            'mode': 'sum',
            'verbose': False,
            'models_pipelines': models_pipelines,
            'return_probas': True
          }

In [243]:
estimators = train_all(X1, Y1, **_kwargs)

#print estimators

INFO:root:-- Train all --
INFO:root:-- Process : sample_mask=6125/110000, features_mask=fm5, labels_mask=lm_8
INFO:root:--- Score : model='et', fit accuracy : 0.720000
INFO:root:-- Process : sample_mask=6125/110000, features_mask=fm5, labels_mask=lm_8
INFO:root:--- Score : model='rf', fit accuracy : 0.721306
INFO:root:-- Process : sample_mask=6125/110000, features_mask=fm5, labels_mask=lm_9
INFO:root:--- Score : model='et', fit accuracy : 0.898286
INFO:root:-- Process : sample_mask=6125/110000, features_mask=fm5, labels_mask=lm_9
INFO:root:--- Score : model='rf', fit accuracy : 0.898776
INFO:root:-- Process : sample_mask=6125/110000, features_mask=fm5, labels_mask=lm_others
INFO:root:--- Score : model='et', fit accuracy : 0.988571
INFO:root:-- Process : sample_mask=6125/110000, features_mask=fm5, labels_mask=lm_others
INFO:root:--- Score : model='rf', fit accuracy : 0.988571
INFO:root:-- Process : sample_mask=6125/110000, features_mask=fm5, labels_mask=lm_0
INFO:root:--- Score : model=

In [244]:
# Group the third field of every estimator entry (presumably the fit accuracy
# logged by train_all - verify) under the key found at entry[0][2]
accuracies = defaultdict(list)
for estimator in estimators:
    accuracies[estimator[0][2]].append(estimator[2])

# Arithmetic mean of the grouped values, per key
mean_accuracy = dict(
    (key, sum(values) / len(values)) for key, values in accuracies.items()
)

mean_accuracy

{'et': 0.84330092764378484, 'rf': 0.84334248608534312}

In [245]:
_ = score_estimators(estimators, X2, Y2, **_kwargs)

INFO:root:-- Score : model=et, features_mask=fm5, labels_mask=lm_8 -> 0.937445286511
INFO:root:-- Score : model=rf, features_mask=fm5, labels_mask=lm_8 -> 0.852880118056
INFO:root:-- Score : model=et, features_mask=fm5, labels_mask=lm_9 -> 0.986518596333
INFO:root:-- Score : model=rf, features_mask=fm5, labels_mask=lm_9 -> 0.96190690578
INFO:root:-- Score : model=et, features_mask=fm5, labels_mask=lm_others -> 0.998549310923
INFO:root:-- Score : model=rf, features_mask=fm5, labels_mask=lm_others -> 0.997598859458
INFO:root:-- Score : model=et, features_mask=fm5, labels_mask=lm_0 -> 0.99262149521
INFO:root:-- Score : model=rf, features_mask=fm5, labels_mask=lm_0 -> 0.979590305395
INFO:root:-- Score : model=et, features_mask=fm5, labels_mask=lm_1 -> 0.99339686351
INFO:root:-- Score : model=rf, features_mask=fm5, labels_mask=lm_1 -> 0.986293489407
INFO:root:-- Score : model=et, features_mask=fm5, labels_mask=lm_2 -> 0.908781671294
INFO:root:-- Score : model=rf, features_mask=fm5, labels_m

In [257]:
y_preds, Y_probas = predict_all(estimators, X2, **_kwargs)
#print y_preds[:5]

INFO:root:-- Predict all --


In [251]:
Y_probas.head()

Unnamed: 0,ind_ahor_fin_ult1_diff,ind_aval_fin_ult1_diff,ind_cco_fin_ult1_diff,ind_cder_fin_ult1_diff,ind_cno_fin_ult1_diff,ind_ctju_fin_ult1_diff,ind_ctma_fin_ult1_diff,ind_ctop_fin_ult1_diff,ind_ctpp_fin_ult1_diff,ind_deco_fin_ult1_diff,...,ind_hip_fin_ult1_diff,ind_plan_fin_ult1_diff,ind_pres_fin_ult1_diff,ind_reca_fin_ult1_diff,ind_tjcr_fin_ult1_diff,ind_valo_fin_ult1_diff,ind_viv_fin_ult1_diff,ind_nomina_ult1_diff,ind_nom_pens_ult1_diff,ind_recibo_ult1_diff
672292,0.0,0.0,0.629822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.092221,0.0,0.0,0.0,0.0,0.277957
672312,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
672314,0.0,0.0,0.153567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.846433,0.0,0.0,0.0,0.0,0.0
672320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282018,0.365153,0.352829
672322,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [252]:
#y_preds[:5]

In [253]:
#from sklearn.metrics import roc_auc_score

In [258]:
print y_val[:100]
print y_preds[:100]

[[23] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [23] [] [] [] [] [] [19]
 [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
 [23] [] [] [] [] [] [] [] [] [] [18] [] [] [] [] [] [] [] [] [] [] [] []
 [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []
 [] [] [] [] []]
[[2, 23] [] [18, 2] [22, 23, 21] [] [] [23] [2] [21, 22] [22, 21] [23] [23]
 [] [23] [23] [23] [2] [] [] [23] [12] [18] [18] [] [22, 21, 18] [] [23] []
 [] [18] [21, 22] [22, 21, 18] [] [] [] [] [] [] [23] [18] [] [] [18] []
 [23] [18] [] [23] [7] [18] [] [] [23] [18, 23] [] [18] [] [23] [] [] []
 [23] [18] [23, 18] [21, 22, 23] [23] [18] [] [18] [] [21, 22, 18] [21, 22]
 [21, 22, 18] [] [21, 22, 18] [23] [] [] [21, 22] [18] [18] [18] [21, 22]
 [] [] [] [22, 21] [23] [21, 22] [23] [23] [] [23] [] [7, 18] [] [2, 18] []
 [] []]


In [260]:
# Ground-truth label indices of the hold-out rows
y_val = targets_str_to_indices(Y2[target_labels].values)

# Reference value: the ground truth scored against itself (logged as the "max" score)
logging.info("- Compute max map7 score")
#map7_score(y_val, y_val, clc2[LC_TARGET_LABELS].values)
map7_score0(y_val, y_val)
# Score of the model predictions on the same rows
logging.info("- Compute map7 score")
#map7_score(y_val, y_preds, clc2[LC_TARGET_LABELS].values)
map7_score0(y_val, y_preds)
#logging.info("- Compute AUC ROC : ")
#print roc_auc_score(y_val, y_preds)

INFO:root:- Compute max map7 score
INFO:root:-- Predicted map7 score: 0.0291638528301
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.00774048228464


0.007740482284640759

0.021295269099703414 (GB on 'all')

0.021271936353906683 (RF tuning)

0.021668245671284416 (RF tuning)

0.02136609107928888

0.0211362663776694

In [None]:
# print labels_masks_dict[estimators[0][0][1]]
# print estimators[0][1].classes_
# print estimators[0][1].n_classes_
# print estimators[0][1].n_features_
# print estimators[0][1].n_outputs_
# print estimators[0][1].estimators_

In [169]:
from utils import targets_to_labels, targets_indices_to_labels, remove_last_choice

In [218]:
# Inspect the hold-out rows where products were added but none of them was predicted.
# The first `limit - 1` such rows are printed; every missed product is counted by name.
limit = 25
count = 0

not_predicted_predicted = defaultdict(int)
for last_choice, targets, products, proba in zip(clc2[LC_TARGET_LABELS].values, y_val, y_preds, Y_probas.values):
    # Filter truth and prediction through the last-choice row
    # (presumably drops products the client already had - verify in utils.remove_last_choice)
    added_products = remove_last_choice(targets, last_choice)
    predictions = remove_last_choice(products, last_choice)
    
    # Nothing was added for this client: nothing to miss
    if len(added_products) == 0:
        continue
        
    # At least one added product was predicted: not a miss
    if len(set(added_products) & set(predictions)) > 0:
#         print "Predicted : ", added_products, predictions
#         print set(added_products) & set(predictions)
        continue

    count += 1
    if count < limit:
        print "--- Count = ", count
        print targets_indices_to_labels(added_products, TARGET_LABELS2), targets_indices_to_labels(targets, TARGET_LABELS2)
        print targets_indices_to_labels(predictions, TARGET_LABELS2), targets_indices_to_labels(products, TARGET_LABELS2)#, proba
    
    # Count each added product of a fully missed row
    for p in added_products:
        not_predicted_predicted[TARGET_LABELS2[p]] += 1
    

--- Count =  1
['Direct Debit'] ['Direct Debit']
[] []
--- Count =  2
['Securities'] ['Securities']
[] []
--- Count =  3
['Credit Card'] ['Credit Card']
['Direct Debit'] ['Direct Debit']
--- Count =  4
['Payroll', 'Pensions'] ['Payroll', 'Pensions']
[] []
--- Count =  5
['Payroll', 'Pensions'] ['Payroll', 'Pensions']
[] []
--- Count =  6
['Credit Card'] ['Credit Card']
[] ['Direct Debit']
--- Count =  7
['Taxes'] ['Taxes']
[] []
--- Count =  8
['e-account'] ['e-account']
['Direct Debit'] ['Direct Debit']
--- Count =  9
['e-account'] ['e-account']
['Direct Debit'] ['Direct Debit']
--- Count =  10
['Direct Debit'] ['Direct Debit']
[] []
--- Count =  11
['e-account'] ['e-account']
['Credit Card'] ['Credit Card']
--- Count =  12
['Payroll', 'Pensions'] ['Payroll', 'Pensions']
[] []
--- Count =  13
['Payroll', 'Pensions'] ['Payroll', 'Pensions']
[] []
--- Count =  14
['Payroll Account'] ['Payroll Account']
[] []
--- Count =  15
['e-account'] ['e-account']
['Pensions'] ['Pensions']
--- Count

In [219]:
print not_predicted_predicted, y_val.shape[0]

defaultdict(<type 'int'>, {'Long-term deposits': 1, 'Direct Debit': 69, 'e-account': 75, 'Payroll': 260, 'Pensions': 260, 'Taxes': 21, 'Payroll Account': 120, 'Securities': 3, 'Credit Card': 35, 'Current Accounts': 74, 'Junior Account': 2}) 39981


In [None]:
#print y_probas[:10, target_groups[0]]
#print Y[np.array(TARGET_LABELS)[target_groups[0]]].head(10)

### Run KFold Cross-validation 

In [None]:
from trainval import cross_val_score

In [None]:
nb_folds = 5
results = cross_val_score((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                            nb_folds=nb_folds, **_kwargs)

print "Cross-Validation \n %i | %f | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), np.median(results), results.max(), results.std())



### 201505 -> 201605 

Cross-Validation 
 5 | 0.014585 | 0.018385 | 0.019147 | 0.022227 | 0.00294 

Compute cross-validation across several months

In [None]:
nb_folds = 3
yms = [201504, 201505]
#yms = [201505]

for ym in yms:
    logging.info("\n-------------------------")
    logging.info("- Process month : %s" % ym)
    logging.info("-------------------------\n")
    
    ym1 = ym + 100    
    df1 = train_df if months_ym_map[ym] in train_months else val_df
    df2 = train_df if months_ym_map[ym1] in train_months else val_df
    X, Y, clients_last_choice = get_XY(ym, df1, ym1, df2) 
    results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                                profiles=profiles,
                                nb_folds=nb_folds)
    print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

## Train model for predictions

In [None]:
current_month = 201505
next_year_month = current_month + 100

df1 = train_df
#df1 = val_df
df2 = train_df #if months_ym_map[next_year_month] in train_months else val_df
#df2 = val_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [None]:
estimators = train_all(X, Y, **_kwargs)

In [None]:
y_preds, Y_probas = predict_all(estimators, X, **_kwargs)

Check score on the data 2016/05

In [None]:
logging.info("- Compute map7 score")
print map7_score(y_val, y_preds, clients_last_choice[LC_TARGET_LABELS].values)
logging.info("- Compute max map7 score")
print map7_score(y_val, y_val, clients_last_choice[LC_TARGET_LABELS].values)

## Prediction for 2016/06

In [None]:
from dataset import load_train_test

In [None]:
full_train_df, test_df = load_train_test([201506])

In [None]:
full_train_df.head()

In [None]:
test_df.head()

In [None]:
months_ym_map = {}
months = list(set(full_train_df['fecha_dato'].unique()) | set(test_df['fecha_dato'].unique()))
for m in months:
    months_ym_map[to_yearmonth(m)] = m
    
full_train_months = full_train_df['fecha_dato'].unique()
test_months = test_df['fecha_dato'].unique()

In [None]:
current_month = 201506
next_year_month = current_month + 100

df1 = full_train_df
df2 = test_df
X, _, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [None]:
print X.shape, test_df.shape

In [None]:
X.head(10)

In [None]:
clients_last_choice.head(10)

In [None]:
def get_submission(predicted_added_products, clients, clc, target_labels):
    """
    Assemble the submission table from per-client predictions.

    Each prediction is passed through `remove_last_choice` together with the
    matching row of `clc`; the remaining indices are converted to names with
    `target_labels` and joined with single spaces.

    :param predicted_added_products: per-client sequences of predicted label indices
    :param clients: client ids, in the same order as the predictions
    :param clc: per-client last-choice rows, in the same order as the predictions
    :param target_labels: sequence mapping a label index to its name
    :return: DataFrame with the columns 'ncodpers' and 'added_products'
    """
    product_strings = []
    for done, (products, last_choice) in enumerate(zip(predicted_added_products, clc), 1):
        kept = remove_last_choice(products, last_choice)
        product_strings.append(' '.join(target_labels[i] for i in kept))
        # progress report every 100000 clients
        if done % 100000 == 0:
            logging.info("Elapsed : %i", done)

    return pd.DataFrame(data={'ncodpers': clients, 'added_products': product_strings},
                        columns=['ncodpers', 'added_products'])

In [None]:
y_preds, Y_probas = predict_all(estimators, X, **_kwargs)

logging.info("- Get submission dataframe:")
clients = X['ncodpers'].values
# - the predictions are bound to `y_preds` (`y_pred` is not defined anywhere)
# - `clients_last_choice` as built by get_XY holds the LC_TARGET_LABELS columns,
#   not the TARGET_LABELS ones
# - TARGET_LABELS is still what maps a label index to a product name
submission = get_submission(y_preds, clients, clients_last_choice[LC_TARGET_LABELS].values, TARGET_LABELS)

In [None]:
# Test clients for which no prediction row was produced still need a submission row
submission_clients = set(submission['ncodpers'].unique())
test_clients = set(test_df['ncodpers'].unique())
if submission_clients != test_clients:
    missing_clients = list(test_clients - submission_clients)

    # No prediction available: submit an empty product list. 'added_products' is a
    # space-separated string of product names; np.zeros() put the float 0.0 there,
    # which the CSV export wrote out as "0.0".
    missing_added_products = [''] * len(missing_clients)
    submission = pd.concat([submission,
                            pd.DataFrame(data={
                                'ncodpers': missing_clients,
                                'added_products': missing_added_products
                            }, columns=['ncodpers', 'added_products'])])

Get submission DataFrame and write csv file

In [None]:
print submission.shape
submission.head()

In [None]:
from datetime import datetime
import csv

logging.info('- Generate submission')
submission_file = '../results/submission_' + \
                  str(datetime.now().strftime("%Y-%m-%d-%H-%M")) + \
                  '.csv'

submission.to_csv(submission_file, index=False, index_label=False)

In [None]:
with open('../results/submission_2016-11-17-16-37.csv', 'r') as r:
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    