# Decision trees tryouts on SPR data, inspired by Kaggle Forum "When less is more"

Load training and validation data as 
    month : [ Features | Targets| Difference | Last Choice Targets  ]
    

In [52]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.INFO)

import matplotlib.pylab as plt
%matplotlib inline



In [2]:
import sys
sys.path.append("../common")

from dataset import load_trainval, LC_TARGET_LABELS, TARGET_LABELS_FRQ, TARGET_LABELS_DIFF
from utils import to_yearmonth, TARGET_LABELS, TARGET_LABELS2
from utils import target_str_to_labels, decimal_to_dummies, targets_str_to_indices, targets_dec_to_indices

In [3]:
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

In [4]:
# Months used to build the training set and the number of clients to sample.
# Per this cell's log output, the month preceding each requested month is loaded as well.
# train_yearmonths_list = [201504, 201505, 201604]
train_yearmonths_list = [201505, 201602, 201605]
# train_yearmonths_list = [201505]
#val_yearmonth = [201605]
train_nb_clients = 150000
# train_nb_clients = 1500
# NOTE: the train/validation variant below is disabled, so `val_df` is never created in this notebook.
#train_df, val_df = load_trainval(train_yearmonths_list, val_yearmonth, train_nb_clients, val_nb_clients=1500)
train_df = load_trainval(train_yearmonths_list, train_nb_clients=train_nb_clients)

INFO:root:- Load training data : 
INFO:root:- Load data : [201504, 201505, 201601, 201602, 201604, 201605]
INFO:root:-- Select 150000 clients
INFO:root:- Number of lines with unknown data : 30
INFO:root:- Number of columns with nan : 9
INFO:root:-- Process date : 201505
INFO:root:-- Process date : 201602
INFO:root:-- Process date : 201605
INFO:root:-- Add logCount columns
INFO:root:-- Process month : 2015-04-28
INFO:root:-- Process month : 2015-05-28
INFO:root:-- Process month : 2016-01-28
INFO:root:-- Process month : 2016-02-28
INFO:root:-- Process month : 2016-04-28
INFO:root:-- Process month : 2016-05-28
INFO:root:-- Add logDecimal columns
INFO:root:-- Transform age/renta/logdiff
INFO:root:-- Add target values frequencies
INFO:root:-- Add target diff


Display loaded data

In [5]:
train_df[['fecha_dato', 'ncodpers'] + TARGET_LABELS_FRQ.tolist()].head(10)

Unnamed: 0,fecha_dato,ncodpers,ind_ahor_fin_ult1_frq,ind_aval_fin_ult1_frq,ind_cco_fin_ult1_frq,ind_cder_fin_ult1_frq,ind_cno_fin_ult1_frq,ind_ctju_fin_ult1_frq,ind_ctma_fin_ult1_frq,ind_ctop_fin_ult1_frq,...,ind_hip_fin_ult1_frq,ind_plan_fin_ult1_frq,ind_pres_fin_ult1_frq,ind_reca_fin_ult1_frq,ind_tjcr_fin_ult1_frq,ind_valo_fin_ult1_frq,ind_viv_fin_ult1_frq,ind_nomina_ult1_frq,ind_nom_pens_ult1_frq,ind_recibo_ult1_frq
210122,2015-04-28,15892,0.99982,0.999954,0.228481,0.999542,0.102785,0.988507,0.989993,0.835684,...,0.992253,0.987635,0.996609,0.06836,0.055803,0.033134,0.995131,0.933164,0.927901,0.159955
1051662,2015-05-28,15892,0.99982,0.999954,0.228481,0.999542,0.102785,0.988507,0.989993,0.835684,...,0.992253,0.987635,0.996609,0.06836,0.055803,0.033134,0.995131,0.933164,0.927901,0.159955
1638559,2016-01-28,15892,0.99982,0.999954,0.771519,0.999542,0.897215,0.988507,0.989993,0.835684,...,0.992253,0.987635,0.996609,0.06836,0.055803,0.033134,0.995131,0.933164,0.927901,0.159955
2663081,2016-02-28,15892,0.99982,0.999954,0.771519,0.999542,0.897215,0.988507,0.989993,0.835684,...,0.992253,0.987635,0.996609,0.06836,0.055803,0.033134,0.995131,0.933164,0.927901,0.159955
3532930,2016-04-28,15892,0.99982,0.999954,0.771519,0.999542,0.897215,0.988507,0.989993,0.835684,...,0.992253,0.987635,0.996609,0.06836,0.055803,0.033134,0.995131,0.933164,0.927901,0.159955
4338256,2016-05-28,15892,0.99982,0.999954,0.771519,0.999542,0.897215,0.988507,0.989993,0.835684,...,0.992253,0.987635,0.996609,0.06836,0.055803,0.033134,0.995131,0.933164,0.927901,0.159955
210113,2015-04-28,15903,0.99982,0.999954,0.771519,0.999542,0.897215,0.988507,0.989993,0.164316,...,0.992253,0.987635,0.996609,0.93164,0.055803,0.966866,0.995131,0.933164,0.927901,0.840045
1051671,2015-05-28,15903,0.99982,0.999954,0.771519,0.999542,0.897215,0.988507,0.989993,0.164316,...,0.992253,0.987635,0.996609,0.93164,0.055803,0.966866,0.995131,0.933164,0.927901,0.840045
1638492,2016-01-28,15903,0.99982,0.999954,0.771519,0.999542,0.897215,0.988507,0.989993,0.164316,...,0.992253,0.987635,0.996609,0.93164,0.055803,0.966866,0.995131,0.933164,0.927901,0.840045
2663070,2016-02-28,15903,0.99982,0.999954,0.771519,0.999542,0.897215,0.988507,0.989993,0.164316,...,0.992253,0.987635,0.996609,0.93164,0.055803,0.966866,0.995131,0.933164,0.927901,0.840045


Useful structures

In [6]:
def get_common_clients(df1, mask1, mask2, df2=None):
    """
    Flag the rows whose client id ('ncodpers') appears in both row selections.

    :param df1: DataFrame with an 'ncodpers' column
    :param mask1: boolean row mask applied to `df1`
    :param mask2: boolean row mask applied to `df2` when it is given, otherwise to `df1`
    :param df2: optional second DataFrame with an 'ncodpers' column
    :return: a boolean Series over `df1` when `df2` is None,
        otherwise a pair of boolean Series over (`df1`, `df2`)
    """
    second_frame = df1 if df2 is None else df2
    first_ids = set(df1[mask1]['ncodpers'].unique())
    second_ids = set(second_frame[mask2]['ncodpers'].unique())
    shared_ids = list(first_ids & second_ids)

    in_first = df1['ncodpers'].isin(shared_ids)
    if df2 is None:
        return in_first
    return in_first, df2['ncodpers'].isin(shared_ids)

In [7]:
# Distinct snapshot dates found in the training frame.
# (With a separate validation frame, use the union of both frames' dates instead:
#  months = list(set(train_df['fecha_dato'].unique()) | set(val_df['fecha_dato'].unique())) )
months = train_df['fecha_dato'].unique()
train_months = train_df['fecha_dato'].unique()

# Lookup table: yearmonth key returned by `to_yearmonth` -> raw 'fecha_dato' value
months_ym_map = dict((to_yearmonth(month), month) for month in months)
# val_months = val_df['fecha_dato'].unique()
    

### Train a model

In [8]:
from utils import get_added_products, remove_last_choice, apk, map7_score
from visualization import visualize_train_test, visualize_folds, compare_two_datasets, compare_folds, compare_folds2

In [9]:
target_features = ['targets_diff', 'targets_logdiff', 'targets_logcount2_diff', 'targets_logcount2', 'targets_logcount1', 'targets_logDec']

In [10]:
def _previous_yearmonth(yearmonth):
    """Return the YYYYMM integer of the month before `yearmonth`, e.g. 201601 -> 201512."""
    year, month = divmod(yearmonth, 100)
    if month == 1:
        # January: roll back to December of the previous year instead of the invalid YYYY00
        return (year - 1) * 100 + 12
    return yearmonth - 1


def get_XY(current_month, df1, next_year_month, df2, months_ym_map):
    """
    Build row-aligned features / targets / last-choice frames for one pair of months.

    Only clients found in `df1` at `current_month` AND in `df2` at both `next_year_month`
    and the month before it are kept. The row-by-row alignment relies on each frame listing
    these clients in the same order within a month; this is asserted, not enforced.

    :param current_month: yearmonth key (e.g. 201505) of the `df1` rows used as features
    :param df1: DataFrame providing the feature rows
    :param next_year_month: yearmonth key of the `df2` rows used as targets
    :param df2: DataFrame providing targets, last-choice columns and previous-month frequencies
    :param months_ym_map: dict mapping a yearmonth key to the matching 'fecha_dato' value
    :return: tuple (X, Y, clients_last_choice)
        - X: 'ncodpers', 'fecha_dato', `target_features`, `features` and TARGET_LABELS_FRQ
          columns of `df1`; when Y is built, the TARGET_LABELS_FRQ columns of `df2` at the
          previous month are appended with a '_prev' suffix
        - Y: 'ncodpers', 'fecha_dato', 'targets_str' and TARGET_LABELS columns of `df2`,
          re-indexed like X; None when `df2` lacks the target or frequency columns
        - clients_last_choice: 'ncodpers', 'fecha_dato' and LC_TARGET_LABELS columns of `df2`
          (keeps the index of `df2`); None when `df2` lacks the last-choice columns
    :raises KeyError: if a needed yearmonth is missing from `months_ym_map`
    :raises AssertionError: if the selected client ids are not identical row by row
    """
    month_mask = df1['fecha_dato'] == months_ym_map[current_month]
    next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
    next_year_prev_month_mask = df2['fecha_dato'] == months_ym_map[_previous_yearmonth(next_year_month)]

    # Clients common to df1 at this month and df2 at next year month ...
    common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)
    # ... further restricted to the clients also present in df2 at the previous month
    common_clients_mask2, common_clients_mask3 = get_common_clients(
        df2, common_clients_mask2 & next_year_month_mask, next_year_prev_month_mask, df2)
    # The second intersection may have dropped clients: narrow the df1 mask to the same final set,
    # otherwise X could hold more rows than Y
    final_clients = df2[common_clients_mask2 & next_year_month_mask]['ncodpers'].unique()
    common_clients_mask1 = df1['ncodpers'].isin(final_clients)

    rows1 = common_clients_mask1 & month_mask
    rows2 = common_clients_mask2 & next_year_month_mask
    rows3 = common_clients_mask3 & next_year_prev_month_mask

    c1 = df1[rows1]['ncodpers'].values
    c2 = df2[rows2]['ncodpers'].values
    c3 = df2[rows3]['ncodpers'].values
    # np.array_equal also copes with arrays of different lengths (e.g. duplicated client rows)
    assert np.array_equal(c1, c2) and np.array_equal(c2, c3), "Problem with common clients"

    X = df1[rows1][['ncodpers', 'fecha_dato'] + target_features + features + TARGET_LABELS_FRQ.tolist()]

    # NOTE(review): the '_prev' frequency columns are only appended when df2 also carries the
    # target columns; a df2 without targets yields an X without them - confirm this is intended.
    if TARGET_LABELS[0] in df2.columns and TARGET_LABELS_FRQ[0] in df2.columns:
        Y = df2[rows2][['ncodpers', 'fecha_dato', 'targets_str'] + TARGET_LABELS]
        assert np.array_equal(X['ncodpers'].values, Y['ncodpers'].values), "There is a problem in alignment"
        Y.index = X.index

        # Add TARGET_LABELS_FRQ of the previous month to X:
        target_labels_frq = df2[rows3][['ncodpers'] + TARGET_LABELS_FRQ.tolist()]
        assert np.array_equal(X['ncodpers'].values, target_labels_frq['ncodpers'].values), \
            "There is a problem in alignment"
        target_labels_frq = target_labels_frq[TARGET_LABELS_FRQ]
        target_labels_frq.columns = [c + '_prev' for c in TARGET_LABELS_FRQ]
        target_labels_frq.index = X.index
        X = pd.concat([X, target_labels_frq], axis=1)
    else:
        Y = None

    if LC_TARGET_LABELS[0] in df2.columns:
        clients_last_choice = df2[rows2][['ncodpers', 'fecha_dato'] + LC_TARGET_LABELS.tolist()]
    else:
        clients_last_choice = None

    return X, Y, clients_last_choice


In [None]:
# Features are taken at `current_month`, targets at the same month one year later
# (adding 100 to a YYYYMM integer moves one year forward).
current_month = 201505
next_year_month = current_month + 100

# NOTE: `val_df` is only evaluated when the month is absent from `train_months`;
# it is not defined in this notebook (its loading call is commented out above).
df1 = train_df if months_ym_map[current_month] in train_months else val_df
#df1 = train_df
df2 = train_df if months_ym_map[next_year_month] in train_months else val_df
#df2 = train_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [239]:
assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "WTF"
assert (X['ncodpers'].values == clients_last_choice['ncodpers'].values).all(), "WTF"

In [12]:
print X.shape
X.head(10)

(149985, 75)


Unnamed: 0,ncodpers,fecha_dato,targets_diff,targets_logdiff,targets_logcount2_diff,targets_logcount2,targets_logcount1,targets_logDec,ind_empleado,pais_residencia,...,ind_hip_fin_ult1_frq_prev,ind_plan_fin_ult1_frq_prev,ind_pres_fin_ult1_frq_prev,ind_reca_fin_ult1_frq_prev,ind_tjcr_fin_ult1_frq_prev,ind_valo_fin_ult1_frq_prev,ind_viv_fin_ult1_frq_prev,ind_nomina_ult1_frq_prev,ind_nom_pens_ult1_frq_prev,ind_recibo_ult1_frq_prev
1051662,15892,2015-05-28,0.0,0.0,0.0,9e-06,1.3e-05,13.181662,2,0,...,0.992253,0.987635,0.996609,0.06836,0.055803,0.033134,0.995131,0.933164,0.927901,0.159955
1051671,15903,2015-05-28,0.0,0.0,0.0,0.001516,0.001627,14.586878,2,0,...,0.992253,0.987635,0.996609,0.93164,0.055803,0.966866,0.995131,0.933164,0.927901,0.840045
1051672,15906,2015-05-28,-2.0,-1.098612,1e-06,2e-06,7e-06,13.349075,0,0,...,0.992253,0.987635,0.996609,0.93164,0.944197,0.966866,0.995131,0.933164,0.072099,0.159955
1051674,15908,2015-05-28,0.0,0.0,0.0,7e-06,7e-06,14.792546,2,0,...,0.992253,0.012365,0.996609,0.93164,0.055803,0.033134,0.995131,0.066836,0.072099,0.159955
1051675,15911,2015-05-28,0.0,0.0,0.0,2e-06,7e-06,13.196866,2,0,...,0.992253,0.987635,0.996609,0.93164,0.055803,0.033134,0.995131,0.933164,0.927901,0.159955
1051643,15919,2015-05-28,0.0,0.0,0.0,0.000108,0.00012,14.588785,3,0,...,0.992253,0.987635,0.996609,0.06836,0.944197,0.966866,0.995131,0.933164,0.927901,0.159955
1051645,15921,2015-05-28,0.0,0.0,0.0,2e-06,7e-06,13.296737,2,0,...,0.992253,0.012365,0.996609,0.93164,0.055803,0.033134,0.995131,0.933164,0.927901,0.159955
1051649,15925,2015-05-28,0.0,0.0,0.0,0.001516,0.001627,14.586878,2,0,...,0.992253,0.987635,0.996609,0.93164,0.055803,0.966866,0.995131,0.933164,0.927901,0.159955
1051653,15929,2015-05-28,1.0,0.693147,2e-05,2.8e-05,2.7e-05,14.57258,3,0,...,0.992253,0.987635,0.996609,0.93164,0.055803,0.033134,0.995131,0.933164,0.927901,0.840045
1051659,15935,2015-05-28,0.0,0.0,0.0,0.000182,0.000173,14.5897,3,0,...,0.992253,0.987635,0.996609,0.93164,0.944197,0.966866,0.995131,0.933164,0.927901,0.159955


In [13]:
print Y.shape
Y.head(10)

(149985, 27)


Unnamed: 0,ncodpers,fecha_dato,targets_str,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
1051662,15892,2016-05-28,001000000001100001110001,0,0,1,0,0,0,0,...,0,0,0,1,1,1,0,0.0,0.0,1
1051671,15903,2016-05-28,001000010000000000100000,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0.0,0.0,0
1051672,15906,2016-05-28,000010011001000000000011,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0.0,1.0,1
1051674,15908,2016-05-28,001010001000100100110111,0,0,1,0,1,0,0,...,0,1,0,0,1,1,0,1.0,1.0,1
1051675,15911,2016-05-28,000010000001100000110001,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0.0,0.0,1
1051643,15919,2016-05-28,001000000001000001000001,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0.0,0.0,1
1051645,15921,2016-05-28,001000010001010100110001,0,0,1,0,0,0,0,...,0,1,0,0,1,1,0,0.0,0.0,1
1051649,15925,2016-05-28,001000010000000000100000,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0.0,0.0,0
1051653,15929,2016-05-28,001000001000100000110001,0,0,1,0,0,0,0,...,0,0,0,0,1,1,0,0.0,0.0,1
1051659,15935,2016-05-28,001000010001100000000001,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,1


In [14]:
print clients_last_choice.shape
clients_last_choice.head(10)

(149985, 26)


Unnamed: 0,ncodpers,fecha_dato,lc_ind_ahor_fin_ult1,lc_ind_aval_fin_ult1,lc_ind_cco_fin_ult1,lc_ind_cder_fin_ult1,lc_ind_cno_fin_ult1,lc_ind_ctju_fin_ult1,lc_ind_ctma_fin_ult1,lc_ind_ctop_fin_ult1,...,lc_ind_hip_fin_ult1,lc_ind_plan_fin_ult1,lc_ind_pres_fin_ult1,lc_ind_reca_fin_ult1,lc_ind_tjcr_fin_ult1,lc_ind_valo_fin_ult1,lc_ind_viv_fin_ult1,lc_ind_nomina_ult1,lc_ind_nom_pens_ult1,lc_ind_recibo_ult1
4338256,15892,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
4338245,15903,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4338244,15906,2016-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4338242,15908,2016-05-28,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
4338240,15911,2016-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4338260,15919,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4338262,15921,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4338284,15925,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4338280,15929,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
4338275,15935,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Another train/predict + CV implementation

### Input

- `X` : `[nb_samples, nb_features]` shaped pd.DataFrame
    - `features_masks_dict` : `{fm1_name: features_mask_1, fm2_name: features_mask_2, ...}` where each `features_mask_i` is a list of feature column names. The masks may overlap.
    
- `Y` : `[nb_samples, nb_labels]` shaped pd.DataFrame
    - `labels_masks_dict` : `{lm1_name: labels_mask_1, lm2_name: labels_mask_2, ...}` where each `labels_mask_i` is a list of label column names. The masks may overlap.

- `samples_masks_list` : `[samples_mask_1, samples_mask_2, ...]` where each `samples_mask_i` is a function that takes the DataFrame and returns a boolean row mask. Used only for training.


- Set of models `models_dict` : dict mapping a model name to a function that creates the model, e.g. `{'rf': create_RF, 'et': create_ET, 'gb': create_GB}`


### Training phase




In [305]:
# Row filters applied for training; currently empty (the only candidate is commented out).
samples_masks_list = [
#    lambda x:  ~x['targets_diff'].isin([0]), 
]

# Names of the '_prev' frequency columns that `get_XY` appends to X
TARGET_LABELS_FRQ_PREV = [c + '_prev' for c in TARGET_LABELS_FRQ]

# Named groups of feature columns; only 'fm0' (every available feature) is enabled.
features_masks_dict = {
#     'fm_all': None,
    'fm0': features + target_features + TARGET_LABELS_FRQ.tolist() + TARGET_LABELS_FRQ_PREV,
#     'fm1': ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
#     'fm2': target_features,
#     'fm3': ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
#     'fm4': ['pais_residencia', 'sexo', 'age', 'renta', 'targets_logdiff', 'targets_logcount2_diff','targets_logcount2','targets_logcount1'],
#     'fm5': ['nomprov', 'ind_nuevo', 'renta', 'ind_actividad_cliente', 'canal_entrada'],
#     'fm6': TARGET_LABELS_FRQ,
}

In [306]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

# https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

def create_RF(input_shape, output_shape, n_estimators=10, max_depth=7, random_state=None):
    """
    Create an unfitted random forest classifier.

    :param input_shape: unused; kept so that all model factories share one call signature
    :param output_shape: unused; kept so that all model factories share one call signature
    :param n_estimators: number of trees (default 10, the value previously hard-coded)
    :param max_depth: maximum depth of each tree (default 7, the value previously hard-coded)
    :param random_state: seed forwarded to scikit-learn; None keeps the unseeded behaviour,
        pass an int for reproducible fits
    :return: a RandomForestClassifier instance
    """
    # min_samples_split / min_samples_leaf are left at the scikit-learn defaults
    return RandomForestClassifier(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  random_state=random_state)

def create_ET(input_shape, output_shape, n_estimators=10, max_depth=5, random_state=None):
    """
    Create an unfitted extra-trees classifier.

    :param input_shape: unused; kept so that all model factories share one call signature
    :param output_shape: unused; kept so that all model factories share one call signature
    :param n_estimators: number of trees (default 10, the value previously hard-coded)
    :param max_depth: maximum depth of each tree (default 5, the value previously hard-coded)
    :param random_state: seed forwarded to scikit-learn; None keeps the unseeded behaviour,
        pass an int for reproducible fits
    :return: an ExtraTreesClassifier instance
    """
    return ExtraTreesClassifier(n_estimators=n_estimators,
                                max_depth=max_depth,
                                random_state=random_state)

def create_GB(input_shape, output_shape, random_state=None):
    """
    Create an unfitted gradient boosting classifier with scikit-learn's default hyper-parameters.

    :param input_shape: unused; kept so that all model factories share one call signature
    :param output_shape: unused; kept so that all model factories share one call signature
    :param random_state: seed forwarded to scikit-learn; None keeps the unseeded behaviour,
        pass an int for reproducible fits
    :return: a GradientBoostingClassifier instance
    """
    return GradientBoostingClassifier(random_state=random_state)

models_dict = {
#     'rf': create_RF,
#     'et': create_ET,
    'gb': create_GB,
}

In [316]:
# Groups of target indices that are learnt together (indices refer to TARGET_LABELS).
# With every explicit group commented out, the literal below is a plain list comprehension
# producing one single-index group per target: [[0], [1], ..., [23]].
# CAUTION: re-enabling an explicit group while keeping the `[i] for i in range(24)` line
# is not valid list syntax - remove that line when listing groups by hand.
common_groups = [
#     [2, 18, 23], 
#     [21, 22],
#     [2, ],
#     [2, 12],
#     [2, 18],
#     [18, 23],
#     [21, 23],
#     [23, ],
#     [18, ],
#     [12, ],
#     [21, ],
#     [22, ],
#     [3, 4, 7, 8],
#     [17, ],
    [i] for i in range(24)
]



def flatten(array):
    """Concatenate the items of `array` (an iterable of iterables) into one flat list."""
    return [element for item in array for element in item]

# Target indices that belong to no explicit group end up in a catch-all 'lm_others' group
nb_targets = len(TARGET_LABELS)
others = list(set(range(nb_targets)) - set(flatten(common_groups)))
NP_TARGET_LABELS = np.array(TARGET_LABELS)

# Show the index -> (readable name, column name) correspondence
for i, a in enumerate(zip(TARGET_LABELS2, TARGET_LABELS)):
    print("%s %s" % (i, a))

# Build one labels mask (array of target column names) per group
covered = set()
labels_masks_dict = {}
for i, g in enumerate(common_groups):
    labels_masks_dict['lm_%i' % i] = NP_TARGET_LABELS[g]
    covered |= set(g)
labels_masks_dict['lm_others'] = NP_TARGET_LABELS[others]
covered |= set(others)

# Every target must be covered by at least one group.
# The message formats the number of covered targets: formatting the set itself with %i
# would raise a TypeError and hide the assertion.
assert len(covered) == nb_targets, "Sum is not equal 24, s=%i" % len(covered)
print(labels_masks_dict)

0 ('Saving Account', 'ind_ahor_fin_ult1')
1 ('Guarantees', 'ind_aval_fin_ult1')
2 ('Current Accounts', 'ind_cco_fin_ult1')
3 ('Derivada Account', 'ind_cder_fin_ult1')
4 ('Payroll Account', 'ind_cno_fin_ult1')
5 ('Junior Account', 'ind_ctju_fin_ult1')
6 ('Mas particular Account', 'ind_ctma_fin_ult1')
7 ('particular Account', 'ind_ctop_fin_ult1')
8 ('particular Plus Account', 'ind_ctpp_fin_ult1')
9 ('Short-term deposits', 'ind_deco_fin_ult1')
10 ('Medium-term deposits', 'ind_deme_fin_ult1')
11 ('Long-term deposits', 'ind_dela_fin_ult1')
12 ('e-account', 'ind_ecue_fin_ult1')
13 ('Funds', 'ind_fond_fin_ult1')
14 ('Mortgage', 'ind_hip_fin_ult1')
15 ('Pensions (plan fin)', 'ind_plan_fin_ult1')
16 ('Loans', 'ind_pres_fin_ult1')
17 ('Taxes', 'ind_reca_fin_ult1')
18 ('Credit Card', 'ind_tjcr_fin_ult1')
19 ('Securities', 'ind_valo_fin_ult1')
20 ('Home Account', 'ind_viv_fin_ult1')
21 ('Payroll', 'ind_nomina_ult1')
22 ('Pensions', 'ind_nom_pens_ult1')
23 ('Direct Debit', 'ind_recibo_ult1')
{'lm_8

In [308]:
# Single-label groups go to the 'gb' model, multi-label groups to the 'rf' model;
# empty groups are assigned to neither.
# Each entry is a (None, labels mask name) pair - presumably (features mask, labels mask); verify in trainval.
gb_pipeline = []
rf_pipeline = []
for key in labels_masks_dict:
    nb_labels = len(labels_masks_dict[key])
    if nb_labels == 1:
        gb_pipeline.append((None, key))
    elif nb_labels > 1:
        rf_pipeline.append((None, key))

models_pipelines = {
    'gb': gb_pipeline,
    'rf': rf_pipeline,
}
models_pipelines

{'gb': [(None, 'lm_8'),
  (None, 'lm_9'),
  (None, 'lm_0'),
  (None, 'lm_1'),
  (None, 'lm_2'),
  (None, 'lm_3'),
  (None, 'lm_4'),
  (None, 'lm_5'),
  (None, 'lm_6'),
  (None, 'lm_7'),
  (None, 'lm_12'),
  (None, 'lm_13'),
  (None, 'lm_10'),
  (None, 'lm_11'),
  (None, 'lm_16'),
  (None, 'lm_17'),
  (None, 'lm_14'),
  (None, 'lm_15'),
  (None, 'lm_18'),
  (None, 'lm_19'),
  (None, 'lm_23'),
  (None, 'lm_22'),
  (None, 'lm_21'),
  (None, 'lm_20')],
 'rf': []}

In [309]:
from trainval import train_all, predict_all, probas_to_indices

In [313]:
# Hold-out split by row position: the first `ll` rows are used for training,
# the following `ll // 2` rows (or fewer, if X is shorter) for validation.
ll = 100000
mask = X.index.isin(X.index[:ll])

# Training part (features, targets, last choices)
X1 = X[mask]
Y1 = Y[mask]
clc = clients_last_choice[mask]
print X1.shape, Y1.shape, clc.shape

# Validation part; `mask` is reused for the second slice
mask = X.index.isin(X.index[ll:ll+ll//2])
X2 = X[mask]
Y2 = Y[mask]
clc2 = clients_last_choice[mask]
print X2.shape, Y2.shape, clc2.shape

(100000, 75) (100000, 27) (100000, 26)
(49985, 75) (49985, 27) (49985, 26)


In [314]:
# Keyword arguments shared by the train_all / predict_all / cross_val_score calls of this notebook.
# With 'return_probas' set to True, the predict_all result is unpacked below as (y_preds, Y_probas).
# NOTE(review): the meaning of 'threshold', 'n_highest' and 'mode' is defined in trainval - see there.
_kwargs = {'samples_masks_list': samples_masks_list, 
            'features_masks_dict': features_masks_dict, 
            'labels_masks_dict': labels_masks_dict, 
            'models_dict': models_dict,
            'labels': TARGET_LABELS,
            'transform_proba_func': probas_to_indices,
            'threshold': 0.0,
            'n_highest': 7,
            'mode': 'sum',
            'verbose': True,
            'models_pipelines': models_pipelines,
            'return_probas': True
          }

In [315]:
estimators = train_all(X1, Y1, **_kwargs)

#print estimators

INFO:root:-- Train all --
INFO:root:-- Process : sample_mask=100000/100000, features_mask=fm0, labels_mask=lm_8
INFO:root:--- Score : model='gb', fit accuracy : 0.998960
INFO:root:

	 -- Feature ranking : -- 


INFO:root:--- Estimator : fm0, lm_8, gb
INFO:root:1. feature 57 'ind_ctpp_fin_ult1_frq_prev' (0.449825)
INFO:root:2. feature 24 'targets_logDec' (0.120568)
INFO:root:3. feature 21 'targets_logcount2_diff' (0.071395)
INFO:root:4. feature 19 'targets_diff' (0.061032)
INFO:root:5. feature 33 'ind_ctpp_fin_ult1_frq' (0.050025)
INFO:root:6. feature 5 'antiguedad' (0.038676)
INFO:root:7. feature 20 'targets_logdiff' (0.036538)
INFO:root:8. feature 3 'age' (0.025140)
INFO:root:9. feature 13 'canal_entrada' (0.017637)
INFO:root:10. feature 15 'nomprov' (0.016927)
INFO:root:11. feature 22 'targets_logcount2' (0.015860)
INFO:root:12. feature 23 'targets_logcount1' (0.013192)
INFO:root:13. feature 17 'renta' (0.010340)
INFO:root:14. feature 28 'ind_cder_fin_ult1_frq' (0.010043)
INFO:root:1

In [317]:
y_preds, Y_probas = predict_all(estimators, X2, **_kwargs)
#print y_preds[:5]

INFO:root:-- Predict all --


In [318]:
Y_probas.head()

Unnamed: 0,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
639908,0,0,0.9913,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
639893,0,0,0.9913,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.97628
639898,0,0,0.9913,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
639902,0,0,0.9913,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
639905,0,0,0.9913,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [319]:
# Ground truth of the validation split, converted to the index representation used by map7_score
y_val = targets_str_to_indices(Y2[TARGET_LABELS].values)

# Scoring the ground truth against itself gives the best score reachable on this split
logging.info("- Compute max map7 score")
map7_score(y_val, y_val, clc2[LC_TARGET_LABELS].values)
# Score of the model predictions on the same split
logging.info("- Compute map7 score")
map7_score(y_val, y_preds, clc2[LC_TARGET_LABELS].values)


INFO:root:- Compute max map7 score
INFO:root:-- Predicted map7 score: 0.02788836651
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.00214230935947


0.002142309359474509

In [224]:
# print labels_masks_dict[estimators[0][0][1]]
# print estimators[0][1].classes_
# print estimators[0][1].n_classes_
# print estimators[0][1].n_features_
# print estimators[0][1].n_outputs_
# print estimators[0][1].estimators_

In [251]:
from utils import targets_to_labels, targets_indices_to_labels, remove_last_choice
from collections import defaultdict

In [300]:
# Inspect the validation rows where products were added but none of them was predicted.
# `limit` caps the number of printed examples (the first `limit - 1` misses are shown).
limit = 25
count = 0

# Per readable product name: how many times it was added in a row where no added product was predicted
not_predicted_predicted = defaultdict(int)
# `proba` is unpacked for optional debugging only (see the commented-out print arguments)
for last_choice, targets, products, proba in zip(clc2[LC_TARGET_LABELS].values, y_val, y_preds, Y_probas.values):
    # presumably drops the products already held at the last choice - verify in utils.remove_last_choice
    added_products = remove_last_choice(targets, last_choice)
    predictions = remove_last_choice(products, last_choice)
    
    # Nothing was added for this client: nothing to predict
    if len(added_products) == 0:
        continue
        
    # At least one added product was predicted: not a miss
    if len(set(added_products) & set(predictions)) > 0:
#         print "Predicted : ", added_products, predictions
#         print set(added_products) & set(predictions)
        continue

    count += 1
    if count < limit:
        print "--- Count = ", count
        print targets_indices_to_labels(added_products, TARGET_LABELS2)#, targets_indices_to_labels(targets, TARGET_LABELS2)
        print targets_indices_to_labels(predictions, TARGET_LABELS2), targets_indices_to_labels(products, TARGET_LABELS2)#, proba
    
    for p in added_products:
        not_predicted_predicted[TARGET_LABELS2[p]] += 1
    

--- Count =  1
['Current Accounts']
[] []
--- Count =  2
['Current Accounts']
[] []
--- Count =  3
['Payroll Account']
[] ['Current Accounts', 'Pensions', 'Payroll']
--- Count =  4
['Payroll', 'Pensions']
[] ['Current Accounts']
--- Count =  5
['Direct Debit']
[] ['Current Accounts']
--- Count =  6
['Current Accounts']
[] ['Direct Debit']
--- Count =  7
['Direct Debit']
[] ['Current Accounts']
--- Count =  8
['Credit Card']
[] ['Direct Debit']
--- Count =  9
['Payroll', 'Pensions']
[] ['e-account', 'Payroll Account', 'Direct Debit']
--- Count =  10
['Payroll Account']
['Direct Debit'] ['Pensions', 'Direct Debit']
--- Count =  11
['Payroll Account']
[] ['Direct Debit']
--- Count =  12
['Payroll Account']
[] ['Pensions', 'Payroll', 'Direct Debit']
--- Count =  13
['Payroll', 'Pensions']
[] ['Payroll Account']
--- Count =  14
['Payroll', 'Pensions']
[] ['Taxes', 'Payroll Account', 'Direct Debit']
--- Count =  15
['Current Accounts']
[] []
--- Count =  16
['Payroll', 'Pensions']
[] ['e-acc

In [301]:
print not_predicted_predicted

defaultdict(<type 'int'>, {'Long-term deposits': 1, 'Direct Debit': 325, 'e-account': 49, 'Payroll': 225, 'Payroll Account': 87, 'Taxes': 14, 'Pensions': 227, 'Securities': 2, 'Credit Card': 51, 'Current Accounts': 111})


In [None]:
#print y_probas[:10, target_groups[0]]
#print Y[np.array(TARGET_LABELS)[target_groups[0]]].head(10)

### Run KFold Cross-validation 

In [66]:
from trainval import cross_val_score

In [228]:
# K-fold cross-validation on the full (X, Y, last choices) triple with the shared settings
nb_folds = 5
results = cross_val_score((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                            nb_folds=nb_folds, **_kwargs)

# Summary columns: folds | min | mean | median | max | std
print "Cross-Validation \n %i | %f | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), np.median(results), results.max(), results.std())


INFO:root:- Cross validation : 
INFO:root:

		-- Fold : 1 / 5

INFO:root:-- Train all --
INFO:root:-- Process : sample_mask=119988/119988, features_mask=fm0, labels_mask=lm_12
INFO:root:--- Score : model='rf', fit accuracy : 0.989791
INFO:root:-- Process : sample_mask=119988/119988, features_mask=fm0, labels_mask=lm_13
INFO:root:-- Process : sample_mask=119988/119988, features_mask=fm0, labels_mask=lm_10
INFO:root:-- Process : sample_mask=119988/119988, features_mask=fm0, labels_mask=lm_11
INFO:root:-- Process : sample_mask=119988/119988, features_mask=fm0, labels_mask=lm_8
INFO:root:-- Process : sample_mask=119988/119988, features_mask=fm0, labels_mask=lm_9
INFO:root:-- Process : sample_mask=119988/119988, features_mask=fm0, labels_mask=lm_others
INFO:root:--- Score : model='rf', fit accuracy : 0.975989
INFO:root:-- Process : sample_mask=119988/119988, features_mask=fm0, labels_mask=lm_0
INFO:root:--- Score : model='rf', fit accuracy : 0.942019
INFO:root:-- Process : sample_mask=11998

Cross-Validation 
 5 | 0.002189 | 0.002502 | 0.002414 | 0.002951 | 0.00030 



### 201505 -> 201605 

#### Single profiles:

Profiles   [0, 100] 
Cross-Validation (normalized)
 3 | 0.011683 | 0.012817 | 0.014958 | 0.00152 

Cross-Validation (not normalized)
 3 | 0.009244 | 0.010407 | 0.011922 | 0.00112 
 
 
Profiles :  [1, 101]
Cross-Validation (normalized)
 3 | 0.006793 | 0.009161 | 0.012219 | 0.00227 

Cross-Validation (not normalized)
 3 | 0.004787 | 0.009852 | 0.014922 | 0.00414


Profiles :  [112, 12]
Cross-Validation (normalized)
 3 | 0.008856 | 0.012124 | 0.016443 | 0.00318 

Cross-Validation (not normalized)
 3 | 0.007298 | 0.010140 | 0.014101 | 0.00289 

Compute cross-validation across several months

In [None]:
nb_folds = 3
yms = [201504, 201505]
#yms = [201505]

# NOTE: for each month `ym`, get_XY needs `ym`, `ym + 100` and the month before `ym + 100`
# to be present in `months_ym_map`, i.e. loaded by load_trainval.
for ym in yms:
    logging.info("\n-------------------------")
    logging.info("- Process month : %s" % ym)
    logging.info("-------------------------\n")

    ym1 = ym + 100
    df1 = train_df if months_ym_map[ym] in train_months else val_df
    df2 = train_df if months_ym_map[ym1] in train_months else val_df
    # get_XY requires the yearmonth lookup as its last argument
    X, Y, clients_last_choice = get_XY(ym, df1, ym1, df2, months_ym_map)
    # Use the imported `cross_val_score` with the shared settings, as in the single-month run;
    # `cross_val_score2` and `profiles` are not defined in this notebook
    results = cross_val_score((X, Y, clients_last_choice[LC_TARGET_LABELS].values),
                              nb_folds=nb_folds, **_kwargs)
    # Summary columns: folds | min | mean | max | std
    print("Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std()))

## Train model for predictions

In [None]:
# Rebuild the full (X, Y, last choices) triple used to fit the final model:
# features at `current_month`, targets one year later (YYYYMM + 100).
current_month = 201505
next_year_month = current_month + 100

# Both months are read from the training frame (the validation alternatives are disabled)
df1 = train_df
#df1 = val_df
df2 = train_df #if months_ym_map[next_year_month] in train_months else val_df
#df2 = val_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [None]:
estimators = train_all(X, Y, **_kwargs)

In [None]:
# `_kwargs` sets 'return_probas' to True, so predict_all returns a (predictions, probabilities) pair;
# unpack it so that `y_preds` holds the predictions only.
y_preds, Y_probas = predict_all(estimators, X, **_kwargs)

Check score on the data 2016/05

In [None]:
# The predictions were made on the full `X`, so the ground truth has to be rebuilt from the
# full `Y`; the `y_val` computed earlier only covers the validation split `Y2`.
y_val = targets_str_to_indices(Y[TARGET_LABELS].values)
last_choices = clients_last_choice[LC_TARGET_LABELS].values

logging.info("- Compute map7 score")
print(map7_score(y_val, y_preds, last_choices))
# Scoring the ground truth against itself gives the best reachable score
logging.info("- Compute max map7 score")
print(map7_score(y_val, y_val, last_choices))

## Prediction for 2016/06

In [None]:
from dataset import load_train_test

In [None]:
full_train_df, test_df = load_train_test([201506])

In [None]:
full_train_df.head()

In [None]:
test_df.head()

In [None]:
# Distinct snapshot dates of each frame
full_train_months = full_train_df['fecha_dato'].unique()
test_months = test_df['fecha_dato'].unique()

# Lookup table over the dates of both frames:
# yearmonth key returned by `to_yearmonth` -> raw 'fecha_dato' value
months = list(set(full_train_df['fecha_dato'].unique()) | set(test_df['fecha_dato'].unique()))
months_ym_map = dict((to_yearmonth(month), month) for month in months)

In [None]:
# Features at 201506 from the full training frame, last choices at 201606 from the test frame.
current_month = 201506
next_year_month = current_month + 100

df1 = full_train_df
df2 = test_df
# The targets (second return value) are discarded here.
# NOTE(review): get_XY only appends the '_prev' frequency columns when df2 carries the target
# columns; if test_df lacks them, X will miss columns listed in the 'fm0' features mask - verify.
X, _, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [None]:
print X.shape, test_df.shape

In [None]:
X.head(10)

In [None]:
clients_last_choice.head(10)

In [None]:
def get_submission(predicted_added_products, clients, clc, target_labels):
    """
    Assemble the submission frame with one row per client.

    :param predicted_added_products: per-client predicted product indices
    :param clients: client ids, written to the 'ncodpers' column
    :param clc: per-client last-choice values, passed to `remove_last_choice`
    :param target_labels: sequence mapping a product index to its label
    :return: pd.DataFrame with columns ['ncodpers', 'added_products'], where 'added_products'
        holds the space-separated labels kept by `remove_last_choice`
    """
    added_products_col = []
    for row_number, (products, last_choice) in enumerate(zip(predicted_added_products, clc), 1):
        kept = remove_last_choice(products, last_choice)
        added_products_col.append(' '.join(target_labels[i] for i in kept))
        # progress message every 100000 clients
        if row_number % 100000 == 0:
            logging.info("Elapsed : %i", row_number)

    return pd.DataFrame(data={'ncodpers': clients, 'added_products': added_products_col},
                        columns=['ncodpers', 'added_products'])

In [None]:
# `predict_with_model` and `profiles` are not defined in this notebook: predict with the
# imported `predict_all` and the shared settings, keeping the 0.5 threshold of this cell.
# With 'return_probas' set to True, predict_all returns a (predictions, probabilities) pair.
predict_kwargs = dict(_kwargs, threshold=0.5)
y_pred, test_probas = predict_all(estimators, X, **predict_kwargs)

logging.info("- Get submission dataframe:")
clients = X['ncodpers'].values
# The last-choice frame built by get_XY holds the LC_TARGET_LABELS columns, not TARGET_LABELS
submission = get_submission(y_pred, clients, clients_last_choice[LC_TARGET_LABELS].values, TARGET_LABELS)

In [None]:
# Every test client must appear in the submission: add the ones without prediction
submission_clients = set(submission['ncodpers'].unique())
test_clients = set(test_df['ncodpers'].unique())
missing_clients = sorted(test_clients - submission_clients)
if missing_clients:
    # 'added_products' is a space-separated string of product labels, so a client without
    # prediction gets an empty string (a numeric 0.0 would be written as a bogus product)
    missing_rows = pd.DataFrame(data={
                                    'ncodpers': missing_clients,
                                    'added_products': [''] * len(missing_clients)
                                }, columns=['ncodpers', 'added_products'])
    submission = pd.concat([submission, missing_rows])

Get submission DataFrame and write csv file

In [None]:
print submission.shape
submission.head()

In [None]:
from datetime import datetime
import csv

logging.info('- Generate submission')
# The output name carries a timestamp with minute precision
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
submission_file = ''.join(['../results/submission_', timestamp, '.csv'])

# Write the frame without its index
submission.to_csv(submission_file, index=False, index_label=False)

In [None]:
# Preview the first five lines of the file written above; a hard-coded timestamped name
# would point to a previous run and break on a fresh execution.
with open(submission_file, 'r') as r:
    for line_number in range(5):
        print(r.readline())
    