# Decision trees and NN tryouts on SPR data

Load training and validation data as 
    month : [ Features | Targets| Difference | Last Choice Targets  ]
    

In [2]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload
%autoreload 2

In [3]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.INFO)

import matplotlib.pylab as plt
%matplotlib inline

In [4]:
import sys
sys.path.append("../common")

from dataset import load_trainval, LC_TARGET_LABELS, TARGET_LABELS_FRQ, TARGET_LABELS_DIFF
from utils import to_yearmonth, TARGET_LABELS, TARGET_LABELS2
from utils import target_str_to_labels, decimal_to_dummies, targets_str_to_indices, targets_dec_to_indices

In [5]:
# Client profile columns that are selected from the data frame and fed to the
# models as input features.
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

In [6]:
# Year-months (YYYYMM integers) requested from the loader.
# train_yearmonths_list = [201504, 201505, 201604]
train_yearmonths_list = [201505, 201605]
# train_yearmonths_list = [201505]
#val_yearmonth = [201605]
# Number of clients to select when loading.
train_nb_clients = 15000
# train_nb_clients = 1500
# The validation split is currently disabled: only `train_df` is produced.
#train_df, val_df = load_trainval(train_yearmonths_list, val_yearmonth, train_nb_clients, val_nb_clients=1500)
train_df = load_trainval(train_yearmonths_list, train_nb_clients=train_nb_clients)

INFO:root:- Load training data : 
INFO:root:- Load data : [201504, 201505, 201604, 201605]
INFO:root:-- Select 15000 clients
INFO:root:- Number of lines with unknown data : 0
INFO:root:- Number of columns with nan : 6
INFO:root:-- Process date : 201505
INFO:root:-- Process date : 201605
INFO:root:-- Add logCount columns
INFO:root:-- Process month : 2015-04-28
INFO:root:-- Process month : 2015-05-28
INFO:root:-- Process month : 2016-04-28
INFO:root:-- Process month : 2016-05-28
INFO:root:-- Add logDecimal columns
INFO:root:-- Transform age/renta/logdiff
INFO:root:-- Add target values frequencies
INFO:root:-- Add target diff


Display loaded data

In [7]:
# Preview the `*_frq` target columns next to the date and the client id.
train_df[['fecha_dato', 'ncodpers'] + TARGET_LABELS_FRQ.tolist()].head(10)

Unnamed: 0,fecha_dato,ncodpers,ind_ahor_fin_ult1_frq,ind_aval_fin_ult1_frq,ind_cco_fin_ult1_frq,ind_cder_fin_ult1_frq,ind_cno_fin_ult1_frq,ind_ctju_fin_ult1_frq,ind_ctma_fin_ult1_frq,ind_ctop_fin_ult1_frq,...,ind_hip_fin_ult1_frq,ind_plan_fin_ult1_frq,ind_pres_fin_ult1_frq,ind_reca_fin_ult1_frq,ind_tjcr_fin_ult1_frq,ind_valo_fin_ult1_frq,ind_viv_fin_ult1_frq,ind_nomina_ult1_frq,ind_nom_pens_ult1_frq,ind_recibo_ult1_frq
210134,2015-04-28,15927,0.999867,0.999867,0.226217,0.999767,0.900933,0.9884,0.98935,0.838733,...,0.007933,0.0111,0.0029,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.1573
1051651,2015-05-28,15927,0.999867,0.999867,0.226217,0.999767,0.900933,0.9884,0.98935,0.838733,...,0.007933,0.0111,0.0029,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.1573
1695759,2016-04-28,15927,0.999867,0.999867,0.773783,0.999767,0.900933,0.9884,0.98935,0.838733,...,0.992067,0.0111,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.1573
2501109,2016-05-28,15927,0.999867,0.999867,0.226217,0.999767,0.900933,0.9884,0.98935,0.838733,...,0.992067,0.0111,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.1573
210108,2015-04-28,15935,0.999867,0.999867,0.773783,0.999767,0.900933,0.9884,0.98935,0.161267,...,0.992067,0.9889,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.1573
1051659,2015-05-28,15935,0.999867,0.999867,0.773783,0.999767,0.900933,0.9884,0.98935,0.161267,...,0.992067,0.9889,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.1573
1695766,2016-04-28,15935,0.999867,0.999867,0.773783,0.999767,0.900933,0.9884,0.98935,0.161267,...,0.992067,0.9889,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.1573
2501102,2016-05-28,15935,0.999867,0.999867,0.773783,0.999767,0.900933,0.9884,0.98935,0.161267,...,0.992067,0.9889,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.1573
210080,2015-04-28,15948,0.999867,0.999867,0.226217,0.999767,0.900933,0.9884,0.98935,0.838733,...,0.992067,0.0111,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.8427
1051706,2015-05-28,15948,0.999867,0.999867,0.226217,0.999767,0.900933,0.9884,0.98935,0.838733,...,0.992067,0.0111,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.8427


Useful structures

In [8]:
def get_common_clients(df1, mask1, mask2, df2=None):
    """Flag the rows whose client id appears under both masks.

    `mask1` is applied to `df1`. `mask2` is applied to `df2` when it is given,
    otherwise to `df1`. The client ids ('ncodpers') found under both masks are
    the common clients.

    Returns a boolean Series over all rows of `df1` telling whether the row
    belongs to a common client. When `df2` is given, a pair of such Series is
    returned instead, one for `df1` and one for `df2`.
    """
    second_frame = df1 if df2 is None else df2
    ids_under_mask1 = set(df1[mask1]['ncodpers'].unique())
    ids_under_mask2 = set(second_frame[mask2]['ncodpers'].unique())
    shared_ids = list(ids_under_mask1 & ids_under_mask2)

    rows_in_df1 = df1['ncodpers'].isin(shared_ids)
    if df2 is None:
        return rows_in_df1
    return rows_in_df1, df2['ncodpers'].isin(shared_ids)

In [9]:
# Lookup from the year-month key returned by `to_yearmonth` to the matching
# `fecha_dato` value of the training frame.
months_ym_map = {}
# months = list(set(train_df['fecha_dato'].unique()) | set(val_df['fecha_dato'].unique()))
months = train_df['fecha_dato'].unique()
for m in months:
    months_ym_map[to_yearmonth(m)] = m

        
# Distinct `fecha_dato` values present in the training frame.
train_months = train_df['fecha_dato'].unique()
# val_months = val_df['fecha_dato'].unique()
    

### Train a model

In [10]:
from utils import get_added_products, remove_last_choice, apk, map7_score
from visualization import visualize_train_test, visualize_folds, compare_two_datasets, compare_folds, compare_folds2

In [20]:
# Target-derived columns that are added to the feature frame alongside `features`.
target_features = ['targets_diff', 'targets_logdiff', 'targets_logcount2_diff', 'targets_logcount2', 'targets_logcount1', 'targets_logDec']
# Column names used for the previous-month copy of the `*_frq` columns.
TARGET_LABELS_FRQ_PREV = [c + '_prev' for c in TARGET_LABELS_FRQ]

In [12]:
def get_XY(current_month, df1, next_year_month, df2, months_ym_map):
    """Build aligned feature, target and last-choice frames for one month pair.

    Feature rows come from `df1` at `current_month`. Target and last-choice
    rows come from `df2` at `next_year_month`. Only clients present in `df1`
    at `current_month` and in `df2` at both `next_year_month` and the month
    before it are kept.

    Parameters
    ----------
    current_month : int
        YYYYMM key into `months_ym_map` selecting the feature rows of `df1`.
    df1 : pd.DataFrame
        Frame holding the feature rows.
    next_year_month : int
        YYYYMM key into `months_ym_map` selecting the target rows of `df2`.
    df2 : pd.DataFrame
        Frame holding the target rows and the rows of the preceding month.
    months_ym_map : dict
        Maps YYYYMM keys to `fecha_dato` values.

    Returns
    -------
    X : pd.DataFrame
        Client id, date, `target_features`, `features` and the `*_frq` columns;
        the previous-month `*_frq_prev` columns are appended when available.
    Y : pd.DataFrame or None
        Target columns re-indexed like `X`, or None when `df2` has no usable
        targets for `next_year_month`.
    clients_last_choice : pd.DataFrame or None
        Last-choice columns from `df2`, or None when they are absent.

    Raises
    ------
    KeyError
        If one of the three months is missing from `months_ym_map`.
    AssertionError
        If the selected client ids do not line up across the frames.
    """
    # Previous calendar month in YYYYMM form. A plain `- 1` would turn January
    # (e.g. 201601) into the non-existent 201600, so roll over to December of
    # the previous year instead.
    if next_year_month % 100 == 1:
        next_year_prev_month = next_year_month - 89
    else:
        next_year_prev_month = next_year_month - 1

    month_mask = df1['fecha_dato'] == months_ym_map[current_month]
    next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
    next_year_prev_month_mask = df2['fecha_dato'] == months_ym_map[next_year_prev_month]

    # get common clients from df1 at this month and df2 at next year month
    common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)
    # then restrict to clients that are also present in df2 at the previous month
    common_clients_mask2, common_clients_mask3 = get_common_clients(df2, common_clients_mask2 & next_year_month_mask, next_year_prev_month_mask, df2)

    # The three selections must list the same clients in the same order,
    # because the frames are aligned by position below.
    c1 = df1[common_clients_mask1 & month_mask]['ncodpers'].values
    c2 = df2[common_clients_mask2 & next_year_month_mask]['ncodpers'].values
    c3 = df2[common_clients_mask3 & next_year_prev_month_mask]['ncodpers'].values
    assert (c1 == c2).all() and (c2 == c3).all(), "Problem with common clients"

    X = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + target_features + features + TARGET_LABELS_FRQ.tolist()]

    if TARGET_LABELS[0] in df2.columns and TARGET_LABELS_DIFF[0] in df2.columns and not df2[next_year_month_mask][TARGET_LABELS].isnull().all().all():
        Y = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato', 'targets_str', 'lc_targets_str', 'targets_diff'] + TARGET_LABELS + TARGET_LABELS_DIFF.tolist()]
        assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "There is a problem in alignment"
        # Share X's index so that one boolean mask selects matching rows in both.
        Y.index = X.index
    else:
        Y = None

    if TARGET_LABELS_FRQ[0] in df2.columns and not df2[next_year_prev_month_mask][TARGET_LABELS].isnull().all().all():
        # Add TARGET_LABELS_FRQ from previous month to X:
        target_labels_frq = df2[common_clients_mask3 & next_year_prev_month_mask][['ncodpers'] + TARGET_LABELS_FRQ.tolist()]
        assert (X['ncodpers'].values == target_labels_frq['ncodpers'].values).all(), "There is a problem in alignment"
        target_labels_frq = target_labels_frq[TARGET_LABELS_FRQ]
        target_labels_frq.columns = TARGET_LABELS_FRQ_PREV
        target_labels_frq.index = X.index
        X = pd.concat([X, target_labels_frq], axis=1)

    if LC_TARGET_LABELS[0] in df2.columns:
        clients_last_choice = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato', 'targets_str'] + LC_TARGET_LABELS.tolist()]
    else:
        clients_last_choice = None

    return X, Y, clients_last_choice


In [13]:
current_month = 201505
next_year_month = current_month + 100

# `months_ym_map` and `train_months` are both built from `train_df` alone, so
# every month that can be looked up lives in `train_df`. The previous
# `... else val_df` fallback could never be taken and referenced a frame that
# is not loaded in this notebook. Fail with an explicit message instead.
for yearmonth in (current_month, next_year_month):
    assert yearmonth in months_ym_map, "Month %i is not available in the loaded training data" % yearmonth

df1 = train_df
df2 = train_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [14]:
# Sanity check: X, Y and clients_last_choice must list the same clients in the same order.
assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "WTF"
assert (X['ncodpers'].values == clients_last_choice['ncodpers'].values).all(), "WTF"

In [15]:
# Shape and first rows of the feature frame.
print X.shape
X.head(10)

(15000, 75)


Unnamed: 0,ncodpers,fecha_dato,targets_diff,targets_logdiff,targets_logcount2_diff,targets_logcount2,targets_logcount1,targets_logDec,ind_empleado,pais_residencia,...,ind_hip_fin_ult1_frq_prev,ind_plan_fin_ult1_frq_prev,ind_pres_fin_ult1_frq_prev,ind_reca_fin_ult1_frq_prev,ind_tjcr_fin_ult1_frq_prev,ind_valo_fin_ult1_frq_prev,ind_viv_fin_ult1_frq_prev,ind_nomina_ult1_frq_prev,ind_nom_pens_ult1_frq_prev,ind_recibo_ult1_frq_prev
1051651,15927,2015-05-28,0.0,0.0,0.0,3.3e-05,6.7e-05,10.424244,3,0,...,0.992067,0.0111,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.1573
1051659,15935,2015-05-28,0.0,0.0,0.0,0.000217,0.000133,14.5897,3,0,...,0.992067,0.9889,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.1573
1051706,15948,2015-05-28,0.0,0.0,0.0,0.001333,0.001467,5.549076,0,0,...,0.992067,0.0111,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.8427
1051698,15961,2015-05-28,0.0,0.0,0.0,0.031583,0.0258,0.0,3,0,...,0.992067,0.9889,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.8427
1051601,16046,2015-05-28,0.0,0.0,0.0,0.001783,0.0018,14.556107,1,0,...,0.992067,0.9889,0.9971,0.937383,0.057117,0.96735,0.994917,0.934733,0.929367,0.1573
1051855,16229,2015-05-28,-65568.0,-11.090858,0.0,1.7e-05,6.7e-05,13.182566,0,0,...,0.007933,0.9889,0.9971,0.062617,0.942883,0.03265,0.994917,0.934733,0.929367,0.1573
1051860,16236,2015-05-28,-2.0,-1.098612,0.000433,0.000533,0.000667,13.287581,0,0,...,0.992067,0.9889,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.070633,0.8427
1051839,16265,2015-05-28,0.0,0.0,0.0,0.514933,0.521667,14.556091,0,0,...,0.992067,0.9889,0.9971,0.937383,0.942883,0.96735,0.994917,0.934733,0.929367,0.8427
1051790,16521,2015-05-28,0.0,0.0,0.0,3.3e-05,6.7e-05,13.184072,2,0,...,0.992067,0.0111,0.9971,0.062617,0.942883,0.03265,0.994917,0.934733,0.929367,0.1573
1051265,16808,2015-05-28,0.0,0.0,0.0,0.000433,0.000467,14.556137,0,0,...,0.992067,0.9889,0.9971,0.062617,0.057117,0.96735,0.994917,0.934733,0.929367,0.8427


In [16]:
# Shape of the target frame, then the first rows with a positive `targets_diff`.
print Y.shape
Y[Y['targets_diff'] > 0][['fecha_dato', 'ncodpers', 'targets_str', 'lc_targets_str'] + TARGET_LABELS_DIFF.tolist() ].head(10)

(15000, 53)


Unnamed: 0,fecha_dato,ncodpers,targets_str,lc_targets_str,ind_ahor_fin_ult1_diff,ind_aval_fin_ult1_diff,ind_cco_fin_ult1_diff,ind_cder_fin_ult1_diff,ind_cno_fin_ult1_diff,ind_ctju_fin_ult1_diff,...,ind_hip_fin_ult1_diff,ind_plan_fin_ult1_diff,ind_pres_fin_ult1_diff,ind_reca_fin_ult1_diff,ind_tjcr_fin_ult1_diff,ind_valo_fin_ult1_diff,ind_viv_fin_ult1_diff,ind_nomina_ult1_diff,ind_nom_pens_ult1_diff,ind_recibo_ult1_diff
1052365,2016-05-28,17841,000010000001100000100001,000010000001100000000001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1050393,2016-05-28,19077,000010000000100100100111,000010000000100100100001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1050090,2016-05-28,19914,001000000000000000110001,001000000000000000010001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1051042,2016-05-28,20514,001000001000010000000001,001000001000010000000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1047171,2016-05-28,32094,000010000000101001100001,000010000000101000100111,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1046224,2016-05-28,35264,001000000000000000000001,001000000000000000000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1061139,2016-05-28,44866,001010011000000000111110,001010011000000000111000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1061975,2016-05-28,46968,001000001000010000000001,001000001000010000000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1060282,2016-05-28,47196,000010000000100001100011,000010000000100001000011,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1060439,2016-05-28,47479,000010000000000000100111,000000000000000000100111,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Shape and first rows of the last-choice frame.
print clients_last_choice.shape
clients_last_choice.head(10)

(15000, 27)


Unnamed: 0,ncodpers,fecha_dato,targets_str,lc_ind_ahor_fin_ult1,lc_ind_aval_fin_ult1,lc_ind_cco_fin_ult1,lc_ind_cder_fin_ult1,lc_ind_cno_fin_ult1,lc_ind_ctju_fin_ult1,lc_ind_ctma_fin_ult1,...,lc_ind_hip_fin_ult1,lc_ind_plan_fin_ult1,lc_ind_pres_fin_ult1,lc_ind_reca_fin_ult1,lc_ind_tjcr_fin_ult1,lc_ind_valo_fin_ult1,lc_ind_viv_fin_ult1,lc_ind_nomina_ult1,lc_ind_nom_pens_ult1,lc_ind_recibo_ult1
2501109,15927,2016-05-28,000000001000000100000001,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2501102,15935,2016-05-28,001000010001100000000001,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2501090,15948,2016-05-28,000000000000000100000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2501022,15961,2016-05-28,000000000000000000000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2501184,16046,2016-05-28,001000000000000000100001,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2500849,16229,2016-05-28,000010001000101001010001,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2500842,16236,2016-05-28,000010010000000000000010,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2500880,16265,2016-05-28,001000000000000000000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2500968,16521,2016-05-28,001000000000110101010000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2501548,16808,2016-05-28,001000000000000001100000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


## Another train/predict + CV implementation

### Input

- `X` : `[nb_samples, nb_features]` shaped pd.DataFrame
    - `features_masks_dict` : `{fm1_name: features_mask_1, fm2_name: features_mask_2, ...}` where each `features_mask_i` is a list of feature column names. Masks may overlap.
    
- `Y` : `[nb_samples, nb_labels]` shaped pd.DataFrame
    - `labels_masks_dict` : `{lm1_name: labels_mask_1, lm2_name: labels_mask_2, ...}` where each `labels_mask_i` is a list of label column names. Masks may overlap.

- `samples_masks_list` : `[samples_mask_1, samples_mask_2, ...]` where each `samples_mask_i` is a function `(x, y) -> boolean mask` that selects the rows to keep. Used only for training.


- Set of models `models` : list of functions to create a model, e.g. `[create_RF, create_NN, create_GBT]`


### Training phase




In [18]:
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

In [56]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Merge
from keras.utils import np_utils

from keras.wrappers.scikit_learn import KerasClassifier

In [106]:
# Row filters: each entry takes the (x, y) frames and returns a boolean row mask.
samples_masks_list = [
    # rows where `targets_diff` is non-zero in x or in y
    lambda x, y:  ~(x['targets_diff'].isin([0])) | ~(y['targets_diff'].isin([0])), 
    # rows where `targets_diff` is positive in x or in y
    lambda x, y:  (x['targets_diff'] > 0) | (y['targets_diff'] > 0), 
]

# Named feature subsets; each value is a list of X column names.
features_masks_dict = {
    'fm0': features + target_features + TARGET_LABELS_FRQ.tolist() + TARGET_LABELS_FRQ_PREV,
    'fm1': ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
#     'fm2': target_features,
    'fm3': ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
    'fm4': ['pais_residencia', 'sexo', 'age', 'renta', 'targets_logdiff', 'targets_logcount2_diff','targets_logcount2','targets_logcount1'],
    'fm5': ['nomprov', 'ind_nuevo', 'renta', 'ind_actividad_cliente', 'canal_entrada'],
#     'fm6': TARGET_LABELS_FRQ,
}

In [112]:
# https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

def create_RF(input_shape, output_shape):
    """Return a new, unfitted random forest classifier.

    `input_shape` and `output_shape` are accepted so that every model factory
    has the same signature; they are not used here.
    """
    # Tuning notes: https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
    # Options tried and currently disabled: min_samples_split=100,
    # min_samples_leaf=25, max_depth=10, oob_score=True, bootstrap=True.
    forest_params = dict(
        n_estimators=100,
        max_features=1.0,
        n_jobs=-1,
    )
    return RandomForestClassifier(**forest_params)

def create_ET(input_shape, output_shape):
    """Return a new, unfitted extra-trees classifier.

    `input_shape` and `output_shape` are accepted so that every model factory
    has the same signature; they are not used here.
    """
    # Options tried and currently disabled: min_samples_leaf=25, max_depth=10.
    extra_trees_params = dict(
        n_estimators=100,
        max_features=1.0,
        oob_score=True,
        bootstrap=True,
        n_jobs=-1,
    )
    return ExtraTreesClassifier(**extra_trees_params)

def create_GB(input_shape, output_shape):
    """Return a new, unfitted gradient boosting classifier.

    `input_shape` and `output_shape` are accepted so that every model factory
    has the same signature; they are not used here.
    """
    n_boosting_stages = 75
    return GradientBoostingClassifier(n_estimators=n_boosting_stages)


def create_NN0(input_shape, output_shape):
    """Return a scikit-learn style wrapper around a one-hidden-layer network.

    Parameters
    ----------
    input_shape : sequence of length 2
        Shape of the training inputs; `input_shape[1]` is the number of features.
    output_shape : sequence of length 2
        Shape of the training targets; `output_shape[1]` is the number of
        output units.

    Returns
    -------
    KerasClassifier
        Unfitted wrapper that builds the network on fit (200 epochs, batches
        of 2000 samples, silent).

    Raises
    ------
    AssertionError
        If either shape is not two-dimensional.
    """
    assert len(input_shape) == 2, "Input shape should be 2D"
    # The message used to say "Input shape" here, which pointed at the wrong argument.
    assert len(output_shape) == 2, "Output shape should be 2D"
    n_features = input_shape[1]
    output_dim = output_shape[1]

    def create_model(input_dim=n_features, output_dim=output_dim):
        """Build and compile the network: Dense(30, relu) -> Dropout -> Dense(softmax)."""
        model = Sequential()
        model.add(Dense(30, init='uniform', input_shape=(input_dim,), activation='relu'))
        model.add(Dropout(0.15))
#         model.add(Dense(output_dim, activation='sigmoid'))
        model.add(Dense(output_dim, activation='softmax'))
        # NOTE(review): a softmax output is paired with binary_crossentropy here,
        # while the categorical_crossentropy variant is commented out. With this
        # pairing the reported 'accuracy' may be an element-wise binary accuracy
        # rather than a class accuracy -- confirm before comparing scores.
#         model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
        model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
        return model

    return KerasClassifier(build_fn=create_model, nb_epoch=200, batch_size=2000, verbose=0)
    

# Model name -> factory function taking (input_shape, output_shape).
# Only the neural network is enabled for this run.
models_dict = {
#     'rf': create_RF,
#     'et': create_ET,
#     'gb': create_GB,
    'nn0': create_NN0
}

In [113]:
# Array form of the target labels so that they can be indexed with a list of positions.
NP_TARGET_LABELS = np.array(TARGET_LABELS)
target_labels = NP_TARGET_LABELS

# Groups of target label positions; each group becomes one labels mask.
common_groups = [
    [2, 3, 4, 5],
#     [2, 4],    
#     [2, 6, 7, 8],
#     [2, 18, 23, 12], 
#     [21, 22],
#     [2, 12, 18],
#     [2, 12, 23],
#     [2, 18, 23],
#     [18, 23, 21, 22],
#     [21, 23, 22, 4],
#     [3, 4], 
#     [22, 7, 8, 23],
#     [0, 1, 14, 15, 17]
]



def flatten(array):
    """Concatenate the items of a sequence of sequences into one flat list."""
    return [element for group in array for element in group]

# Label positions (out of 24) that are not part of any group; only used by the
# commented-out 'lm_others' mask below.
others = list(set(range(24)) - set(flatten(common_groups)))

# for i, a in enumerate(zip(TARGET_LABELS2, TARGET_LABELS)):
#     print i, a
    
# Build one labels mask per group: 'lm_<i>' -> array of target column names.
# `s` collects every label position covered by a group.
s = set({})
labels_masks_dict = {}
for i, g in enumerate(common_groups):
    print 'lm_%i' % i, " <=> ", g, "<==>", TARGET_LABELS2[g]
    labels_masks_dict['lm_%i' % i] = target_labels[g]
    s |= set(g)
    
# print 'lm_others', "<=>", others, "<==>", TARGET_LABELS2[others]
# labels_masks_dict['lm_others'] = target_labels[others]
# s |= set(others)

# assert len(s) == len(target_labels), "Sum is not equal 24, s=%i" % s
print labels_masks_dict

lm_0  <=>  [2, 3, 4, 5] <==> ['Current Accounts' 'Derivada Account' 'Payroll Account' 'Junior Account']
{'lm_0': array(['ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1',
       'ind_ctju_fin_ult1'], 
      dtype='|S17')}


In [114]:
# {model_name: [(samples_mask_code, features_mask_name, labels_mask_name), ...]}
# Optional per-model pipelines; left empty for this run (all entries are commented out).
models_pipelines = {
    #'gb' : [(None, None, key) for key in labels_masks_dict if len(labels_masks_dict[key]) > 1],
    #'rf' : [(None, None, key) for key in labels_masks_dict if len(labels_masks_dict[key]) > 1],
    #'et' : [(None, None, key) for key in labels_masks_dict if len(labels_masks_dict[key]) > 1],
}
models_pipelines

{}

In [115]:
from trainval import train_all, predict_all, probas_to_indices, score_estimators
from utils import map7_score0

In [121]:
# Size of the first subset; the second subset gets half as many rows.
# ll = 110000
ll = 1100

# First subset: rows whose index label is among the first `ll` labels of X.
# The same boolean mask is applied to Y and clients_last_choice.
mask = X.index.isin(X.index[:ll])

X1 = X[mask]
Y1 = Y[mask]
clc1 = clients_last_choice[mask]
print X1.shape, Y1.shape, clc1.shape

# Second subset: the next `ll // 2` index labels, used below for scoring and prediction.
mask = X.index.isin(X.index[ll:ll+ll//2])
X2 = X[mask]
Y2 = Y[mask]
clc2 = clients_last_choice[mask]
print X2.shape, Y2.shape, clc2.shape

(1100, 75) (1100, 53) (1100, 27)
(550, 75) (550, 53) (550, 27)


In [122]:
# res = Y1[labels_masks_dict['lm_0']].apply(dummies_to_decimal, axis=1)
# res = pd.get_dummies(res)
# res.head()

In [123]:
from utils import dummies_to_decimal
from sklearn.preprocessing import StandardScaler


def prepare_to_fit(X_train, Y_train):
    """Turn training frames into arrays for fitting.

    The feature values are standardised with a freshly fitted StandardScaler.
    Each row of `Y_train` is passed through `dummies_to_decimal` and the
    resulting codes are one-hot encoded with `pd.get_dummies`.

    Returns the pair `(x_train, y_train)` of arrays.
    """
    scaled_features = StandardScaler().fit_transform(X_train.values)
    class_codes = Y_train.apply(dummies_to_decimal, axis=1)
    one_hot_targets = pd.get_dummies(class_codes).values
    return scaled_features, one_hot_targets


def prepare_to_test(X_val, Y_val=None):
    """Turn validation frames into arrays for prediction or scoring.

    The feature values are standardised with a StandardScaler fitted on
    `X_val` itself. When `Y_val` is given, each of its rows is passed through
    `dummies_to_decimal` and the codes are one-hot encoded with
    `pd.get_dummies`; otherwise the second returned value is None.

    NOTE(review): the scaler and the one-hot columns are derived from the
    validation data alone, independently of `prepare_to_fit` -- confirm this
    is intended.

    Returns the pair `(x_val, y_val)`.
    """
    scaled_features = StandardScaler().fit_transform(X_val.values)
    if Y_val is None:
        return scaled_features, None
    class_codes = Y_val.apply(dummies_to_decimal, axis=1)
    one_hot_targets = pd.get_dummies(class_codes).values
    return scaled_features, one_hot_targets


def probas_to_labels_probas(y_probas, class_indices, labels):
    """Spread per-class probabilities over the individual labels.

    For every row of `y_probas` with a positive sum, each class probability is
    multiplied by the 0/1 pattern that `decimal_to_dummies` returns for the
    matching entry of `class_indices`, and the weighted patterns are summed.
    Rows whose probabilities do not sum to a positive value stay all zero.

    Returns an array of shape `(len(y_probas), len(labels))`.
    """
    n_labels = len(labels)
    labels_probas = np.zeros((len(y_probas), n_labels))
    for row, class_probas in enumerate(y_probas):
        if not np.sum(class_probas) > 0:
            continue
        weighted_pattern = np.zeros((n_labels,))
        for class_index, class_proba in zip(class_indices, class_probas):
            pattern = decimal_to_dummies(class_index, n_labels)
            weighted_pattern += class_proba * np.array([float(bit) for bit in pattern])
        labels_probas[row, :] = weighted_pattern
    return labels_probas
    

In [124]:
# Shared keyword arguments passed to train_all, score_estimators, predict_all
# and cross_val_score.
_kwargs = {
    'samples_masks_list': samples_masks_list, 
    'features_masks_dict': features_masks_dict, 
    'labels_masks_dict': labels_masks_dict, 
    'models_dict': models_dict,
    'labels': target_labels,
    'transform_proba_func': probas_to_indices,
    # The custom preparation hooks defined above are currently disabled.
#     'prepare_to_fit_func': prepare_to_fit,
#     'prepare_to_test_func': prepare_to_test,   
#     'probas_to_labels_probas_func': probas_to_labels_probas,
    'threshold': 0.15,
    'n_highest': 7,
    'mode': 'sum',
    'verbose': False,
    'models_pipelines': models_pipelines,
    'return_probas': True
}

In [125]:
# Fit the configured models on the first subset.
estimators = train_all(X1, Y1, **_kwargs)

#print estimators

INFO:root:-- Train all --
INFO:root:-- Process : sample_mask=186/1100, features_mask=fm4, labels_mask=lm_0
INFO:root:--- Score : model='nn0', fit accuracy : 0.829301
INFO:root:-- Process : sample_mask=186/1100, features_mask=fm5, labels_mask=lm_0
INFO:root:--- Score : model='nn0', fit accuracy : 0.776882
INFO:root:-- Process : sample_mask=186/1100, features_mask=fm3, labels_mask=lm_0
INFO:root:--- Score : model='nn0', fit accuracy : 0.813172
INFO:root:-- Process : sample_mask=186/1100, features_mask=fm0, labels_mask=lm_0
INFO:root:--- Score : model='nn0', fit accuracy : 0.978495
INFO:root:-- Process : sample_mask=186/1100, features_mask=fm1, labels_mask=lm_0
INFO:root:--- Score : model='nn0', fit accuracy : 0.810484
INFO:root:-- Process : sample_mask=86/1100, features_mask=fm4, labels_mask=lm_0
INFO:root:--- Score : model='nn0', fit accuracy : 0.875000
INFO:root:-- Process : sample_mask=86/1100, features_mask=fm5, labels_mask=lm_0
INFO:root:--- Score : model='nn0', fit accuracy : 0.796

In [126]:
# Group the values stored at e[2] by the key stored at e[0][2].
# presumably e[0][2] is the model name and e[2] the fit accuracy -- verify against train_all
accuracies = defaultdict(list)
for e in estimators:
    accuracies[e[0][2]].append(e[2])

# Average the collected values per key.
mean_accuracy = {}
for key in accuracies:
    accuracy_list = accuracies[key]
    mean_accuracy[key] = sum(accuracy_list)/len(accuracy_list)
    
mean_accuracy

{'nn0': 0.8513565838336945}

In [127]:
# Score every fitted estimator on the second subset; the scores are logged.
_ = score_estimators(estimators, X2, Y2, **_kwargs)

INFO:root:-- Score : model=nn0, features_mask=fm4, labels_mask=lm_0 -> 0.740000009537
INFO:root:-- Score : model=nn0, features_mask=fm5, labels_mask=lm_0 -> 0.7586363554
INFO:root:-- Score : model=nn0, features_mask=fm3, labels_mask=lm_0 -> 0.763636350632
INFO:root:-- Score : model=nn0, features_mask=fm0, labels_mask=lm_0 -> 0.925909101963
INFO:root:-- Score : model=nn0, features_mask=fm1, labels_mask=lm_0 -> 0.774999976158
INFO:root:-- Score : model=nn0, features_mask=fm4, labels_mask=lm_0 -> 0.787727296352
INFO:root:-- Score : model=nn0, features_mask=fm5, labels_mask=lm_0 -> 0.775454521179
INFO:root:-- Score : model=nn0, features_mask=fm3, labels_mask=lm_0 -> 0.7909091115
INFO:root:-- Score : model=nn0, features_mask=fm0, labels_mask=lm_0 -> 0.930000007153
INFO:root:-- Score : model=nn0, features_mask=fm1, labels_mask=lm_0 -> 0.837272703648


In [128]:
# Predict on the second subset; probabilities are returned as well ('return_probas' is True).
y_preds, Y_probas = predict_all(estimators, X2, **_kwargs)
#print y_preds[:5]

INFO:root:-- Predict all --


In [129]:
# First predictions and the matching rows of the probability frame.
print y_preds[:5]
Y_probas.head()

[[4, 2] [4, 2] [4, 2] [2, 4] [2, 4]]


Unnamed: 0,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
1088858,0.0,0.0,0.440133,0.059859,0.441169,0.058839,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1088898,0.0,0.0,0.421584,0.007819,0.561388,0.009208,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1088819,0.0,0.0,0.288252,0.004119,0.701721,0.005908,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1088788,0.0,0.0,0.724943,0.00558,0.260354,0.009122,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1088364,0.0,0.0,0.594109,0.011126,0.380816,0.013949,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [102]:
#labels_masks_dict['lm_0'], common_groups[0]

In [103]:
# Convert the target columns of the second subset with targets_str_to_indices,
# to be compared with y_preds below.
y_val = targets_str_to_indices(Y2[target_labels].values)
# y_val = targets_str_to_indices(Y2[labels_masks_dict['lm_0']].values, index_map=common_groups[0])
#print y_val[:100]
#print y_preds[:100]

In [104]:
#np.unique(y_preds)

In [105]:
# Score the ground truth against itself first, as the reference "max" score,
# then score the actual predictions with the same last-choice columns.
logging.info("- Compute max map7 score")
map7_score(y_val, y_val, clc2[LC_TARGET_LABELS].values)
# map7_score0(y_val, y_val)
logging.info("- Compute map7 score")
map7_score(y_val, y_preds, clc2[LC_TARGET_LABELS].values)
# map7_score0(y_val, y_preds)

INFO:root:- Compute max map7 score
INFO:root:-- Predicted map7 score: 0.0454545454545
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.00272727272727


0.0027272727272727275


On columns lm_0=['ind_cco_fin_ult1', 'ind_cder_fin_ult1', 'ind_cno_fin_ult1','ind_ctju_fin_ult1']

- Feature mask: all : 'fm6', 'fm4', 'fm5', 'fm2', 'fm3', 'fm0', 'fm1'
- threshold = 0.0

Model | Map@7 | Max Map@7 | Labels mask | Samples mask
--- | --- | --- | --- | ---
et | 0.007074370518592629 | 0.0075787893947 | lm_0 | all 



- Feature mask: fm0, fm1, fm3, fm4, fm5
- threshold = 0.0

Model | Map@7 | Max Map@7 | Labels mask | Samples mask
--- | --- | --- | --- | ---
rf + et + gb | 0.006920126730031681 | 0.0075787893947 | lm_0 | all
rf | 0.0068805235951309 | 0.0075787893947 | lm_0 | all
et | 0.006936801734200433 | 0.0075787893947 | lm_0 | all 
gb | 0.0068805235951309 | 0.0075787893947 | lm_0 | all


- Feature mask: fm0, fm1

Model | Map@7 | Max Map@7 | Labels mask | Samples mask
--- | --- | --- | --- | ---
rf + et + gb | 0.004627313656828414 | 0.0075787893947 | lm_0 | all
rf | 0.004664832416208104 | 0.0075787893947 | lm_0 | all
et | 0.004952476238119059 | 0.0075787893947 | lm_0 | all 
gb | 0.004489744872436218 | 0.0075787893947 | lm_0 | all

- Features mask: fm0

Model | Map@7 | Max Map@7 | Labels mask | Samples mask
--- | --- | --- | --- | ---
rf + et + gb | 0.0021010505252626313 | 0.0075787893947 | lm_0 | all
rf + et | 0.001950975487743872 | 0.0075787893947 | lm_0 | all
rf | 0.001550775387693847 | 0.0075787893947 | lm_0 | all
gb | 0.0013006503251625813 | 0.0075787893947 | lm_0 | all
et | 0.0017008504252126063 | 0.0075787893947 | lm_0 | all
et |  0.0014007003501750874 | 0.0075787893947 | lm_0 | x>0 or y>0
rf |  0.0008254127063531766 | 0.0075787893947 | lm_0 | .

--------------------------------------

0.021295269099703414 (GB on 'all')

0.021271936353906683 (RF tuning)

0.021668245671284416 (RF tuning)

0.02136609107928888

0.0211362663776694

In [74]:
# print labels_masks_dict[estimators[0][0][1]]
# print estimators[0][1].classes_
# print estimators[0][1].n_classes_
# print estimators[0][1].n_features_
# print estimators[0][1].n_outputs_
# print estimators[0][1].estimators_

In [816]:
from utils import targets_to_labels, targets_indices_to_labels, remove_last_choice

In [817]:
# Count, per product, the added products of clients for whom none of the
# added products was predicted; the first few such cases are printed.
limit = 25
count = 0

not_predicted_predicted = defaultdict(int)
for last_choice, targets, products, proba in zip(clc2[LC_TARGET_LABELS].values, y_val, y_preds, Y_probas.values):
    added_products = remove_last_choice(targets, last_choice)
    predictions = remove_last_choice(products, last_choice)
#     print "---", count, last_choice
#     print targets, '->', added_products
#     print products, '->', predictions
#     if count == 3:
#         break
    
    # Nothing was added for this client: nothing to predict.
    if len(added_products) == 0:
        continue
        
    # At least one added product was predicted: not a miss.
    if len(set(added_products) & set(predictions)) > 0:
#         print "Predicted : ", added_products, predictions
#         print set(added_products) & set(predictions)
        continue

    count += 1
    # NOTE(review): `count` is incremented before this test, so at most
    # `limit - 1` cases are printed -- confirm whether `<=` was intended.
    if count < limit:
        print "--- Count = ", count
        print targets_indices_to_labels(added_products, TARGET_LABELS2)#, targets_indices_to_labels(targets, TARGET_LABELS2)
        print targets_indices_to_labels(predictions, TARGET_LABELS2)#, targets_indices_to_labels(products, TARGET_LABELS2)#, proba
    
    for p in added_products:
        not_predicted_predicted[TARGET_LABELS2[p]] += 1
    

In [818]:
# Per-product miss counts, followed by the number of evaluated rows.
print not_predicted_predicted, y_val.shape[0]

defaultdict(<type 'int'>, {}) 39980


In [None]:
#print y_probas[:10, target_groups[0]]
#print Y[np.array(TARGET_LABELS)[target_groups[0]]].head(10)

### Run KFold Cross-validation 

In [52]:
from trainval import cross_val_score0, cross_val_score

In [823]:
# Unitary run
nb_folds = 5
results = cross_val_score((X, Y, clients_last_choice[LC_TARGET_LABELS].values), nb_folds=nb_folds, **_kwargs)

print "Cross-Validation \n %i | %f | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), np.median(results), results.max(), results.std())


INFO:root:- Cross validation : 
INFO:root:

		-- Fold : 1 / 5

INFO:root:-- Train all --
INFO:root:-- Process : sample_mask=119984/119984, features_mask=fm6, labels_mask=lm_0
INFO:root:--- Score : model='et', fit accuracy : 0.934575
INFO:root:-- Process : sample_mask=119984/119984, features_mask=fm4, labels_mask=lm_0
INFO:root:--- Score : model='et', fit accuracy : 0.955102
INFO:root:-- Process : sample_mask=119984/119984, features_mask=fm5, labels_mask=lm_0
INFO:root:--- Score : model='et', fit accuracy : 0.802824
INFO:root:-- Process : sample_mask=119984/119984, features_mask=fm2, labels_mask=lm_0
INFO:root:--- Score : model='et', fit accuracy : 0.938317
INFO:root:-- Process : sample_mask=119984/119984, features_mask=fm3, labels_mask=lm_0
INFO:root:--- Score : model='et', fit accuracy : 0.802690
INFO:root:-- Process : sample_mask=119984/119984, features_mask=fm0, labels_mask=lm_0
INFO:root:--- Score : model='et', fit accuracy : 0.999650
INFO:root:-- Process : sample_mask=119984/11998

Cross-Validation 
 5 | 0.003763 | 0.004522 | 0.004101 | 0.005990 | 0.00080 


In [61]:
from itertools import combinations
# CV on various combinations :

# Search grid consumed by BruteForceSearchCV().

# Sample masks: either the string 'all' or a callable taking two frames (x, y)
# and returning a boolean row selector built from their 'targets_diff' columns.
# Commented entries are alternatives kept for quick toggling.
_samples_masks_list = [
    'all',
#     lambda x, y:  ~(x['targets_diff'].isin([0])), 
#     lambda x, y:  x['targets_diff'] > 0, 
#     lambda x, y:  x['targets_diff'] < 0, 
    lambda x, y:  ~(x['targets_diff'].isin([0])) | ~(y['targets_diff'].isin([0])), 
    lambda x, y:  (x['targets_diff'] > 0) | (y['targets_diff'] > 0), 
#     lambda x, y:  (x['targets_diff'] < 0) | (y['targets_diff'] < 0), 
#     lambda x, y:  (y['targets_diff'] > 0), 
#     lambda x, y:  y['targets_diff'] < 0, 
]

# Feature masks: name -> list of column names.
# Every value is a plain Python list; 'fm6' is converted with .tolist() so it
# has the same type as the other masks (as 'fm0' already does).
_features_masks_dict = {
    'fm0': features + target_features + TARGET_LABELS_FRQ.tolist() + TARGET_LABELS_FRQ_PREV,
    'fm1': ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
    'fm2': target_features,
    'fm3': ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
    'fm4': ['pais_residencia', 'sexo', 'age', 'renta', 'targets_logdiff', 'targets_logcount2_diff','targets_logcount2','targets_logcount1'],
    'fm5': ['nomprov', 'ind_nuevo', 'renta', 'ind_actividad_cliente', 'canal_entrada'],
    'fm6': TARGET_LABELS_FRQ.tolist(),
}

# Model factories: short name -> callable creating the estimator.
_models_dict = {
    'rf': create_RF,
    'et': create_ET,
    'gb': create_GB,
}

# Label masks: only 'lm_0' from the notebook-level labels_masks_dict is searched.
_labels_masks_dict = {
    'lm_0': labels_masks_dict['lm_0']
}

# Number of folds used by every cross-validation run of the search.
nb_folds = 5

def BruteForceSearchCV():
    
    def get_models_combinations(items):
        combins = list(combinations(items, 1))
        combins += list(combinations(items, len(items)))
        return combins
    
    def get_combinations(items):
        combins = list(combinations(items, 1))
        for i in range(2, len(items)+1):
            combins += list(combinations(items, i))
        return combins
    
    def get_items(items):
        out = [[items[0],], ]
        for i in items[1:]:
            tmp = list(out[-1])
            tmp.append(i)
            out.append(tmp)        
        return out

    
    _labels_masks_combinations = get_items(sorted(_labels_masks_dict.keys()))
    _features_masks_combinations = get_items(sorted(_features_masks_dict.keys()))
    _models_combinations = get_models_combinations(_models_dict.keys())

    # Very big loop:
    for lm_keys in _labels_masks_combinations:
        __labels_masks_dict = {}
        for lm_key in lm_keys:
            __labels_masks_dict[lm_key] = _labels_masks_dict[lm_key]

        for i, sm in enumerate(_samples_masks_list):
            __samples_masks_list = [sm]
              
            for fm_keys in _features_masks_combinations:
                __features_masks_dict = {}
                for fm_key in fm_keys:
                    __features_masks_dict[fm_key] = _features_masks_dict[fm_key]
                    
                for m_keys in _models_combinations:
                    __models_dict = {}
                    for m_key in m_keys:
                        __models_dict[m_key] = _models_dict[m_key]
                    
                    print "\n\n---------------------------------------------------------------" 
                    print "--- PROCESS : ", __labels_masks_dict.keys(), i, __features_masks_dict.keys(), __models_dict.keys()
                    print "---------------------------------------------------------------\n" 
                    
                    __kwargs = {
                        'samples_masks_list': __samples_masks_list, 
                        'features_masks_dict': __features_masks_dict, 
                        'labels_masks_dict': __labels_masks_dict, 
                        'models_dict': __models_dict,
                        'labels': target_labels,
                        'transform_proba_func': probas_to_indices,
                        'prepare_to_fit_func': prepare_to_fit,
                        'prepare_to_test_func': prepare_to_test,   
                        'probas_to_labels_probas_func': probas_to_labels_probas,
                        'threshold': 0.0,
                        'n_highest': 7,
                        'mode': 'sum',
                        'verbose': False,
                        'return_probas': True
                    }
                    #  DEBUG : results = cross_val_score((X1, Y1, clc1[LC_TARGET_LABELS].values), nb_folds=nb_folds, **__kwargs)
                    results = cross_val_score((X, Y, clients_last_choice[LC_TARGET_LABELS].values), nb_folds=nb_folds, **__kwargs)
                    print "=> CV : ", results

In [None]:
# Silence INFO-level logging while the brute-force search runs; the search
# prints its own progress. try/finally guarantees the INFO level is restored
# even if the search raises or is interrupted from the keyboard.
logging.getLogger().setLevel(logging.WARNING)
try:
    BruteForceSearchCV()
finally:
    logging.getLogger().setLevel(logging.INFO)



---------------------------------------------------------------
--- PROCESS :  ['lm_0'] 0 ['fm0'] ['et']
---------------------------------------------------------------




### 201505 -> 201605 

Cross-Validation 
 5 | 0.014585 | 0.018385 | 0.019147 | 0.022227 | 0.00294 

Compute cross-validation across several months

In [None]:
nb_folds = 3
yms = [201504, 201505]
#yms = [201505]

for ym in yms:
    logging.info("\n-------------------------")
    logging.info("- Process month : %s" % ym)
    logging.info("-------------------------\n")
    
    ym1 = ym + 100    
    df1 = train_df if months_ym_map[ym] in train_months else val_df
    df2 = train_df if months_ym_map[ym1] in train_months else val_df
    X, Y, clients_last_choice = get_XY(ym, df1, ym1, df2) 
    results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                                profiles=profiles,
                                nb_folds=nb_folds)
    print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

## Train model for predictions

In [None]:
# Training pair: features from `current_month`, targets one year later (YYYYMM + 100).
current_month = 201505
next_year_month = current_month + 100

# Both months are read from train_df.
# Alternatives kept for toggling: use val_df for either side, or
# `train_df if months_ym_map[next_year_month] in train_months else val_df` for df2.
df1 = df2 = train_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [None]:
# Fit the estimators on (X, Y) using the configuration held in `_kwargs`.
estimators = train_all(X, Y, **_kwargs)

In [None]:
# Predict with the fitted estimators on the current X; predict_all returns
# two values, bound here to `y_preds` and `Y_probas`.
y_preds, Y_probas = predict_all(estimators, X, **_kwargs)

Check the score on the 2016/05 data

In [None]:
logging.info("- Compute map7 score")
print map7_score(y_val, y_preds, clients_last_choice[LC_TARGET_LABELS].values)
logging.info("- Compute max map7 score")
print map7_score(y_val, y_val, clients_last_choice[LC_TARGET_LABELS].values)

## Prediction for 2016/06

In [None]:
from dataset import load_train_test

In [None]:
# Load the full training frame and the test frame; [201506] is the list of
# year-months requested from the loader.
full_train_df, test_df = load_train_test([201506])

In [None]:
# Preview the first rows of the full training frame.
full_train_df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Distinct 'fecha_dato' values present in each frame.
full_train_months = full_train_df['fecha_dato'].unique()
test_months = test_df['fecha_dato'].unique()

# Map each year-month key (as produced by to_yearmonth) to its 'fecha_dato'
# value, over the union of the training and test months.
months = list(set(full_train_months) | set(test_months))
months_ym_map = dict((to_yearmonth(month), month) for month in months)

In [None]:
# Test pair: features from 2015/06 in the training frame, target month one
# year later (YYYYMM + 100) taken from the test frame.
current_month = 201506
next_year_month = current_month + 100

df1, df2 = full_train_df, test_df
# The second returned value (targets) is discarded here.
X, _, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [None]:
print X.shape, test_df.shape

In [None]:
# Preview the first 10 rows of the feature matrix.
X.head(10)

In [None]:
# Preview the first 10 rows of the clients' last-choice frame.
clients_last_choice.head(10)

In [None]:
def get_submission(predicted_added_products, clients, clc, target_labels):
    """Build the submission frame from per-client predictions.

    Args:
        predicted_added_products: iterable with one entry of predicted product
            indices per client.
        clients: client ids, stored in the 'ncodpers' column.
        clc: iterable aligned with `predicted_added_products`; each entry is
            passed to `remove_last_choice` together with the prediction
            (presumably the client's last known products - verify).
        target_labels: sequence mapping a product index to its label.

    Returns:
        pandas.DataFrame with columns ['ncodpers', 'added_products'], where
        'added_products' holds the space-separated labels of the products kept
        by `remove_last_choice`.
    """
    joined_labels = []
    pairs = zip(predicted_added_products, clc)
    for n_done, (products, last_choice) in enumerate(pairs, 1):
        kept = remove_last_choice(products, last_choice)
        joined_labels.append(' '.join(target_labels[idx] for idx in kept))
        # Progress report every 100000 clients.
        if n_done % 100000 == 0:
            logging.info("Elapsed : %i", n_done)

    return pd.DataFrame(data={'ncodpers': clients, 'added_products': joined_labels},
                        columns=['ncodpers', 'added_products'])

In [None]:
# Predict on the test feature matrix with the previously trained estimators.
y_preds, Y_probas = predict_all(estimators, X, **_kwargs)

logging.info("- Get submission dataframe:")
clients = X['ncodpers'].values
# Use the predictions computed just above (`y_preds`); the former `y_pred` is
# not assigned in this cell and would either fail or reuse stale predictions.
# NOTE(review): other cells select last-choice columns with LC_TARGET_LABELS,
# this one uses TARGET_LABELS - confirm which columns clients_last_choice has.
submission = get_submission(y_preds, clients, clients_last_choice[TARGET_LABELS].values, TARGET_LABELS)

In [None]:
# Unfinished cell: iterates over the trained estimators and unpacks their
# identifying names, but nothing is appended to `selected_estimators` yet.
# TODO: implement the selection criterion (the commented lines below are stubs).
selected_estimators = []
for e in estimators:
    # estimators = [([features_mask_name, labels_mask_name, model_name], estimator_object, accuracy), ...]
    features_mask_name, labels_mask_name, model_name = e[0]
#     if features_masks_dict[features_mask_name]
#     if e[0]

In [None]:
# True when every column of the selected feature mask exists in test_df.
# (`<list> in test_df.columns` does not test this: Index membership expects a
# single hashable label, so a list of names cannot be checked that way.)
set(features_masks_dict[features_mask_name]).issubset(test_df.columns)

In [None]:
# Make sure every test client appears in the submission: clients without a
# prediction are appended with an empty 'added_products' value.
submission_clients = set(submission['ncodpers'].unique())
test_clients = set(test_df['ncodpers'].unique())
if submission_clients != test_clients:
    # Sorted so the appended rows come out in a deterministic order.
    missing_clients = sorted(test_clients - submission_clients)
    missing_clients_mask = test_df['ncodpers'].isin(missing_clients)

    # Test rows of the missing clients, kept for inspection.
    X1 = test_df[missing_clients_mask]

    # 'added_products' is a column of space-separated labels, so "no product"
    # is the empty string (np.zeros would write the float 0.0 into the CSV).
    missing_added_products = [''] * len(missing_clients)
    submission = pd.concat([submission, 
                            pd.DataFrame(data={
                                'ncodpers': missing_clients, 
                                'added_products': missing_added_products
                            }, columns=['ncodpers', 'added_products'])],
                           ignore_index=True)  # avoid duplicated index labels

Get the submission DataFrame and write it to a CSV file

In [None]:
# Sanity check before writing: print the submission's shape and display its first rows.
print submission.shape
submission.head()

In [None]:
from datetime import datetime
import csv

logging.info('- Generate submission')

# File name carries a minute-resolution timestamp of when it was written.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
submission_file = ''.join(['../results/submission_', timestamp, '.csv'])

# Write without the DataFrame index.
submission.to_csv(submission_file, index=False, index_label=False)

In [None]:
with open('../results/submission_2016-11-17-16-37.csv', 'r') as r:
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    