# Decision trees tryouts on SPR data, inspired by Kaggle Forum "When less is more"

Load training and validation data as 
    month : [ Features | Targets| Difference | Last Choice Targets  ]
    

In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
# Enable the autoreload extension in mode 2 so that edits to imported modules are picked
# up before each cell executes, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd

import logging
# Remove any handlers already attached to the root logger: basicConfig() is a no-op when
# the root logger has handlers, so this makes the INFO level below take effect on re-runs.
logging.getLogger().handlers = []
logging.basicConfig(level=logging.INFO)

import matplotlib.pylab as plt
%matplotlib inline



In [3]:
import sys
sys.path.append("../common")

from dataset import load_trainval, LC_TARGET_LABELS, TARGET_LABELS_FRQ, TARGET_LABELS_DIFF
from utils import to_yearmonth, TARGET_LABELS, TARGET_LABELS2
from utils import target_str_to_labels, decimal_to_dummies, targets_str_to_indices, targets_dec_to_indices

In [4]:
# Client profile columns used as model inputs (one per line to keep diffs readable).
features = [
    u'ind_empleado',
    u'pais_residencia',
    u'sexo',
    u'age',
    u'ind_nuevo',
    u'antiguedad',
    u'indrel',
    u'ult_fec_cli_1t',
    u'indrel_1mes',
    u'tiprel_1mes',
    u'indresi',
    u'indext',
    u'conyuemp',
    u'canal_entrada',
    u'indfall',
    u'nomprov',
    u'ind_actividad_cliente',
    u'renta',
    u'segmento',
]

In [5]:
# Year-months (YYYYMM) to train on. Per the loader log printed by this cell, the month
# preceding each listed one is loaded as well.
# train_yearmonths_list = [201504, 201505, 201604]
train_yearmonths_list = [201505, 201602, 201605]
# train_yearmonths_list = [201505]
#val_yearmonth = [201605]
# Number of clients to select (the log reports "Select 150000 clients").
train_nb_clients = 150000
# train_nb_clients = 1500
# The validation frame is not loaded here, so `val_df` does not exist in this notebook.
#train_df, val_df = load_trainval(train_yearmonths_list, val_yearmonth, train_nb_clients, val_nb_clients=1500)
train_df = load_trainval(train_yearmonths_list, train_nb_clients=train_nb_clients)

INFO:root:- Load training data : 
INFO:root:- Load data : [201504, 201505, 201601, 201602, 201604, 201605]
INFO:root:-- Select 150000 clients
INFO:root:- Number of lines with unknown data : 24
INFO:root:- Number of columns with nan : 9
INFO:root:-- Process date : 201505
INFO:root:-- Process date : 201602
INFO:root:-- Process date : 201605
INFO:root:-- Add logCount columns
INFO:root:-- Process month : 2015-04-28
INFO:root:-- Process month : 2015-05-28
INFO:root:-- Process month : 2016-01-28
INFO:root:-- Process month : 2016-02-28
INFO:root:-- Process month : 2016-04-28
INFO:root:-- Process month : 2016-05-28
INFO:root:-- Add logDecimal columns
INFO:root:-- Transform age/renta/logdiff
INFO:root:-- Add target values frequencies
INFO:root:-- Add target diff


Display loaded data

In [6]:
# Show date, client id and the `*_frq` target columns for the first 10 rows.
train_df[['fecha_dato', 'ncodpers'] + TARGET_LABELS_FRQ.tolist()].head(10)

Unnamed: 0,fecha_dato,ncodpers,ind_ahor_fin_ult1_frq,ind_aval_fin_ult1_frq,ind_cco_fin_ult1_frq,ind_cder_fin_ult1_frq,ind_cno_fin_ult1_frq,ind_ctju_fin_ult1_frq,ind_ctma_fin_ult1_frq,ind_ctop_fin_ult1_frq,...,ind_hip_fin_ult1_frq,ind_plan_fin_ult1_frq,ind_pres_fin_ult1_frq,ind_reca_fin_ult1_frq,ind_tjcr_fin_ult1_frq,ind_valo_fin_ult1_frq,ind_viv_fin_ult1_frq,ind_nomina_ult1_frq,ind_nom_pens_ult1_frq,ind_recibo_ult1_frq
210118,2015-04-28,15897,0.999877,0.999974,0.771917,0.999471,0.102862,0.988494,0.989508,0.165461,...,0.992379,0.012021,0.996619,0.067668,0.05683,0.03342,0.994801,0.93296,0.072066,0.159387
1051666,2015-05-28,15897,0.999877,0.999974,0.771917,0.999471,0.102862,0.988494,0.989508,0.165461,...,0.992379,0.012021,0.996619,0.067668,0.05683,0.03342,0.994801,0.93296,0.927934,0.159387
1638553,2016-01-28,15897,0.999877,0.999974,0.771917,0.999471,0.102862,0.988494,0.989508,0.165461,...,0.992379,0.012021,0.996619,0.067668,0.05683,0.03342,0.994801,0.93296,0.927934,0.159387
2663084,2016-02-28,15897,0.999877,0.999974,0.771917,0.999471,0.102862,0.988494,0.989508,0.165461,...,0.992379,0.012021,0.996619,0.067668,0.05683,0.03342,0.994801,0.93296,0.072066,0.159387
3532963,2016-04-28,15897,0.999877,0.999974,0.228083,0.999471,0.102862,0.988494,0.989508,0.165461,...,0.992379,0.012021,0.996619,0.067668,0.05683,0.03342,0.994801,0.93296,0.072066,0.159387
4338251,2016-05-28,15897,0.999877,0.999974,0.228083,0.999471,0.102862,0.988494,0.989508,0.165461,...,0.992379,0.012021,0.996619,0.067668,0.05683,0.03342,0.994801,0.93296,0.072066,0.159387
210141,2015-04-28,15920,0.999877,0.999974,0.771917,0.999471,0.102862,0.988494,0.989508,0.165461,...,0.992379,0.987979,0.996619,0.067668,0.94317,0.96658,0.994801,0.93296,0.927934,0.159387
1051644,2015-05-28,15920,0.999877,0.999974,0.771917,0.999471,0.102862,0.988494,0.989508,0.165461,...,0.992379,0.987979,0.996619,0.067668,0.94317,0.96658,0.994801,0.93296,0.927934,0.159387
1638479,2016-01-28,15920,0.999877,0.999974,0.771917,0.999471,0.897138,0.988494,0.989508,0.165461,...,0.992379,0.987979,0.996619,0.067668,0.94317,0.03342,0.994801,0.93296,0.927934,0.159387
2663097,2016-02-28,15920,0.999877,0.999974,0.771917,0.999471,0.897138,0.988494,0.989508,0.165461,...,0.992379,0.987979,0.996619,0.067668,0.94317,0.03342,0.994801,0.93296,0.927934,0.159387


Useful structures

In [7]:
def get_common_clients(df1, mask1, mask2, df2=None):
    """Flag the rows whose client id ('ncodpers') appears in both selections.

    :param df1: DataFrame with an 'ncodpers' column.
    :param mask1: boolean row selector applied to `df1`.
    :param mask2: boolean row selector applied to `df2` when it is given, else to `df1`.
    :param df2: optional second DataFrame with an 'ncodpers' column.
    :return: a boolean Series over `df1` when `df2` is None, otherwise a pair of boolean
        Series `(over df1, over df2)`. A row is True when its client id is present in
        both masked selections.
    """
    second_frame = df1 if df2 is None else df2

    ids_first = set(df1[mask1]['ncodpers'].unique())
    ids_second = set(second_frame[mask2]['ncodpers'].unique())
    shared_ids = list(ids_first & ids_second)

    in_first = df1['ncodpers'].isin(shared_ids)
    if df2 is None:
        return in_first
    return in_first, df2['ncodpers'].isin(shared_ids)

In [8]:
# Lookup table: year-month key (as produced by `to_yearmonth`) -> the matching
# 'fecha_dato' value found in the training frame.
# months = list(set(train_df['fecha_dato'].unique()) | set(val_df['fecha_dato'].unique()))
months = train_df['fecha_dato'].unique()
months_ym_map = {to_yearmonth(month_value): month_value for month_value in months}

# Distinct dates available in the training frame.
train_months = train_df['fecha_dato'].unique()
# val_months = val_df['fecha_dato'].unique()
    

### Train a model

In [9]:
from utils import get_added_products, remove_last_choice, apk, map7_score
from visualization import visualize_train_test, visualize_folds, compare_two_datasets, compare_folds, compare_folds2

In [10]:
# Columns derived from the targets that are fed to the models as extra inputs.
target_features = [
    'targets_diff',
    'targets_logdiff',
    'targets_logcount2_diff',
    'targets_logcount2',
    'targets_logcount1',
    'targets_logDec',
]

In [11]:
def get_XY(current_month, df1, next_year_month, df2, months_ym_map):
    """Build aligned inputs/targets for clients present in all required months.

    Only clients found in `df1` at `current_month` and in `df2` at both `next_year_month`
    and the month preceding it are kept; the three selections must list the clients in
    the same order (checked by assertion).

    :param current_month: year-month key (YYYYMM int) of the feature rows taken from `df1`.
    :param df1: DataFrame holding the feature month.
    :param next_year_month: year-month key (YYYYMM int) of the target rows taken from `df2`.
    :param df2: DataFrame holding the target month and the month before it.
    :param months_ym_map: dict mapping year-month keys to 'fecha_dato' values.
    :return: tuple `(X, Y, clients_last_choice)`:
        - X: feature rows from `df1`, extended with `*_frq_prev` columns taken from the
          month preceding `next_year_month` when those columns are available;
        - Y: target rows from `df2` re-indexed like X, or None when targets are missing;
        - clients_last_choice: last-choice target columns from `df2`, or None when absent.
    :raises KeyError: if one of the required months is missing from `months_ym_map`.
    """
    # Month preceding `next_year_month`. A plain `- 1` maps January (YYYY01) to the invalid
    # key YYYY00, so roll back to December of the previous year in that case.
    year, month = divmod(next_year_month, 100)
    prev_year_month = (year - 1) * 100 + 12 if month == 1 else next_year_month - 1

    month_mask = df1['fecha_dato'] == months_ym_map[current_month]
    next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
    next_year_prev_month_mask = df2['fecha_dato'] == months_ym_map[prev_year_month]
    
    # get common clients from df1 at this month and df2 at next year month
    common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)
    # restrict further to clients that also exist in df2 at the preceding month
    common_clients_mask2, common_clients_mask3 = get_common_clients(df2, common_clients_mask2 & next_year_month_mask, next_year_prev_month_mask, df2)
        
    c1 = df1[common_clients_mask1 & month_mask]['ncodpers'].values
    c2 = df2[common_clients_mask2 & next_year_month_mask]['ncodpers'].values
    c3 = df2[common_clients_mask3 & next_year_prev_month_mask]['ncodpers'].values
    assert (c1 == c2).all() and (c2 == c3).all(), "Problem with common clients" 
    
    X = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + target_features + features + TARGET_LABELS_FRQ.tolist()]            
   
    # Targets are returned only when the target columns exist and are not entirely null.
    if TARGET_LABELS[0] in df2.columns and TARGET_LABELS_DIFF[0] in df2.columns and not df2[next_year_month_mask][TARGET_LABELS].isnull().all().all():
        Y = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato', 'targets_str', 'lc_targets_str', 'targets_diff'] + TARGET_LABELS + TARGET_LABELS_DIFF.tolist()]    
        assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "There is a problem in alignment"
        # Share X's index so X and Y rows can be selected with the same labels.
        Y.index = X.index                
    else:
        Y = None
        
    if TARGET_LABELS_FRQ[0] in df2.columns and not df2[next_year_prev_month_mask][TARGET_LABELS].isnull().all().all():
        # Add TARGET_LABELS_FRQ from previous month to X:
        target_labels_frq = df2[common_clients_mask3 & next_year_prev_month_mask][['ncodpers'] + TARGET_LABELS_FRQ.tolist()]
        assert (X['ncodpers'].values == target_labels_frq['ncodpers'].values).all(), "There is a problem in alignment"
        target_labels_frq = target_labels_frq[TARGET_LABELS_FRQ]
        target_labels_frq.columns = [c + '_prev' for c in TARGET_LABELS_FRQ]
        target_labels_frq.index = X.index
        X = pd.concat([X, target_labels_frq], axis=1)        

    
    if LC_TARGET_LABELS[0] in df2.columns:
        clients_last_choice = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato', 'targets_str'] + LC_TARGET_LABELS.tolist()]
    else:
        clients_last_choice = None
        
    return X, Y, clients_last_choice


In [12]:
# Feature month and the target month one year later (YYYYMM keys).
current_month = 201505
next_year_month = current_month + 100

# `months_ym_map` is built from `train_df` only, so every month it knows is served by
# `train_df`. The former `... else val_df` fallback could never be selected and named a
# variable that is not defined in this notebook (loading `val_df` is commented out).
df1 = train_df
df2 = train_df

# Raises KeyError if either month was not loaded into `months_ym_map`.
X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [13]:
# Sanity check: X, Y and clients_last_choice must list the same clients in the same order.
assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "WTF"
assert (X['ncodpers'].values == clients_last_choice['ncodpers'].values).all(), "WTF"

In [14]:
# Size and first rows of the feature frame.
print X.shape
X.head(10)

(149988, 75)


Unnamed: 0,ncodpers,fecha_dato,targets_diff,targets_logdiff,targets_logcount2_diff,targets_logcount2,targets_logcount1,targets_logDec,ind_empleado,pais_residencia,...,ind_hip_fin_ult1_frq_prev,ind_plan_fin_ult1_frq_prev,ind_pres_fin_ult1_frq_prev,ind_reca_fin_ult1_frq_prev,ind_tjcr_fin_ult1_frq_prev,ind_valo_fin_ult1_frq_prev,ind_viv_fin_ult1_frq_prev,ind_nomina_ult1_frq_prev,ind_nom_pens_ult1_frq_prev,ind_recibo_ult1_frq_prev
1051666,15897,2015-05-28,-2.0,-1.098612,0.0,2e-06,7e-06,14.805207,1,0,...,0.992379,0.012021,0.996619,0.067668,0.05683,0.03342,0.994801,0.93296,0.072066,0.159387
1051644,15920,2015-05-28,0.0,0.0,0.0,4.6e-05,6.7e-05,14.803952,3,0,...,0.992379,0.987979,0.996619,0.067668,0.94317,0.03342,0.994801,0.93296,0.927934,0.159387
1051649,15925,2015-05-28,0.0,0.0,0.0,0.001333,0.001527,14.586878,3,0,...,0.992379,0.987979,0.996619,0.932332,0.05683,0.96658,0.994801,0.93296,0.927934,0.159387
1051651,15927,2015-05-28,0.0,0.0,0.0,2e-06,7e-06,10.424244,2,0,...,0.992379,0.012021,0.996619,0.932332,0.94317,0.96658,0.994801,0.93296,0.927934,0.159387
1051655,15932,2015-05-28,0.0,0.0,0.0,0.001372,0.001367,2.833213,0,0,...,0.992379,0.987979,0.996619,0.932332,0.94317,0.03342,0.994801,0.93296,0.927934,0.840613
1051678,15937,2015-05-28,0.0,0.0,0.0,3e-06,7e-06,14.792953,1,0,...,0.992379,0.012021,0.996619,0.067668,0.94317,0.03342,0.994801,0.93296,0.927934,0.159387
1051680,15939,2015-05-28,0.0,0.0,0.0,6.4e-05,8.7e-05,14.557312,0,0,...,0.007621,0.987979,0.996619,0.932332,0.94317,0.96658,0.994801,0.93296,0.927934,0.159387
1051700,15940,2015-05-28,0.0,0.0,0.0,2e-05,2.7e-05,14.589708,2,0,...,0.992379,0.987979,0.996619,0.932332,0.94317,0.03342,0.994801,0.93296,0.927934,0.159387
1051706,15948,2015-05-28,0.0,0.0,0.0,0.00152,0.00154,5.549076,0,0,...,0.992379,0.012021,0.996619,0.932332,0.94317,0.96658,0.994801,0.93296,0.927934,0.840613
1051712,15956,2015-05-28,0.0,0.0,0.0,0.000604,0.00056,14.601901,2,0,...,0.992379,0.987979,0.996619,0.932332,0.94317,0.96658,0.994801,0.93296,0.927934,0.159387


In [15]:
# Size of the target frame, then the first clients whose 'targets_diff' is positive.
print Y.shape
Y[Y['targets_diff'] > 0][['fecha_dato', 'ncodpers', 'targets_str', 'lc_targets_str'] + TARGET_LABELS_DIFF.tolist() ].head(10)

(149988, 53)


Unnamed: 0,fecha_dato,ncodpers,targets_str,lc_targets_str,ind_ahor_fin_ult1_diff,ind_aval_fin_ult1_diff,ind_cco_fin_ult1_diff,ind_cder_fin_ult1_diff,ind_cno_fin_ult1_diff,ind_ctju_fin_ult1_diff,...,ind_hip_fin_ult1_diff,ind_plan_fin_ult1_diff,ind_pres_fin_ult1_diff,ind_reca_fin_ult1_diff,ind_tjcr_fin_ult1_diff,ind_valo_fin_ult1_diff,ind_viv_fin_ult1_diff,ind_nomina_ult1_diff,ind_nom_pens_ult1_diff,ind_recibo_ult1_diff
1051641,2016-05-28,15988,001000000000000000100000,001000000000000000000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1051627,2016-05-28,16056,001010001000000000000110,001010001000000000000001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1051616,2016-05-28,16125,001000010000100000000001,001000000000100000000001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051805,2016-05-28,16202,001010000000010001100111,001010000000010001000111,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1051739,2016-05-28,16294,000010000000100000000111,000010000000000000000111,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051779,2016-05-28,16506,000010000000001001000111,000010000000001001000001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1051792,2016-05-28,16525,001010001000110100100111,001010000000110100100111,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051258,2016-05-28,16787,001000000000100101110111,001000000000100101110001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1051523,2016-05-28,16988,000010001000000001100011,000010001000000001000011,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1051478,2016-05-28,17236,000010000000100101110111,000010000000100101110011,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [16]:
# Size and first rows of the last-choice frame (the `lc_*` columns).
print clients_last_choice.shape
clients_last_choice.head(10)

(149988, 27)


Unnamed: 0,ncodpers,fecha_dato,targets_str,lc_ind_ahor_fin_ult1,lc_ind_aval_fin_ult1,lc_ind_cco_fin_ult1,lc_ind_cder_fin_ult1,lc_ind_cno_fin_ult1,lc_ind_ctju_fin_ult1,lc_ind_ctma_fin_ult1,...,lc_ind_hip_fin_ult1,lc_ind_plan_fin_ult1,lc_ind_pres_fin_ult1,lc_ind_reca_fin_ult1,lc_ind_tjcr_fin_ult1,lc_ind_valo_fin_ult1,lc_ind_viv_fin_ult1,lc_ind_nomina_ult1,lc_ind_nom_pens_ult1,lc_ind_recibo_ult1
4338251,15897,2016-05-28,000010010000110101110011,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
4338261,15920,2016-05-28,001000010000000001010001,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4338284,15925,2016-05-28,001000010000000000100000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4338282,15927,2016-05-28,000000001000000100000001,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4338278,15932,2016-05-28,000000000000000000010000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4338273,15937,2016-05-28,001010001000110101010001,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4338271,15939,2016-05-28,001000000000101000000001,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4338270,15940,2016-05-28,001000010001100000010001,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4338263,15948,2016-05-28,000000000000000100000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4338200,15956,2016-05-28,001000011000000000000001,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Another train/predict + CV implementation

### Input

- `X` : `[nb_samples, nb_features]` shaped pd.DataFrame
    - `features_masks_dict` : `{fm1_name: features_mask_1, fm2_name: features_mask_2, ...}` where each `features_mask_i` is a list of feature column names. The masks may overlap.
    
- `Y` : `[nb_samples, nb_labels]` shaped pd.DataFrame
    - `labels_masks_dict` : `{lm1_name: labels_mask_1, lm2_name: labels_mask_2, ...}` where each `labels_mask_i` is a list of label column names. The masks may overlap.

- `samples_masks_list` : `[samples_mask_1, samples_mask_2, ...]` where each `samples_mask_i` is a function of `(X, Y)` that returns a boolean pandas mask selecting rows. Used only for training.


- Set of models `models_dict` : dict mapping a model name to a function that creates a model, e.g. `{'rf': create_RF, 'et': create_ET, 'gb': create_GB}`


### Training phase




In [17]:
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier

In [373]:
# Row selectors: each active entry is a callable of (x, y) that returns a boolean mask
# computed from the 'targets_diff' columns of x and/or y.
samples_masks_list = [
#    lambda x:  ~(x['targets_diff'].isin([0])), 
#     lambda x, y:  x['targets_diff'] > 0, 
#     lambda x, y:  x['targets_diff'] < 0, 
    lambda x, y:  (x['targets_diff'] > 0) | (y['targets_diff'] > 0), 
    lambda x, y:  (x['targets_diff'] < 0) | (y['targets_diff'] < 0), 
#     lambda x, y:  (y['targets_diff'] > 0), 
#     lambda x, y:  y['targets_diff'] < 0, 
]

# Names of the previous-month frequency columns that get_XY appends to X.
TARGET_LABELS_FRQ_PREV = [c + '_prev' for c in TARGET_LABELS_FRQ]

# Named feature subsets. Only 'fm0', 'fm2' and 'fm5' are active; the other names are
# commented out and therefore unknown to anything that looks them up here.
features_masks_dict = {
#     'fm_all': None,
    'fm0': features + target_features + TARGET_LABELS_FRQ.tolist() + TARGET_LABELS_FRQ_PREV,
#     'fm1': ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
    'fm2': target_features,
#     'fm3': ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
#     'fm4': ['pais_residencia', 'sexo', 'age', 'renta', 'targets_logdiff', 'targets_logcount2_diff','targets_logcount2','targets_logcount1'],
    'fm5': ['nomprov', 'ind_nuevo', 'renta', 'ind_actividad_cliente', 'canal_entrada'],
#     'fm6': TARGET_LABELS_FRQ,
}

In [374]:
# https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

def create_RF(input_shape, output_shape):
    """Return a new, unfitted RandomForestClassifier.

    `input_shape` and `output_shape` are part of the factory signature but are not used
    by this model.
    """
    # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
    # Settings tried earlier and currently disabled:
    #   min_samples_split=100, min_samples_leaf=25, max_depth=10,
    #   oob_score=True, bootstrap=True
    hyper_params = {
        'n_estimators': 100,
        'max_features': 1.0,
        'n_jobs': -1,
    }
    return RandomForestClassifier(**hyper_params)

def create_ET(input_shape, output_shape):
    """Return a new, unfitted ExtraTreesClassifier with bootstrap and OOB scoring enabled.

    `input_shape` and `output_shape` are part of the factory signature but are not used
    by this model.
    """
    # Settings tried earlier and currently disabled: min_samples_leaf=25, max_depth=10
    hyper_params = {
        'n_estimators': 100,
        'max_features': 1.0,
        'oob_score': True,
        'bootstrap': True,
        'n_jobs': -1,
    }
    return ExtraTreesClassifier(**hyper_params)

def create_GB(input_shape, output_shape):
    """Return a new, unfitted GradientBoostingClassifier with 75 estimators.

    `input_shape` and `output_shape` are part of the factory signature but are not used
    by this model.
    """
    n_boosting_stages = 75
    return GradientBoostingClassifier(n_estimators=n_boosting_stages)

# Model name -> factory function taking (input_shape, output_shape) and returning a new model.
models_dict = {
    'rf': create_RF,
    'et': create_ET,
    'gb': create_GB,
}

In [375]:
# Array form of the target label names so that a list of positions can index it.
NP_TARGET_LABELS = np.array(TARGET_LABELS)
target_labels = NP_TARGET_LABELS

# Groups of target positions (indices into `target_labels`) that are modelled together.
# A position may appear in several groups.
common_groups = [
##    [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
#     [2, ],
    [2, 3, 4, 5],
    [2, 6, 7, 8],
    [2, 18, 23, 12], 
    [21, 22],
    [2, 12, 18],
    [2, 12, 23],
    [2, 18, 23],
    [18, 23, 21, 22],
    [21, 23, 22, 4],
#     [18, ],
#     [12, ],
#     [21, ],
#     [22, ],
#     [23, ],
    [3, 4], 
    [22, 7, 8, 23],
    [0, 1, 14, 15, 17]
#     [17, ],
#     [i] for i in range(24)
]



def flatten(array):
    """Concatenate the items of `array` (an iterable of iterables) into one flat list."""
    return [element for item in array for element in item]

others = list(set(range(24)) - set(flatten(common_groups)))

# for i, a in enumerate(zip(TARGET_LABELS2, TARGET_LABELS)):
#     print i, a
    
s = set({})
labels_masks_dict = {}
for i, g in enumerate(common_groups):
    print 'lm_%i' % i, " <=> ", g, "<==>", TARGET_LABELS2[g]
    labels_masks_dict['lm_%i' % i] = target_labels[g]
    s |= set(g)
print 'lm_others', "<=>", others, "<==>", TARGET_LABELS2[others]
labels_masks_dict['lm_others'] = target_labels[others]
s |= set(others)

assert len(s) == len(target_labels), "Sum is not equal 24, s=%i" % s
print labels_masks_dict

lm_0  <=>  [2, 3, 4, 5] <==> ['Current Accounts' 'Derivada Account' 'Payroll Account' 'Junior Account']
lm_1  <=>  [2, 6, 7, 8] <==> ['Current Accounts' 'Mas particular Account' 'particular Account'
 'particular Plus Account']
lm_2  <=>  [2, 18, 23, 12] <==> ['Current Accounts' 'Credit Card' 'Direct Debit' 'e-account']
lm_3  <=>  [21, 22] <==> ['Payroll' 'Pensions']
lm_4  <=>  [2, 12, 18] <==> ['Current Accounts' 'e-account' 'Credit Card']
lm_5  <=>  [2, 12, 23] <==> ['Current Accounts' 'e-account' 'Direct Debit']
lm_6  <=>  [2, 18, 23] <==> ['Current Accounts' 'Credit Card' 'Direct Debit']
lm_7  <=>  [18, 23, 21, 22] <==> ['Credit Card' 'Direct Debit' 'Payroll' 'Pensions']
lm_8  <=>  [21, 23, 22, 4] <==> ['Payroll' 'Direct Debit' 'Pensions' 'Payroll Account']
lm_9  <=>  [3, 4] <==> ['Derivada Account' 'Payroll Account']
lm_10  <=>  [22, 7, 8, 23] <==> ['Pensions' 'particular Account' 'particular Plus Account' 'Direct Debit']
lm_11  <=>  [0, 1, 14, 15, 17] <==> ['Saving Account' 'Guara

In [376]:
# {model_name: [(samples_mask_code, features_mask_name, labels_mask_name), ...]}
# Every model gets one pipeline per labels mask holding more than one label; the samples
# mask and features mask slots are left as None.
models_pipelines = {
    model_name: [
        (None, None, mask_name)
        for mask_name in labels_masks_dict
        if len(labels_masks_dict[mask_name]) > 1
    ]
    for model_name in ('gb', 'rf', 'et')
}
models_pipelines

{'et': [(None, None, 'lm_10'),
  (None, None, 'lm_11'),
  (None, None, 'lm_8'),
  (None, None, 'lm_9'),
  (None, None, 'lm_others'),
  (None, None, 'lm_0'),
  (None, None, 'lm_1'),
  (None, None, 'lm_2'),
  (None, None, 'lm_3'),
  (None, None, 'lm_4'),
  (None, None, 'lm_5'),
  (None, None, 'lm_6'),
  (None, None, 'lm_7')],
 'gb': [(None, None, 'lm_10'),
  (None, None, 'lm_11'),
  (None, None, 'lm_8'),
  (None, None, 'lm_9'),
  (None, None, 'lm_others'),
  (None, None, 'lm_0'),
  (None, None, 'lm_1'),
  (None, None, 'lm_2'),
  (None, None, 'lm_3'),
  (None, None, 'lm_4'),
  (None, None, 'lm_5'),
  (None, None, 'lm_6'),
  (None, None, 'lm_7')],
 'rf': [(None, None, 'lm_10'),
  (None, None, 'lm_11'),
  (None, None, 'lm_8'),
  (None, None, 'lm_9'),
  (None, None, 'lm_others'),
  (None, None, 'lm_0'),
  (None, None, 'lm_1'),
  (None, None, 'lm_2'),
  (None, None, 'lm_3'),
  (None, None, 'lm_4'),
  (None, None, 'lm_5'),
  (None, None, 'lm_6'),
  (None, None, 'lm_7')]}

In [390]:
# Scratch expression: this dict is only displayed, never assigned, so it does not change
# `models_pipelines`.
# NOTE(review): it names 'fm1' and 'fm3', which are commented out of `features_masks_dict`
# in this notebook — confirm they exist before assigning this to `models_pipelines`.
{
    'gb': [(None, 'fm5', _lm) for _lm in ['lm_11', 'lm_9', 'lm_others', 'lm_0', 'lm_3', 'lm_4']] + \
    [(None, 'fm2', _lm) for _lm in ['lm_11', 'lm_9', 'lm_others', 'lm_0', 'lm_3']] + \
    [(None, 'fm3', _lm) for _lm in ['lm_11', 'lm_others', 'lm_3']] + \
    [(None, 'fm0', _lm) for _lm in ['lm_11', 'lm_8', 'lm_others', 'lm_0', 'lm_2', 'lm_3', 'lm_4', 'lm_5', 'lm_6', 'lm_7']] + \
    [(None, 'fm1', _lm) for _lm in ['lm_11', 'lm_others', 'lm_3']]
}

{'gb': [(None, 'fm5', 'lm_11'),
  (None, 'fm5', 'lm_9'),
  (None, 'fm5', 'lm_others'),
  (None, 'fm5', 'lm_0'),
  (None, 'fm5', 'lm_3'),
  (None, 'fm5', 'lm_4'),
  (None, 'fm2', 'lm_11'),
  (None, 'fm2', 'lm_9'),
  (None, 'fm2', 'lm_others'),
  (None, 'fm2', 'lm_0'),
  (None, 'fm2', 'lm_3'),
  (None, 'fm3', 'lm_11'),
  (None, 'fm3', 'lm_others'),
  (None, 'fm3', 'lm_3'),
  (None, 'fm0', 'lm_11'),
  (None, 'fm0', 'lm_8'),
  (None, 'fm0', 'lm_others'),
  (None, 'fm0', 'lm_0'),
  (None, 'fm0', 'lm_2'),
  (None, 'fm0', 'lm_3'),
  (None, 'fm0', 'lm_4'),
  (None, 'fm0', 'lm_5'),
  (None, 'fm0', 'lm_6'),
  (None, 'fm0', 'lm_7'),
  (None, 'fm1', 'lm_11'),
  (None, 'fm1', 'lm_others'),
  (None, 'fm1', 'lm_3')]}

In [377]:
from trainval import train_all, predict_all, probas_to_indices, score_estimators
from utils import map7_score0

In [378]:
# Number of leading rows used for fitting; the hold-out part takes at most ll//2 of the
# rows that follow.
ll = 110000
# ll = 1100

# Boolean mask over X's rows marking the first `ll` index labels.
mask = X.index.isin(X.index[:ll])

# The same mask is applied to X, Y and clients_last_choice so the three stay row-aligned.
X1 = X[mask]
Y1 = Y[mask]
clc = clients_last_choice[mask]
print X1.shape, Y1.shape, clc.shape

# Hold-out part: the rows right after the fitting part. The split follows row order; no
# shuffling is done here.
mask = X.index.isin(X.index[ll:ll+ll//2])
X2 = X[mask]
Y2 = Y[mask]
clc2 = clients_last_choice[mask]
print X2.shape, Y2.shape, clc2.shape

(110000, 75) (110000, 53) (110000, 27)
(39988, 75) (39988, 53) (39988, 27)


In [379]:
from utils import dummies_to_decimal
from sklearn.preprocessing import StandardScaler


def prepare_to_fit(X_train, Y_train):
    """Turn training frames into arrays for fitting.

    :param X_train: DataFrame of features; its values are standardized with a
        StandardScaler fitted on this data.
    :param Y_train: DataFrame of labels; each row is reduced to a single value by
        `dummies_to_decimal`.
    :return: tuple `(x_train, y_train)` of arrays.
    """
    scaler = StandardScaler()
    x_train = scaler.fit_transform(X_train.values)
    y_train = Y_train.apply(dummies_to_decimal, axis=1).values
    return x_train, y_train


def prepare_to_test(X_val, Y_val=None):
    """Turn validation/test frames into arrays for prediction or scoring.

    :param X_val: DataFrame of features; its values are standardized with a
        StandardScaler fitted on this data.
    :param Y_val: optional DataFrame of labels; when given, each row is reduced to a
        single value by `dummies_to_decimal`.
    :return: tuple `(x_val, y_val)`; `y_val` is None when `Y_val` is None.

    NOTE(review): the scaler is refitted on the validation data instead of reusing the one
    fitted on the training data, so the two sets are scaled with different statistics —
    confirm this is intended.
    """
    x_val = StandardScaler().fit_transform(X_val.values)
    if Y_val is None:
        return x_val, None
    y_val = Y_val.apply(dummies_to_decimal, axis=1).values
    return x_val, y_val


def probas_to_labels_probas(y_probas, class_indices, labels):
    """Spread each sample's best class probability over the labels of that class.

    For every row of `y_probas` the most probable class is looked up in `class_indices`,
    decoded by `decimal_to_dummies` into one flag per label, and each flag is multiplied
    by that class' probability.

    :param y_probas: 2D array of class probabilities, one row per sample.
    :param class_indices: array mapping a column position of `y_probas` to a class code.
    :param labels: sequence of labels; only its length is used.
    :return: array of shape `(len(y_probas), len(labels))`.
    """
    n_labels = len(labels)
    labels_probas = np.zeros((len(y_probas), n_labels))
    best_classes = class_indices[np.argmax(y_probas, axis=1)]
    best_probas = np.max(y_probas, axis=1)
    for row, (class_code, proba) in enumerate(zip(best_classes, best_probas)):
        flags = decimal_to_dummies(class_code, n_labels)
        labels_probas[row, :] = np.array([proba * float(flag) for flag in flags])
    return labels_probas
    

In [380]:
# Keyword arguments shared by train_all / score_estimators / predict_all.
# NOTE(review): 'threshold', 'n_highest' and 'mode' are interpreted inside the `trainval`
# module, which is not shown here — check there before changing them.
_kwargs = dict(
    # masks selecting samples, features and labels
    samples_masks_list=samples_masks_list,
    features_masks_dict=features_masks_dict,
    labels_masks_dict=labels_masks_dict,
    # models and label names
    models_dict=models_dict,
    labels=target_labels,
    # data preparation and probability post-processing hooks
    transform_proba_func=probas_to_indices,
    prepare_to_fit_func=prepare_to_fit,
    prepare_to_test_func=prepare_to_test,
    probas_to_labels_probas_func=probas_to_labels_probas,
    # prediction settings
    threshold=0.1,
    n_highest=7,
    mode='sum',
    verbose=False,
    models_pipelines=models_pipelines,
    return_probas=True,
)

In [381]:
# Fit all configured pipelines on the fitting part of the data.
estimators = train_all(X1, Y1, **_kwargs)

#print estimators

INFO:root:-- Train all --
INFO:root:-- Process : sample_mask=6023/110000, features_mask=fm5, labels_mask=lm_10
INFO:root:--- Score : model='et', fit accuracy : 0.441972
INFO:root:-- Process : sample_mask=6023/110000, features_mask=fm5, labels_mask=lm_10
INFO:root:--- Score : model='rf', fit accuracy : 0.441972
INFO:root:-- Process : sample_mask=6023/110000, features_mask=fm5, labels_mask=lm_10
INFO:root:--- Score : model='gb', fit accuracy : 0.303171
INFO:root:-- Process : sample_mask=6023/110000, features_mask=fm5, labels_mask=lm_11
INFO:root:--- Score : model='et', fit accuracy : 0.751287
INFO:root:-- Process : sample_mask=6023/110000, features_mask=fm5, labels_mask=lm_11
INFO:root:--- Score : model='rf', fit accuracy : 0.751287
INFO:root:-- Process : sample_mask=6023/110000, features_mask=fm5, labels_mask=lm_11
INFO:root:--- Score : model='gb', fit accuracy : 0.709779
INFO:root:-- Process : sample_mask=6023/110000, features_mask=fm5, labels_mask=lm_8
INFO:root:--- Score : model='et'

In [382]:
# Group the fit accuracies by model name, then average them per model.
# Each estimator entry carries the model name at [0][2] and its fit accuracy at [2].
accuracies = defaultdict(list)
for entry in estimators:
    model_name, fit_accuracy = entry[0][2], entry[2]
    accuracies[model_name].append(fit_accuracy)

mean_accuracy = {
    name: sum(scores) / len(scores)
    for name, scores in accuracies.items()
}

mean_accuracy

{'et': 0.6738840369626834,
 'gb': 0.59930223946783467,
 'rf': 0.67389580325027343}

In [383]:
# Score every fitted estimator on the hold-out part; the scores appear in the log output,
# the return value is discarded.
_ = score_estimators(estimators, X2, Y2, **_kwargs)

INFO:root:-- Score : model=et, features_mask=fm5, labels_mask=lm_10 -> 0.374562368711
INFO:root:-- Score : model=rf, features_mask=fm5, labels_mask=lm_10 -> 0.223066920076
INFO:root:-- Score : model=gb, features_mask=fm5, labels_mask=lm_10 -> 0.241197359208
INFO:root:-- Score : model=et, features_mask=fm5, labels_mask=lm_11 -> 0.928528558568
INFO:root:-- Score : model=rf, features_mask=fm5, labels_mask=lm_11 -> 0.856006802041
INFO:root:-- Score : model=gb, features_mask=fm5, labels_mask=lm_11 -> 0.9711163349
INFO:root:-- Score : model=et, features_mask=fm5, labels_mask=lm_8 -> 0.474742422727
INFO:root:-- Score : model=rf, features_mask=fm5, labels_mask=lm_8 -> 0.283785135541
INFO:root:-- Score : model=gb, features_mask=fm5, labels_mask=lm_8 -> 0.255801740522
INFO:root:-- Score : model=et, features_mask=fm5, labels_mask=lm_9 -> 0.690007002101
INFO:root:-- Score : model=rf, features_mask=fm5, labels_mask=lm_9 -> 0.598729618886
INFO:root:-- Score : model=gb, features_mask=fm5, labels_mask

In [None]:
# Scratch expression (never executed, never assigned): a candidate list of
# (samples mask, features mask, labels mask) pipelines.
# NOTE(review): 'fm1' and 'fm3' are commented out of `features_masks_dict` in this
# notebook — confirm they exist before using this list.
[(None, 'fm5', _lm) for _lm in ['lm_11', 'lm_9', 'lm_others', 'lm_0', 'lm_3', 'lm_4']] + \
[(None, 'fm2', _lm) for _lm in ['lm_11', 'lm_9', 'lm_others', 'lm_0', 'lm_3']] + \
[(None, 'fm3', _lm) for _lm in ['lm_11', 'lm_others', 'lm_3']] + \
[(None, 'fm0', _lm) for _lm in ['lm_11', 'lm_8', 'lm_others', 'lm_0', 'lm_2', 'lm_3', 'lm_4', 'lm_5', 'lm_6', 'lm_7']] + \
[(None, 'fm1', _lm) for _lm in ['lm_11', 'lm_others', 'lm_3']]

In [384]:
# Predict on the hold-out part; with 'return_probas': True in _kwargs the call returns
# both the predictions and a per-label probability frame.
y_preds, Y_probas = predict_all(estimators, X2, **_kwargs)
#print y_preds[:5]

INFO:root:-- Predict all --


In [385]:
# First rows of the per-label probabilities (one column per target label).
Y_probas.head()

Unnamed: 0,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
670965,0.0,0.0,0.185343,0.0,0.069844,0.0,0.0,0.097959,0.105646,0.0,...,0.011159,0.011159,0.004247,0.014944,0.194729,0.011217,0.004091,0.03678,0.085365,0.13231
670966,0.0,0.0,0.476599,0.004411,0.067432,0.0,0.0,0.104801,0.082653,0.0,...,0.0,0.0,0.0,0.0,0.024866,0.012335,0.018421,0.009636,0.012525,0.111593
670968,0.0,0.0,0.096411,0.0,0.125717,0.008202,0.003539,0.098438,0.118432,0.0,...,0.010928,0.010928,0.0,0.015064,0.039078,0.020597,0.0,0.050595,0.108789,0.177258
670972,0.0,0.0,0.086864,0.0,0.172941,0.0,0.0,0.064459,0.066157,0.0,...,0.005903,0.005903,0.0,0.008398,0.032353,0.003635,0.0,0.120621,0.17342,0.224267
670973,0.0,0.0,0.148467,0.0,0.115957,0.008305,0.012054,0.105563,0.119755,0.0,...,0.011065,0.011065,0.0,0.015252,0.041141,0.020854,0.0,0.026096,0.082942,0.173538


In [386]:
#y_preds[:5]

In [387]:
#from sklearn.metrics import roc_auc_score

In [388]:
# Ground-truth targets of the hold-out part as lists of label positions, shown next to
# the predicted label positions for the first 10 clients.
y_val = targets_str_to_indices(Y2[target_labels].values)
print y_val[:10]
print y_preds[:10]

[[18] [2] [5] [4, 21, 22, 23] [5] [2] [4, 23] [2] [4, 17, 21, 22, 23]
 [2, 4, 12, 18, 21, 22, 23]]
[[18, 2, 23, 8] [2, 23, 7] [23, 4, 8, 22, 12] [23, 22, 4, 21]
 [23, 2, 8, 4, 7] [2, 23, 7] [23, 4, 2] [2, 23] [23, 22, 2, 4, 21]
 [23, 2, 4, 22]]


In [389]:
# Recomputed here so the cell can run on its own.
y_val = targets_str_to_indices(Y2[target_labels].values)

# Scoring the ground truth against itself gives the best score reachable on this hold-out
# part; the predictions are then scored the same way for comparison.
logging.info("- Compute max map7 score")
map7_score(y_val, y_val, clc2[LC_TARGET_LABELS].values)
# map7_score0(y_val, y_val)
logging.info("- Compute map7 score")
map7_score(y_val, y_preds, clc2[LC_TARGET_LABELS].values)
# map7_score0(y_val, y_preds)
#logging.info("- Compute AUC ROC : ")
#print roc_auc_score(y_val, y_preds)

INFO:root:- Compute max map7 score
INFO:root:-- Predicted map7 score: 0.0295838751625
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0142559434497


0.014255943449701576

0.021295269099703414 (GB on 'all')

0.021271936353906683 (RF tuning)

0.021668245671284416 (RF tuning)

0.02136609107928888

0.0211362663776694

In [362]:
# print labels_masks_dict[estimators[0][0][1]]
# print estimators[0][1].classes_
# print estimators[0][1].n_classes_
# print estimators[0][1].n_features_
# print estimators[0][1].n_outputs_
# print estimators[0][1].estimators_

In [363]:
from utils import targets_to_labels, targets_indices_to_labels, remove_last_choice

In [364]:
limit = 25
count = 0

not_predicted_predicted = defaultdict(int)
for last_choice, targets, products, proba in zip(clc2[LC_TARGET_LABELS].values, y_val, y_preds, Y_probas.values):
    added_products = remove_last_choice(targets, last_choice)
    predictions = remove_last_choice(products, last_choice)
#     print "---", count, last_choice
#     print targets, '->', added_products
#     print products, '->', predictions
#     if count == 3:
#         break
    
    if len(added_products) == 0:
        continue
        
    if len(set(added_products) & set(predictions)) > 0:
#         print "Predicted : ", added_products, predictions
#         print set(added_products) & set(predictions)
        continue

    count += 1
    if count < limit:
        print "--- Count = ", count
        print targets_indices_to_labels(added_products, TARGET_LABELS2)#, targets_indices_to_labels(targets, TARGET_LABELS2)
        print targets_indices_to_labels(predictions, TARGET_LABELS2)#, targets_indices_to_labels(products, TARGET_LABELS2)#, proba
    
    for p in added_products:
        not_predicted_predicted[TARGET_LABELS2[p]] += 1
    

--- Count =  1
['Credit Card']
[]
--- Count =  2
['e-account']
['Direct Debit']
--- Count =  3
['Current Accounts', 'e-account']
[]
--- Count =  4
['Credit Card']
[]
--- Count =  5
['Credit Card']
['Pensions', 'Payroll']
--- Count =  6
['Payroll', 'Pensions']
['Current Accounts', 'particular Account', 'particular Plus Account', 'Direct Debit']
--- Count =  7
['Payroll', 'Pensions']
['Direct Debit', 'Current Accounts', 'particular Account', 'particular Plus Account']
--- Count =  8
['Payroll Account']
['Current Accounts']
--- Count =  9
['e-account']
[]
--- Count =  10
['Payroll', 'Pensions']
['Current Accounts', 'particular Account', 'particular Plus Account']
--- Count =  11
['e-account', 'Payroll', 'Pensions']
[]
--- Count =  12
['Credit Card']
[]
--- Count =  13
['e-account']
['Direct Debit', 'particular Account', 'particular Plus Account']
--- Count =  14
['Credit Card']
[]
--- Count =  15
['Payroll Account', 'Payroll', 'Pensions']
[]
--- Count =  16
['Payroll Account']
['Direct De

In [365]:
# Show the per-product tally of added-but-not-predicted products built by the loop above,
# followed by the number of rows in y_val for scale.
print not_predicted_predicted, y_val.shape[0]

defaultdict(<type 'int'>, {'Securities': 1, 'Direct Debit': 19, 'e-account': 56, 'Payroll': 247, 'Pensions': 247, 'Taxes': 11, 'Payroll Account': 91, 'Long-term deposits': 1, 'Credit Card': 79, 'Current Accounts': 49}) 39988


In [None]:
#print y_probas[:10, target_groups[0]]
#print Y[np.array(TARGET_LABELS)[target_groups[0]]].head(10)

### Run KFold Cross-validation 

In [368]:
from trainval import cross_val_score0, cross_val_score

In [370]:
nb_folds = 5
results = cross_val_score0((X, Y), nb_folds=nb_folds, **_kwargs)

print "Cross-Validation \n %i | %f | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), np.median(results), results.max(), results.std())


INFO:root:- Cross validation : 
INFO:root:

		-- Fold : 1 / 5

INFO:root:-- Train all --
INFO:root:-- Process : sample_mask=5797/119990, features_mask=fm2, labels_mask=lm_10
INFO:root:--- Score : model='et', fit accuracy : 0.725203
INFO:root:-- Process : sample_mask=5797/119990, features_mask=fm2, labels_mask=lm_10
INFO:root:--- Score : model='rf', fit accuracy : 0.725203
INFO:root:-- Process : sample_mask=5797/119990, features_mask=fm2, labels_mask=lm_11
INFO:root:--- Score : model='et', fit accuracy : 0.967224
INFO:root:-- Process : sample_mask=5797/119990, features_mask=fm2, labels_mask=lm_11
INFO:root:--- Score : model='rf', fit accuracy : 0.967224
INFO:root:-- Process : sample_mask=5797/119990, features_mask=fm2, labels_mask=lm_8
INFO:root:--- Score : model='et', fit accuracy : 0.712610
INFO:root:-- Process : sample_mask=5797/119990, features_mask=fm2, labels_mask=lm_8
INFO:root:--- Score : model='rf', fit accuracy : 0.712782
INFO:root:-- Process : sample_mask=5797/119990, feature

KeyboardInterrupt: 


### 201505 -> 201605 

Cross-Validation 
 5 | 0.014585 | 0.018385 | 0.019147 | 0.022227 | 0.00294 

Compute cross-validation across several months

In [None]:
nb_folds = 3
yms = [201504, 201505]
#yms = [201505]

for ym in yms:
    logging.info("\n-------------------------")
    logging.info("- Process month : %s" % ym)
    logging.info("-------------------------\n")
    
    ym1 = ym + 100    
    df1 = train_df if months_ym_map[ym] in train_months else val_df
    df2 = train_df if months_ym_map[ym1] in train_months else val_df
    X, Y, clients_last_choice = get_XY(ym, df1, ym1, df2) 
    results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                                profiles=profiles,
                                nb_folds=nb_folds)
    print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

## Train model for predictions

In [None]:
# Build the training pair: features from `current_month`, targets from the
# year-month 100 later (same month, next year).
current_month = 201505
next_year_month = current_month + 100

# Both months are read from train_df here.
# Alternatives kept from earlier experiments: val_df for either frame, or picking
# df2 via `months_ym_map[next_year_month] in train_months`.
df1 = df2 = train_df

X, Y, clients_last_choice = get_XY(
    current_month, df1,
    next_year_month, df2,
    months_ym_map,
)

In [None]:
# Train on the (X, Y) pair built above; the keyword settings come from _kwargs,
# which is defined outside this section.
estimators = train_all(X, Y, **_kwargs)

In [None]:
# NOTE: when the cells are run in order, X is the same frame that was passed to
# train_all above, so scores computed from these predictions are in-sample.
y_preds, Y_probas = predict_all(estimators, X, **_kwargs)

Check score on the data 2016/05

In [None]:
logging.info("- Compute map7 score")
print map7_score(y_val, y_preds, clients_last_choice[LC_TARGET_LABELS].values)
logging.info("- Compute max map7 score")
print map7_score(y_val, y_val, clients_last_choice[LC_TARGET_LABELS].values)

## Prediction for 2016/06

In [None]:
from dataset import load_train_test

In [None]:
# load_train_test (from dataset.py) returns the training frame and the test frame.
# [201506] is the year-month list handed to it — presumably the training months to load; confirm in dataset.py.
full_train_df, test_df = load_train_test([201506])

In [None]:
# Preview the first rows of the training frame returned by load_train_test.
full_train_df.head()

In [None]:
# Preview the first rows of the test frame returned by load_train_test.
test_df.head()

In [None]:
# Distinct 'fecha_dato' values found in each frame.
full_train_months = full_train_df['fecha_dato'].unique()
test_months = test_df['fecha_dato'].unique()

# Map each year-month key (as produced by to_yearmonth) to its 'fecha_dato' value,
# over the union of the months present in the training and test frames.
months = list(set(full_train_months) | set(test_months))
months_ym_map = {to_yearmonth(month): month for month in months}

In [None]:
# Build the prediction-time features: `current_month` rows come from the training
# frame, and the rows for the year-month 100 later come from the test frame.
current_month = 201506
next_year_month = current_month + 100

df1, df2 = full_train_df, test_df

# The second value returned by get_XY is discarded here.
X, _, clients_last_choice = get_XY(
    current_month, df1,
    next_year_month, df2,
    months_ym_map,
)

In [None]:
# Compare the size of the feature frame with the raw test frame; a row-count gap
# would suggest that some test clients did not make it into X (TODO confirm).
print X.shape, test_df.shape

In [None]:
# Preview the first ten rows of the prediction-time feature frame.
X.head(10)

In [None]:
# Preview the first ten rows of the last-choice frame returned by get_XY.
clients_last_choice.head(10)

In [None]:
def get_submission(predicted_added_products, clients, clc, target_labels):
    """
    Build the submission frame, one row per client.

    Each entry of `predicted_added_products` is paired by position with the
    matching entry of `clc` and passed through `remove_last_choice`. The indices
    it returns are looked up in `target_labels` and joined with single spaces.

    :param predicted_added_products: iterable of per-client product index sequences
    :param clients: values for the 'ncodpers' column (expected to have one entry per prediction)
    :param clc: iterable of per-client last-choice rows, in the same order as the predictions
    :param target_labels: sequence indexed by product index, giving the label text
    :return: pandas.DataFrame with columns ['ncodpers', 'added_products']
    """
    rows = []
    pairs = zip(predicted_added_products, clc)
    for n_done, (products, last_choice) in enumerate(pairs, 1):
        kept = remove_last_choice(products, last_choice)
        rows.append(' '.join(target_labels[i] for i in kept))
        # Progress message every 100000 clients.
        if n_done % 100000 == 0:
            logging.info("Elapsed : %i", n_done)

    column_order = ['ncodpers', 'added_products']
    return pd.DataFrame(data={'ncodpers': clients, 'added_products': rows},
                        columns=column_order)

In [None]:
# Predict on the prediction-time features with the estimators trained earlier.
y_preds, Y_probas = predict_all(estimators, X, **_kwargs)

logging.info("- Get submission dataframe:")
clients = X['ncodpers'].values
# Fix: pass the predictions computed just above (`y_preds`). The original referenced
# `y_pred`, which this cell never assigns.
# NOTE(review): every other cell selects the last-choice columns with LC_TARGET_LABELS;
# confirm that TARGET_LABELS is the intended column set for clients_last_choice here.
submission = get_submission(y_preds, clients, clients_last_choice[TARGET_LABELS].values, TARGET_LABELS)

In [None]:
# Make sure every test client has a row in the submission.
submission_clients = set(submission['ncodpers'].unique())
test_clients = set(test_df['ncodpers'].unique())
if submission_clients != test_clients:
    # Clients present in the test frame but absent from the submission.
    missing_clients = list(test_clients - submission_clients)

#     selected_estimators = []
#     for e in estimators:
#         if e[0]

    # 'added_products' holds space-separated label strings, so missing clients get an
    # empty string (no prediction). The original filled the column with float zeros,
    # which would be written to the CSV as the text "0.0".
    missing_added_products = [''] * len(missing_clients)
    submission = pd.concat([submission,
                            pd.DataFrame(data={
                                'ncodpers': missing_clients,
                                'added_products': missing_added_products
                            }, columns=['ncodpers', 'added_products'])])

Get submission DataFrame and write csv file

In [None]:
# Sanity-check the submission frame: its shape first, then its first rows.
print submission.shape
submission.head()

In [None]:
from datetime import datetime
import csv

logging.info('- Generate submission')

# The file name carries a minute-resolution timestamp, so runs in different
# minutes do not overwrite each other.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
submission_file = '../results/submission_' + str(timestamp) + '.csv'

# Write the frame without its index column.
submission.to_csv(submission_file, index=False, index_label=False)

In [None]:
with open('../results/submission_2016-11-17-16-37.csv', 'r') as r:
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    