# NN tryouts on SPR data, inspired by Kaggle Forum "When less is more"

Load training and validation data as 
    month : [ Features | Targets| Difference | Last Choice Targets  ]
    
    - Features : 
    - Target labels : TARGET_LABELS(month), 'targets_str', 'targets_features'
    - Difference with prev month
    - Last Choice Targets : LC_TARGET_LABELS(month-1), 'lc_targets_str', 'lc_targets_features'


In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.INFO)

import matplotlib.pylab as plt
%matplotlib inline

In [2]:
from dataset import load_trainval, LC_TARGET_LABELS, TARGET_LABELS_FRQ, TARGET_LABELS_DIFF, decimal_to_dummies, targets_str_to_indices, targets_dec_to_indices
from common import to_yearmonth, TARGET_LABELS 
from common import target_str_to_labels, TARGET_LABELS2

In [3]:
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

In [5]:
# train_yearmonths_list = [201504, 201505, 201604]
train_yearmonths_list = [201505, 201605]
# train_yearmonths_list = [201505]
val_yearmonth = [201605]
train_nb_clients = 150000
# train_nb_clients = 1500
train_df, val_df = load_trainval(train_yearmonths_list, val_yearmonth, train_nb_clients, val_nb_clients=1500)

INFO:root:- Load training data : 
INFO:root:- Load data : [201504, 201505, 201604, 201605]
INFO:root:-- Select 150000 clients
INFO:root:- Number of lines with unknown data : 48
INFO:root:- Number of columns with nan : 10
INFO:root:-- Process date : 201505
INFO:root:-- Process date : 201605
INFO:root:- Load validation data
INFO:root:- Load data : [201604, 201605]
INFO:root:-- Select 1500 clients
INFO:root:- Number of lines with unknown data : 0
INFO:root:- Number of columns with nan : 9
INFO:root:-- Process date : 201605
INFO:root:-- Compute logCount dictionary
INFO:root:-- Add logCount columns
INFO:root:-- Process month : 2015-04-28
INFO:root:-- Process month : 2015-05-28
INFO:root:-- Process month : 2016-04-28
INFO:root:-- Process month : 2016-05-28
INFO:root:-- Process month : 2016-04-28
INFO:root:-- Process month : 2016-05-28
INFO:root:-- Add logDecimal columns
INFO:root:-- Transform age/renta/logdiff
INFO:root:-- Add target values frequencies
INFO:root:-- Add target diff


Display loaded data

In [6]:
train_df[['fecha_dato', 'ncodpers'] + TARGET_LABELS_DIFF.tolist()].head(10)

Unnamed: 0,fecha_dato,ncodpers,ind_ahor_fin_ult1_diff,ind_aval_fin_ult1_diff,ind_cco_fin_ult1_diff,ind_cder_fin_ult1_diff,ind_cno_fin_ult1_diff,ind_ctju_fin_ult1_diff,ind_ctma_fin_ult1_diff,ind_ctop_fin_ult1_diff,...,ind_hip_fin_ult1_diff,ind_plan_fin_ult1_diff,ind_pres_fin_ult1_diff,ind_reca_fin_ult1_diff,ind_tjcr_fin_ult1_diff,ind_valo_fin_ult1_diff,ind_viv_fin_ult1_diff,ind_nomina_ult1_diff,ind_nom_pens_ult1_diff,ind_recibo_ult1_diff
210122,2015-04-28,15892,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
1051662,2015-05-28,15892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1695757,2016-04-28,15892,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
2501083,2016-05-28,15892,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
210121,2015-04-28,15893,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
1051663,2015-05-28,15893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1695786,2016-04-28,15893,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
2501082,2016-05-28,15893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
210125,2015-04-28,15911,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0
1051675,2015-05-28,15911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Useful structures

In [7]:
def get_common_clients(df1, mask1, mask2, df2=None):
    active_clients1 = df1[mask1]['ncodpers'].unique()
    if df2 is not None:
        active_clients2 = df2[mask2]['ncodpers'].unique()
    else:
        active_clients2 = df1[mask2]['ncodpers'].unique()
    active_clients = list(set(active_clients1) & set(active_clients2)) 
    
    if df2 is not None:
        return df1['ncodpers'].isin(active_clients), df2['ncodpers'].isin(active_clients)
    return df1['ncodpers'].isin(active_clients)

In [8]:
months_ym_map = {}
months = list(set(train_df['fecha_dato'].unique()) | set(val_df['fecha_dato'].unique()))
for m in months:
    months_ym_map[to_yearmonth(m)] = m

        
train_months = train_df['fecha_dato'].unique()
val_months = val_df['fecha_dato'].unique()
    

In [9]:
from common import get_added_products, remove_last_choice, apk, map7_score
from visualization import visualize_train_test, visualize_folds, compare_two_datasets, compare_folds, compare_folds2

Create profiles and create models for profiles

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Merge
from keras.utils import np_utils

Using TensorFlow backend.


### Train a model

In [11]:
target_features = ['targets_diff', 'targets_logdiff', 'targets_logcount2_diff', 'targets_logcount2', 'targets_logcount1', 'targets_logDec']

In [12]:
#active_clients_mask = ~train_df['targets_diff'].isin([0, -99999])
#print active_clients_mask.shape, active_clients_mask.sum()

In [166]:
def get_XY(current_month, df1, next_year_month, df2, months_ym_map):
    month_mask = df1['fecha_dato'] == months_ym_map[current_month]
    next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
    
    # get common clients from df1 at this month and df2 at next year month
    common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)
    
    X = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + target_features + features + TARGET_LABELS_FRQ.tolist()]            

    if TARGET_LABELS[0] in df2.columns and TARGET_LABELS_DIFF[0] in df2.columns:
        Y = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato', 'targets_str'] + TARGET_LABELS + TARGET_LABELS_DIFF.tolist()]    
        assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "There is a problem in alignment"
        Y.index = X.index
    else:
        Y = None
    
    if LC_TARGET_LABELS[0] in df2.columns:
        clients_last_choice = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato'] + LC_TARGET_LABELS.tolist()]
    else:
        clients_last_choice = None
        
    return X, Y, clients_last_choice


In [167]:
current_month = 201505
next_year_month = current_month + 100

df1 = train_df if months_ym_map[current_month] in train_months else val_df
#df1 = train_df
df2 = train_df if months_ym_map[next_year_month] in train_months else val_df
#df2 = train_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [168]:
print X.shape
X.head(10)

(149976, 51)


Unnamed: 0,ncodpers,fecha_dato,targets_diff,targets_logdiff,targets_logcount2_diff,targets_logcount2,targets_logcount1,targets_logDec,ind_empleado,pais_residencia,...,ind_hip_fin_ult1_frq,ind_plan_fin_ult1_frq,ind_pres_fin_ult1_frq,ind_reca_fin_ult1_frq,ind_tjcr_fin_ult1_frq,ind_valo_fin_ult1_frq,ind_viv_fin_ult1_frq,ind_nomina_ult1_frq,ind_nom_pens_ult1_frq,ind_recibo_ult1_frq
1051662,15892,2015-05-28,0.0,0.0,0.0,8e-06,2.7e-05,13.181662,1,0,...,0.992349,0.988762,0.996729,0.066437,0.056349,0.03256,0.995024,0.931526,0.926035,0.158099
1051663,15893,2015-05-28,0.0,0.0,0.0,0.001941,0.001227,2.833213,0,0,...,0.992349,0.988762,0.996729,0.933563,0.943651,0.03256,0.995024,0.931526,0.926035,0.841901
1051675,15911,2015-05-28,0.0,0.0,0.0,2e-06,7e-06,13.196866,1,0,...,0.992349,0.988762,0.996729,0.933563,0.056349,0.03256,0.995024,0.931526,0.926035,0.158099
1051677,15916,2015-05-28,0.0,0.0,0.0,3e-06,7e-06,13.234752,3,0,...,0.992349,0.011238,0.996729,0.066437,0.056349,0.03256,0.995024,0.931526,0.926035,0.158099
1051644,15920,2015-05-28,0.0,0.0,0.0,2.8e-05,5.3e-05,14.803952,1,0,...,0.992349,0.988762,0.996729,0.066437,0.943651,0.96744,0.995024,0.931526,0.926035,0.158099
1051653,15929,2015-05-28,1.0,0.693147,7e-06,8e-06,1.3e-05,14.57258,3,0,...,0.992349,0.988762,0.996729,0.933563,0.056349,0.03256,0.995024,0.931526,0.926035,0.158099
1051657,15934,2015-05-28,0.0,0.0,0.0,1.3e-05,2.7e-05,14.589708,1,0,...,0.992349,0.988762,0.996729,0.933563,0.943651,0.03256,0.995024,0.931526,0.926035,0.158099
1051680,15939,2015-05-28,0.0,0.0,0.0,4.2e-05,8e-05,14.557312,0,0,...,0.007651,0.988762,0.996729,0.933563,0.943651,0.96744,0.995024,0.931526,0.926035,0.158099
1051703,15943,2015-05-28,0.0,0.0,0.0,8e-06,1.3e-05,14.556351,3,0,...,0.007651,0.988762,0.996729,0.933563,0.056349,0.96744,0.995024,0.931526,0.926035,0.841901
1051716,15950,2015-05-28,0.0,0.0,0.0,6e-06,7e-06,14.55626,0,0,...,0.992349,0.011238,0.996729,0.066437,0.056349,0.96744,0.995024,0.931526,0.926035,0.158099


In [169]:
print Y.shape
Y.head(10)

(149976, 51)


Unnamed: 0,ncodpers,fecha_dato,targets_str,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,...,ind_hip_fin_ult1_diff,ind_plan_fin_ult1_diff,ind_pres_fin_ult1_diff,ind_reca_fin_ult1_diff,ind_tjcr_fin_ult1_diff,ind_valo_fin_ult1_diff,ind_viv_fin_ult1_diff,ind_nomina_ult1_diff,ind_nom_pens_ult1_diff,ind_recibo_ult1_diff
1051662,15892,2016-05-28,001000000001100001110001,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051663,15893,2016-05-28,000000000000000000010000,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051675,15911,2016-05-28,000010000001100000110001,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051677,15916,2016-05-28,000010001000100101110001,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051644,15920,2016-05-28,001000010000000001010001,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051653,15929,2016-05-28,001000001000100000110001,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1051657,15934,2016-05-28,001000000000100000010001,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051680,15939,2016-05-28,001000000000101000000001,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051703,15943,2016-05-28,001000000000001000100000,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1051716,15950,2016-05-28,000000000000000101000001,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [170]:
print clients_last_choice.shape
clients_last_choice.head(10)

(149976, 26)


Unnamed: 0,ncodpers,fecha_dato,lc_ind_ahor_fin_ult1,lc_ind_aval_fin_ult1,lc_ind_cco_fin_ult1,lc_ind_cder_fin_ult1,lc_ind_cno_fin_ult1,lc_ind_ctju_fin_ult1,lc_ind_ctma_fin_ult1,lc_ind_ctop_fin_ult1,...,lc_ind_hip_fin_ult1,lc_ind_plan_fin_ult1,lc_ind_pres_fin_ult1,lc_ind_reca_fin_ult1,lc_ind_tjcr_fin_ult1,lc_ind_valo_fin_ult1,lc_ind_viv_fin_ult1,lc_ind_nomina_ult1,lc_ind_nom_pens_ult1,lc_ind_recibo_ult1
2501083,15892,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
2501082,15893,2016-05-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2501067,15911,2016-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2501063,15916,2016-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
2501088,15920,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2501107,15929,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2501103,15934,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2501098,15939,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2501094,15943,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2501115,15950,2016-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


### Run KFold Cross-validation 

In [171]:
from keras.optimizers import SGD

def create_model(profiles, ll):
    final_model = Sequential()
    models = []
    for key in profiles:
        length = len(profiles[key])
        model = Sequential()
        model.add(Dense(30, init='uniform', input_shape=(length,), activation='relu'))
        model.add(Dropout(0.15))
        model.add(Dense(ll, activation='sigmoid'))
#         model.add(Dense(ll, activation='softmax'))
        models.append(model)

    merged = Merge(models, mode='max')
    final_model.add(merged)
    final_model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['mean_absolute_percentage_error'])
    #sgd = SGD(lr=0.01, momentum=0.0, decay=0.0, nesterov=True)
#     final_model.compile(loss='mean_squared_error', optimizer='nadam', metrics=['accuracy'])
    return final_model

In [172]:
common_group1 = [2, 18, 23] #range(24)
common_group2 = [3, 4, 7, 8, 12] #range(24)
others = list(set(range(24)) - set(common_group1 + common_group2))
for i, a in enumerate(TARGET_LABELS2[common_group1]):
    print i, a

0 Current Accounts
1 Credit Card
2 Direct Debit


In [183]:
target_groups = [common_group1, common_group2, others]
# target_groups = [common_group1]
print sum([len(t) for t in target_groups])

24


In [184]:
not_normalized_profiles = []

profiles = {
    
## 
    0: ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
#   100: ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],

## 
#    1: target_features,
#     101: target_features,
    
    2: ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
    3: ['pais_residencia', 'sexo', 'age', 'renta', 'targets_logdiff', 'targets_logcount2_diff','targets_logcount2','targets_logcount1'],
    11: ['nomprov', 'ind_nuevo', 'renta', 'ind_actividad_cliente', 'canal_entrada'],

## 
#      12: TARGET_LABELS_FRQ,
#     112: TARGET_LABELS_FRQ,        
    
    12: TARGET_LABELS_FRQ[common_group1 + common_group2],
##    112: TARGET_LABELS_FRQ[common_group1 + common_group2],        
    
#     13: list(TARGET_LABELS_FRQ[common_group1 + common_group2]) + target_features,
#     113: list(TARGET_LABELS_FRQ[common_group1 + common_group2]) + target_features,

#     14: ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'] + list(TARGET_LABELS_FRQ[common_group])
#     114: ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'] + list(TARGET_LABELS_FRQ[common_group])
}

In [185]:
from sklearn.preprocessing import StandardScaler

def prepare_to_fit(X_train, Y_train, profiles, mask):
    X_train_ = X_train[mask]
    y_train = Y_train[mask].values
    x_train = []
    for key in profiles:
        x_train_ = X_train_[profiles[key]].values
        if key not in not_normalized_profiles:
            x_train_ = StandardScaler().fit_transform(x_train_)
        x_train.append(x_train_)
    return x_train, y_train


def prepare_to_test(X_val, profiles, Y_val=None):
    
    y_val = Y_val.values if Y_val is not None else None
    x_val = []
    for key in profiles:
        x_val_ = X_val[profiles[key]].values   
        if key not in not_normalized_profiles:
            x_val_ = StandardScaler().fit_transform(x_val_)  
        x_val.append(x_val_)    
    
    return x_val, y_val
    

In [186]:
n_highest = 7

    
def pred_to_targets_indices(y_probas, **kwargs):
    
    threshold = 0.5 if 'threshold' not in kwargs else kwargs['threshold']
    n_highest = 7 if 'n_highest' not in kwargs else kwargs['n_highest']
    
    y_probas[y_probas < threshold] = 0.0
    predicted_added_products = np.argsort(y_probas, axis=1)
    predicted_added_products = predicted_added_products[:,::-1][:,:n_highest]
    out = []
    index_map = lambda x: x if 'index_map' not in kwargs else kwargs['index_map'][x]
    for i, t in enumerate(predicted_added_products):
        out.append([index_map(j) for j in t if y_probas[i, j] > 0.0])
    return np.array(out)


def train_model0(X_train, Y_train, profiles, 
                prepare_to_fit_func=prepare_to_fit, 
                create_model_func=create_model,
                **kwargs):
    
    # Select only active client for the training part
    mask = ~X_train['targets_diff'].isin([0, -99999])

    x_train, y_train = prepare_to_fit_func(X_train, Y_train, profiles, mask)
    logging.info("- Train data shapes : {}, {}".format(
            [i.shape for i in x_train] if isinstance(x_train, list) else x_train.shape, 
            y_train.shape)
    )                
    logging.info("- Create the model")
    estimator = create_model_func(profiles, y_train.shape[1])
    
    nb_epoch = 100 if 'nb_epoch' not in kwargs else kwargs['nb_epoch']
    batch_size = min(7500, x_train[0].shape[0]) if 'batch_size' not in kwargs else kwargs['batch_size']

    logging.info("- Fit the model : (%i, %i)" % (nb_epoch, batch_size))
    hist = estimator.fit(x_train, y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0)        
    for key in hist.history:            
        logging.info("-- {} : min={:5f}, max={:5f}, last={}".format(key, np.min(hist.history[key]), np.max(hist.history[key]), hist.history[key][-4:]))
    return estimator


def train_model(X_train, Y_train, profiles, 
                prepare_to_fit_func=prepare_to_fit, 
                create_model_func=create_model,
                **kwargs):
    
    estimators = []
    for group in target_groups:
        logging.info("--- process group : {}".format(group))
        labels = np.array(TARGET_LABELS)[group]
        estimator = train_model0(X_train, Y_train[labels], profiles,
                                 create_model_func=create_model,
                                 prepare_to_fit_func=prepare_to_fit, 
                                 **kwargs)  
        estimators.append(estimator)            
    return estimators


def predict_with_model0(estimator, X_val, profiles, Y_val=None, 
                        prepare_to_test_func=prepare_to_test, 
                        transform_pred_func=pred_to_targets_indices, 
                       **kwargs):
    """
    kwargs : 
        threshold: e.g. 0.55
        return_probas: True/False
        group (list of indices)
    """
    logging.info("- Predict using trained model")
    
    Y_val = Y_val[TARGET_LABELS] if Y_val is not None else None
    x_val, y_val = prepare_to_test_func(X_val, profiles, Y_val)
    
    logging.info("- Test data shapes : {}, {}".format(
            [i.shape for i in x_val] if isinstance(x_val, list) else x_val.shape, 
            y_val.shape if y_val is not None else '')
    )                
    
    y_probas = estimator.predict(x_val, verbose=0)    
    y_pred = transform_pred_func(y_probas, **kwargs)
    
    return_probas = kwargs['return_probas'] if 'return_probas' in kwargs else False    
    
    if y_val is not None:
        y_val = targets_str_to_indices(y_val, **kwargs)
        
    if return_probas:
        return y_pred, y_val, y_probas
    return y_pred, y_val
    

def predict_with_model(estimators, X_val, profiles, Y_val=None, 
                        prepare_to_test_func=prepare_to_test, 
                        transform_pred_func=pred_to_targets_indices, 
                       **kwargs):
    """
    kwargs : 
        threshold: e.g. 0.55
        return_probas: True/False
        group (list of indices)
    """
    logging.info("- Predict using trained model")
    
    Y_val = Y_val[TARGET_LABELS] if Y_val is not None else None
    x_val, y_val = prepare_to_test_func(X_val, profiles, Y_val)
    logging.info("- Test data shapes : {}, {}".format(
            [i.shape for i in x_val] if isinstance(x_val, list) else x_val.shape, 
            y_val.shape if y_val is not None else ''))          
    if y_val is not None:
        y_val = targets_str_to_indices(y_val)
    
    _y_probas = []   
    clc_val = None if 'clc_val' not in kwargs else kwargs['clc_val']
    
    for i, group in enumerate(target_groups):
        logging.info("--- process group : {}".format(group))
            
        y_probas = estimators[i].predict(x_val, verbose=0)            
        
        if clc_val is not None and y_val is not None:
            y_subpred = transform_pred_func(y_probas, index_map=group)
            subscore = map7_score(y_val, y_subpred, clc_val)
            logging.info("--- group subscore: {}".format(subscore))
        
        _y_probas.append(y_probas)
    
    y_probas = merge_predictions(_y_probas, target_groups)
    y_pred = transform_pred_func(y_probas, **kwargs)
    return_probas = kwargs['return_probas'] if 'return_probas' in kwargs else False    
    
    if return_probas:
        return y_pred, y_val, y_probas
    return y_pred, y_val
    

def merge_predictions(y_array, groups):
    """
    y_array is a list of ndarrays : [y1, y2, ...]
        yi can be of shape (nb_samples, len(gi)) or (nb_samples,)
    groups is a list of indices : [g1, g2, ..., gN], len(y_array) == len(groups)
    return a properly ordered ndarray of shape (nb_samples, len(TARGET_LABELS)) or (nb_samples,) 
    """
    #ll = np.sum([len(g) for g in groups])
    ll = len(TARGET_LABELS)
    nb_samples = y_array[0].shape[0]
    two_d_case = len(y_array[0].shape) == 2    
    if two_d_case:
        out = np.zeros((nb_samples, ll)) 
    else:
        out = np.empty((nb_samples, ), dtype=np.object_)
        out.fill([])
    for g, y in zip(groups, y_array):
        if two_d_case:
            out[:,g] = y[:]
        else:
            out += y
    return out

In [187]:
# CROSS VALIDATION
from sklearn.model_selection import KFold
def cross_val_score2(data, 
                     profiles,
                     nb_folds=5, 
                     train_model_func=train_model,
                     predict_with_model_func=predict_with_model):
    
    logging.info("- Cross validation : ")
    x_df, y_df, clients_last_choice = data
    kf = KFold(n_splits=nb_folds)
    scores = []
    
    count = 0
    for train_index, test_index in kf.split(range(x_df.shape[0])):
        count += 1
        logging.info("\n\n\t\t-- Fold : %i / %i\n" % (count, nb_folds))
        
        X_train, X_val = x_df.loc[x_df.index[train_index], :], x_df.loc[x_df.index[test_index], :]
        Y_train, Y_val = y_df.loc[y_df.index[train_index], :], y_df.loc[y_df.index[test_index], :]
        clc_val = clients_last_choice[test_index, :]

        estimators = train_model_func(X_train, Y_train, profiles, nb_epoch=300)    
        y_pred, y_val = predict_with_model_func(estimators, X_val, profiles, Y_val, 
                                                clc_val=clc_val, 
                                                threshold=0.5, 
                                                n_highest=7)        
        
        logging.info("- Compute map7 score")
        scores.append(map7_score(y_val, y_pred, clc_val))   
        
    return np.array(scores)

In [188]:
nb_folds = 3
results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                            profiles=profiles,
                            nb_folds=nb_folds)
print "Profiles : ", profiles.keys()
print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())


INFO:root:- Cross validation : 
INFO:root:

		-- Fold : 1 / 3

INFO:root:--- process group : [2, 18, 23]
INFO:root:- Train data shapes : [(5877, 8), (5877, 5), (5877, 5), (5877, 8), (5877, 8)], (5877, 3)
INFO:root:- Create the model
INFO:root:- Fit the model : (300, 5877)
INFO:root:-- loss : min=0.413410, max=0.705381, last=[0.41507229208946228, 0.41406261920928955, 0.4140680730342865, 0.41377314925193787]
INFO:root:-- mean_absolute_percentage_error : min=133262704.000000, max=323722880.000000, last=[133707760.0, 133262704.0, 133721040.0, 133533184.0]
INFO:root:--- process group : [3, 4, 7, 8, 12]
INFO:root:- Train data shapes : [(5877, 8), (5877, 5), (5877, 5), (5877, 8), (5877, 8)], (5877, 5)
INFO:root:- Create the model
INFO:root:- Fit the model : (300, 5877)
INFO:root:-- loss : min=0.141835, max=0.724304, last=[0.14241011440753937, 0.14324119687080383, 0.14216332137584686, 0.14265644550323486]
INFO:root:-- mean_absolute_percentage_error : min=46095144.000000, max=443682688.000000, 

KeyboardInterrupt: 


### 201505 -> 201605 

#### Multiple profiles :
Profiles :  [0, 1, 2, 3, 11, 12, 13]
Cross-Validation 
 3 | 0.008560 | 0.011849 | 0.016001 | 0.00310 


#### Single profiles:

Profiles   [0, 100] 
Cross-Validation (normalized)
 3 | 0.011683 | 0.012817 | 0.014958 | 0.00152 

Cross-Validation (not normalized)
 3 | 0.009244 | 0.010407 | 0.011922 | 0.00112 
 
 
Profiles :  [1, 101]
Cross-Validation (normalized)
 3 | 0.006793 | 0.009161 | 0.012219 | 0.00227 

Cross-Validation (not normalized)
 3 | 0.004787 | 0.009852 | 0.014922 | 0.00414


Profiles :  [112, 12]
Cross-Validation (normalized)
 3 | 0.008856 | 0.012124 | 0.016443 | 0.00318 

Cross-Validation (not normalized)
 3 | 0.007298 | 0.010140 | 0.014101 | 0.00289 

Compute cross-validation across several months

In [None]:
nb_folds = 3
yms = [201504, 201505]
#yms = [201505]

for ym in yms:
    logging.info("\n-------------------------")
    logging.info("- Process month : %s" % ym)
    logging.info("-------------------------\n")
    
    ym1 = ym + 100    
    df1 = train_df if months_ym_map[ym] in train_months else val_df
    df2 = train_df if months_ym_map[ym1] in train_months else val_df
    X, Y, clients_last_choice = get_XY(ym, df1, ym1, df2) 
    results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                                profiles=profiles,
                                nb_folds=nb_folds)
    print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

## Train model for predictions

In [139]:
current_month = 201505
next_year_month = current_month + 100

df1 = train_df
#df1 = val_df
df2 = train_df #if months_ym_map[next_year_month] in train_months else val_df
#df2 = val_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [140]:
estimators = train_model(X, Y, profiles, nb_epoch=300)

INFO:root:--- process group : [2, 18, 23]
INFO:root:- Train data shapes : [(10592, 8), (10592, 6), (10592, 5), (10592, 8), (10592, 5), (10592, 24), (10592, 14)], (10592, 3)
INFO:root:- Create the model
INFO:root:- Fit the model : (300, 7500)
INFO:root:-- acc : min=0.423338, max=0.839407, last=[0.83840004575306559, 0.83912385372146736, 0.83736154243485084, 0.83802238520898853]
INFO:root:-- loss : min=0.378637, max=0.701672, last=[0.37898034476351522, 0.37940003333495104, 0.38053285508357504, 0.37955082730134088]


Check score on the data 2016/05

In [141]:
y_pred, y_val, y_probas = predict_with_model(estimators, X, profiles, Y, threshold=0.5, return_probas=True)

logging.info("- Compute map7 score")
print map7_score(y_val, y_pred, clients_last_choice[LC_TARGET_LABELS].values)
logging.info("- Compute max map7 score")
print map7_score(y_val, y_val, clients_last_choice[LC_TARGET_LABELS].values)

INFO:root:- Predict using trained model
INFO:root:- Test data shapes : [(149976, 8), (149976, 6), (149976, 5), (149976, 8), (149976, 5), (149976, 24), (149976, 14)], (149976, 24)
INFO:root:--- process group : [2, 18, 23]
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0121374975552
INFO:root:- Compute max map7 score


0.0121374975552


INFO:root:-- Predicted map7 score: 0.0328119165733


0.0328119165733


In [142]:
from common import targets_to_labels, targets_indices_to_labels, remove_last_choice

In [145]:
clients_last_choice[LC_TARGET_LABELS].values

limit = 100
count = 0
for last_choice, targets, products in zip(clients_last_choice[LC_TARGET_LABELS].values, y_val, y_pred):
    added_products = remove_last_choice(targets, last_choice)
    predictions = remove_last_choice(products, last_choice)
    
    if len(added_products) == 0:
        continue
        
    #print added_products, common_group1, len(set(added_products) & set(common_group1))
    if len(set(added_products) & set(target_groups[0])) < 1:
        continue
    
    if len(set(added_products) - set(predictions)) < 1:
        continue
    
    print "--- Count = ", count
    print targets_indices_to_labels(added_products, TARGET_LABELS2)
    print targets_indices_to_labels(predictions, TARGET_LABELS2), targets_indices_to_labels(products, TARGET_LABELS2)
    
    count += 1
    if count == limit:
        break

--- Count =  0
['Credit Card']
[] ['Direct Debit']
--- Count =  1
['Direct Debit']
[] ['Current Accounts']
--- Count =  2
['Credit Card']
[] ['Direct Debit']
--- Count =  3
['Payroll', 'Direct Debit']
['Direct Debit'] ['Direct Debit']
--- Count =  4
['Direct Debit']
[] ['Current Accounts']
--- Count =  5
['Payroll', 'Pensions', 'Direct Debit']
[] []
--- Count =  6
['Direct Debit']
[] ['Current Accounts']
--- Count =  7
['Credit Card']
[] ['Direct Debit']
--- Count =  8
['Direct Debit']
[] ['Current Accounts']
--- Count =  9
['Direct Debit']
[] ['Current Accounts']
--- Count =  10
['Direct Debit']
[] ['Credit Card', 'Current Accounts']
--- Count =  11
['Credit Card']
[] ['Current Accounts']
--- Count =  12
['Credit Card']
[] ['Current Accounts']
--- Count =  13
['e-account', 'Direct Debit']
[] ['Current Accounts']
--- Count =  14
['Direct Debit']
[] ['Current Accounts']
--- Count =  15
['Direct Debit']
[] ['Current Accounts']
--- Count =  16
['Credit Card']
[] ['Direct Debit', 'Current 

In [147]:
print y_probas[:10, target_groups[0]]
print Y[np.array(TARGET_LABELS)[target_groups[0]]].head(10)

[[ 0.          0.97003615  0.99862111]
 [ 0.          0.          0.        ]
 [ 0.          0.99173892  0.9972778 ]
 [ 0.          0.97257733  0.99941421]
 [ 0.66886503  0.          0.99012005]
 [ 0.86318731  0.93404615  0.99419618]
 [ 0.83691907  0.          0.9809919 ]
 [ 0.78189623  0.          0.99596596]
 [ 0.79921281  0.98219782  0.72362268]
 [ 0.58424377  0.93034691  0.98500001]]
         ind_cco_fin_ult1  ind_tjcr_fin_ult1  ind_recibo_ult1
1051662                 1                  1                1
1051663                 0                  0                0
1051675                 0                  1                1
1051677                 0                  1                1
1051644                 1                  0                1
1051653                 1                  1                1
1051657                 1                  0                1
1051680                 1                  0                1
1051703                 1                  1       

## Prediction for 2016/06

In [None]:
from dataset import load_train_test

In [None]:
full_train_df, test_df = load_train_test([201506])

In [None]:
full_train_df.head()

In [None]:
test_df.head()

In [None]:
months_ym_map = {}
months = list(set(full_train_df['fecha_dato'].unique()) | set(test_df['fecha_dato'].unique()))
for m in months:
    months_ym_map[to_yearmonth(m)] = m
    
full_train_months = full_train_df['fecha_dato'].unique()
test_months = test_df['fecha_dato'].unique()

In [None]:
def get_XY(current_month, df1, next_year_month, df2, months_ym_map):
    month_mask = dfx1['fecha_dato'] == months_ym_map[current_month]
    next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
    
    # get common clients from df1 at this month and df2 at next year month
    common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)
    
    X = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + target_features + features + TARGET_LABELS_FRQ.tolist()]            

    if TARGET_LABELS[0] in df2.columns:
        Y = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato', 'targets_str'] + TARGET_LABELS]    
        assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "There is a problem in alignment"
        Y.index = X.index
    else:
        Y = None
    
    if LC_TARGET_LABELS[0] in df2.columns:
        clients_last_choice = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato'] + LC_TARGET_LABELS.tolist()]
    else:
        clients_last_choice = None
        
    return X, Y, clients_last_choice

In [None]:
current_month = 201506
next_year_month = current_month + 100

df1 = full_train_df
df2 = test_df
X, _, clients_last_choice = get_XY(current_month, df1, next_year_month, df2, months_ym_map)

In [None]:
print X.shape, test_df.shape

In [None]:
X.head(10)

In [None]:
clients_last_choice.head(10)

In [None]:
def get_submission(predicted_added_products, clients, clc, labels):
    added_products_col = []
    count = 0 
    for products, last_choice in zip(predicted_added_products, clc):
        predictions = remove_last_choice(products, last_choice)
        added_products_col.append(' '.join([labels[i] for i in predictions]))
        count+=1
        if count % 100000 == 0:
            logging.info("Elapsed : %i", count)
            
    out = pd.DataFrame(data={'ncodpers': clients, 'added_products': added_products_col}, columns=['ncodpers', 'added_products'])
    return out

In [None]:
y_pred = predict_with_model(estimators, X, profiles, threshold=0.5)

logging.info("- Get submission dataframe:")
clients = X['ncodpers'].values
submission = get_submission(y_pred, clients, clients_last_choice[TARGET_LABELS].values, TARGET_LABELS)

In [None]:
submission_clients = set(submission['ncodpers'].unique())
test_clients = set(test_df['ncodpers'].unique())
if submission_clients != test_clients:
    missing_clients = list(test_clients - submission_clients)
    missing_added_products = np.zeros((len(missing_clients)))
    submission = pd.concat([submission, 
                            pd.DataFrame(data={
                                'ncodpers': missing_clients, 
                                'added_products': missing_added_products
                            }, columns=['ncodpers', 'added_products'])])

Get submission DataFrame and write csv file

In [None]:
print submission.shape
submission.head()

In [None]:
from datetime import datetime
import csv

logging.info('- Generate submission')
submission_file = '../results/submission_' + \
                  str(datetime.now().strftime("%Y-%m-%d-%H-%M")) + \
                  '.csv'

submission.to_csv(submission_file, index=False, index_label=False)

In [None]:
with open('../results/submission_2016-11-17-16-37.csv', 'r') as r:
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    