# NN tryouts on SPR data, inspired by Kaggle Forum "When less is more"

Load training and validation data, one row per client and month, laid out as

    month : [ Features | Targets | Difference | Last Choice Targets ]

    - Features : client profile columns (the `features` list defined below)
    - Target labels : TARGET_LABELS(month), 'targets_str', 'targets_features'
    - Difference : change of the targets with respect to the previous month
    - Last Choice Targets : LC_TARGET_LABELS(month-1), 'lc_targets_str', 'lc_targets_features'


In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.INFO)

import matplotlib.pylab as plt
%matplotlib inline



In [2]:
from dataset import load_trainval, LC_TARGET_LABELS, decimal_to_dummies, targets_str_to_indices, targets_dec_to_indices
from common import to_yearmonth, TARGET_LABELS 
from common import target_str_to_labels, TARGET_LABELS2

In [3]:
# Client profile columns selected (together with target_features) when
# building the model input frame X in the cells below.
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

In [4]:
# Months (YYYYMM) and number of clients handed to load_trainval.
# NOTE(review): later cells look up `current_month + 100` (e.g. 201605) in
# months_ym_map, which is built only from the dates present in train_df and
# val_df. With val_yearmonth = [201505] that lookup presumably fails on a
# fresh kernel -- confirm which configuration produced the saved outputs.
# train_yearmonths_list = [201503, 201504, 201505, 201506, 201603, 201604]
train_yearmonths_list = [201505]
val_yearmonth = [201505]
train_nb_clients = 1500
train_df, val_df = load_trainval(train_yearmonths_list, val_yearmonth, train_nb_clients)

INFO:root:- Load training data
INFO:root:- Load data : [201504, 201505]
INFO:root:-- Select 1500 clients
INFO:root:- Number of lines with unknown data : 14
INFO:root:- Number of columns with nan : 5
INFO:root:-- Process date : 201505
INFO:root:- Load validation data
INFO:root:- Load data : [201504, 201505]
INFO:root:-- Select 100 clients
INFO:root:- Number of lines with unknown data : 0
INFO:root:- Number of columns with nan : 3
INFO:root:-- Process date : 201505
INFO:root:-- Compute logCount dictionary
INFO:root:-- Add logCount columns
INFO:root:-- Process month : 2015-04-28
INFO:root:-- Process month : 2015-05-28
INFO:root:-- Process month : 2015-04-28
INFO:root:-- Process month : 2015-05-28
INFO:root:-- Add logDecimal columns


Index([u'fecha_dato', u'ncodpers', u'ind_empleado', u'pais_residencia',
       u'sexo', u'age', u'fecha_alta', u'ind_nuevo', u'antiguedad', u'indrel',
       u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
       u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
       u'ind_actividad_cliente', u'renta', u'segmento', u'ind_ahor_fin_ult1',
       u'ind_aval_fin_ult1', u'ind_cco_fin_ult1', u'ind_cder_fin_ult1',
       u'ind_cno_fin_ult1', u'ind_ctju_fin_ult1', u'ind_ctma_fin_ult1',
       u'ind_ctop_fin_ult1', u'ind_ctpp_fin_ult1', u'ind_deco_fin_ult1',
       u'ind_deme_fin_ult1', u'ind_dela_fin_ult1', u'ind_ecue_fin_ult1',
       u'ind_fond_fin_ult1', u'ind_hip_fin_ult1', u'ind_plan_fin_ult1',
       u'ind_pres_fin_ult1', u'ind_reca_fin_ult1', u'ind_tjcr_fin_ult1',
       u'ind_valo_fin_ult1', u'ind_viv_fin_ult1', u'ind_nomina_ult1',
       u'ind_nom_pens_ult1', u'ind_recibo_ult1', u'targets_str',
       u'lc_ind_ahor_fin_ult1', u'lc_ind_aval_fin_ult1',
 

INFO:root:-- Transform age/renta/logdiff


Display loaded data

In [5]:
print train_df.shape, train_df.columns

(2986, 81) Index([u'fecha_dato', u'ncodpers', u'ind_empleado', u'pais_residencia',
       u'sexo', u'age', u'fecha_alta', u'ind_nuevo', u'antiguedad', u'indrel',
       u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
       u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
       u'ind_actividad_cliente', u'renta', u'segmento', u'ind_ahor_fin_ult1',
       u'ind_aval_fin_ult1', u'ind_cco_fin_ult1', u'ind_cder_fin_ult1',
       u'ind_cno_fin_ult1', u'ind_ctju_fin_ult1', u'ind_ctma_fin_ult1',
       u'ind_ctop_fin_ult1', u'ind_ctpp_fin_ult1', u'ind_deco_fin_ult1',
       u'ind_deme_fin_ult1', u'ind_dela_fin_ult1', u'ind_ecue_fin_ult1',
       u'ind_fond_fin_ult1', u'ind_hip_fin_ult1', u'ind_plan_fin_ult1',
       u'ind_pres_fin_ult1', u'ind_reca_fin_ult1', u'ind_tjcr_fin_ult1',
       u'ind_valo_fin_ult1', u'ind_viv_fin_ult1', u'ind_nomina_ult1',
       u'ind_nom_pens_ult1', u'ind_recibo_ult1', u'targets_str',
       u'lc_ind_ahor_fin_ult1', u'lc_ind_aval_f

In [6]:
print val_df.shape, val_df.columns

(200, 81) Index([u'fecha_dato', u'ncodpers', u'ind_empleado', u'pais_residencia',
       u'sexo', u'age', u'fecha_alta', u'ind_nuevo', u'antiguedad', u'indrel',
       u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
       u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
       u'ind_actividad_cliente', u'renta', u'segmento', u'ind_ahor_fin_ult1',
       u'ind_aval_fin_ult1', u'ind_cco_fin_ult1', u'ind_cder_fin_ult1',
       u'ind_cno_fin_ult1', u'ind_ctju_fin_ult1', u'ind_ctma_fin_ult1',
       u'ind_ctop_fin_ult1', u'ind_ctpp_fin_ult1', u'ind_deco_fin_ult1',
       u'ind_deme_fin_ult1', u'ind_dela_fin_ult1', u'ind_ecue_fin_ult1',
       u'ind_fond_fin_ult1', u'ind_hip_fin_ult1', u'ind_plan_fin_ult1',
       u'ind_pres_fin_ult1', u'ind_reca_fin_ult1', u'ind_tjcr_fin_ult1',
       u'ind_valo_fin_ult1', u'ind_viv_fin_ult1', u'ind_nomina_ult1',
       u'ind_nom_pens_ult1', u'ind_recibo_ult1', u'targets_str',
       u'lc_ind_ahor_fin_ult1', u'lc_ind_aval_fi

In [7]:
train_df.head(10)

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,lc_targets_str,lc_targets_dec,targets_logcount1,targets_logcount2,lc_targets_logcount2,targets_logcount2_diff,targets_logDec,lc_targets_logDec,targets_diff,targets_logdiff
210092,2015-04-28,15982,1,0,0,10,1995-01-16,0,243,1.0,...,,,0.003139,0.00107,-99999.0,-99999.0,14.780031,-99999.0,-99999.0,-99999.0
1051693,2015-05-28,15982,1,0,0,10,1995-01-16,0,244,1.0,...,001010000000100000100111,2623527.0,0.003189,0.00107,0.00107,0.0,14.780028,2.758745,-6.0,-1.94591
210170,2015-04-28,16122,0,0,1,8,1995-02-23,0,242,1.0,...,,,0.030142,0.060245,-99999.0,-99999.0,14.556091,-99999.0,-99999.0,-99999.0
1051615,2015-05-28,16122,0,0,1,8,1995-02-23,0,243,1.0,...,001000000000000000000000,2097152.0,0.030661,0.060245,0.060245,0.0,14.556091,2.744452,0.0,0.0
210049,2015-04-28,16359,0,0,0,7,1995-03-30,0,241,1.0,...,,,0.008113,0.003392,-99999.0,-99999.0,13.169873,-99999.0,-99999.0,-99999.0
1051753,2015-05-28,16359,0,0,0,7,1995-03-30,0,242,1.0,...,000010000000000000100111,524327.0,0.003189,0.00107,0.003392,-0.002322,13.169871,2.651118,-1.0,-0.693147
210467,2015-04-28,16841,0,0,0,11,1998-01-12,0,207,1.0,...,,,0.030142,0.060245,-99999.0,-99999.0,14.556091,-99999.0,-99999.0,-99999.0
1051317,2015-05-28,16841,0,0,0,11,1998-01-12,0,208,1.0,...,001000000000000000000000,2097152.0,0.030661,0.060245,0.060245,0.0,14.556091,2.744452,0.0,0.0
209472,2015-04-28,17380,0,0,0,14,2000-12-07,0,172,1.0,...,,,0.003139,0.00107,-99999.0,-99999.0,13.291134,-99999.0,-99999.0,-99999.0
1052276,2015-05-28,17380,0,0,0,14,2000-12-07,0,173,1.0,...,000010010000100000110011,591923.0,0.003189,0.00107,0.00107,0.0,13.29113,2.659639,-2.0,-1.098612


Useful structures

In [10]:
def get_common_clients(df1, mask1, mask2, df2=None):
    """Mask the clients that are present in both row selections.

    The client ids ('ncodpers') found in `df1[mask1]` are intersected with
    those found in `df2[mask2]` (or in `df1[mask2]` when `df2` is not given).

    Returns a boolean Series over the rows of `df1` when `df2` is None,
    otherwise a tuple `(mask over df1, mask over df2)`.
    """
    second_frame = df1 if df2 is None else df2
    ids_first = set(df1[mask1]['ncodpers'].unique())
    ids_second = set(second_frame[mask2]['ncodpers'].unique())
    shared_ids = list(ids_first & ids_second)

    in_first = df1['ncodpers'].isin(shared_ids)
    if df2 is None:
        return in_first
    return in_first, df2['ncodpers'].isin(shared_ids)

In [11]:
# Distinct dates present in each frame
train_months = train_df['fecha_dato'].unique()
val_months = val_df['fecha_dato'].unique()

# Lookup table keyed by to_yearmonth(date) (queried later with values such as
# 201505) and returning the date as it is stored in 'fecha_dato'
months = list(set(train_months) | set(val_months))
months_ym_map = {}
for m in months:
    months_ym_map[to_yearmonth(m)] = m

In [109]:
from common import get_added_products, remove_last_choice, apk, map7_score2
from visualization import visualize_train_test, visualize_folds, compare_two_datasets, compare_folds, compare_folds2

Create profiles and create models for profiles

In [110]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Merge
from keras.utils import np_utils

Using Theano backend.


### Train a model

In [203]:
# Target-derived columns added to the model inputs (selected into X next to
# `features`, and used as profile 1); 'targets_diff' also drives the
# active-client filter in prepare_to_fit.
target_features = ['targets_diff', 'targets_logdiff', 'targets_logcount2_diff', 'targets_logcount2', 'targets_logcount1', 'targets_logDec']

In [204]:
#active_clients_mask = ~train_df['targets_diff'].isin([0, -99999])
#print active_clients_mask.shape, active_clients_mask.sum()

In [293]:
# Inputs come from `current_month`, labels from the same month one year later
# (YYYYMM + 100).
current_month = 201505
next_year_month = current_month + 100

# Pick, for each of the two months, whichever frame contains that date.
# Both keys must exist in months_ym_map, i.e. both months must have been loaded.
df1 = train_df if months_ym_map[current_month] in train_months else val_df
df2 = train_df if months_ym_map[next_year_month] in train_months else val_df
#df2 = val_fe_df

# Row masks for the two months
month_mask = df1['fecha_dato'] == months_ym_map[current_month]
next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
        
# Restrict to clients present in both months (one mask per frame)
common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)

In [295]:
# X: model inputs of the common clients for the current month
X = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + target_features + features]            
# Y: targets of the same clients one year later
Y = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato', 'targets_str'] + TARGET_LABELS]    
# Last-choice columns of the current-month rows, renamed so that they can be
# selected with TARGET_LABELS when scoring
clients_last_choice = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + LC_TARGET_LABELS]
clients_last_choice.columns = ['ncodpers', 'fecha_dato'] + TARGET_LABELS
# Rows of X and Y must describe the same clients in the same order
assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "There is a problem in alignment"
# Share X's index so that boolean masks computed on X can be applied to Y
Y.index = X.index

In [296]:
print X.shape
X.head(10)

(149875, 27)


Unnamed: 0,ncodpers,fecha_dato,targets_diff,targets_logdiff,targets_logcount2_diff,targets_logcount2,targets_logcount1,targets_logDec,ind_empleado,pais_residencia,...,tiprel_1mes,indresi,indext,conyuemp,canal_entrada,indfall,nomprov,ind_actividad_cliente,renta,segmento
2518926,15890,2015-05-28,0.0,0.0,0.0,0.000223,0.000247,13.23462,1,0,...,0,0,0,1,1,0,0,1.0,4,0
2308266,15893,2015-05-28,0.0,0.0,0.0,0.000589,0.001218,2.833213,0,0,...,0,0,0,1,1,0,0,1.0,11,0
2308272,15901,2015-05-28,0.0,0.0,0.0,0.000211,0.000247,14.57306,4,0,...,0,0,0,1,1,0,0,1.0,9,1
2308275,15906,2015-05-28,-2.0,-1.098612,0.0,4.6e-05,0.000156,13.349075,0,0,...,0,0,0,1,1,0,0,1.0,5,1
2308278,15911,2015-05-28,0.0,0.0,0.0,5.8e-05,0.000156,13.196866,4,0,...,0,0,0,1,1,0,0,1.0,10,1
2308263,15917,2015-05-28,0.0,0.0,0.0,0.00068,0.001471,14.586863,4,0,...,0,0,0,1,1,0,11,1.0,-1,0
2308246,15919,2015-05-28,0.0,0.0,0.0,0.000369,0.00065,14.588785,2,0,...,0,0,0,1,1,0,0,1.0,11,1
2308250,15923,2015-05-28,0.0,0.0,0.0,0.000621,0.0013,13.173771,2,0,...,0,0,0,1,1,0,0,1.0,11,0
2308252,15925,2015-05-28,0.0,0.0,0.0,0.000589,0.001255,14.586878,4,0,...,0,0,0,1,1,0,0,1.0,1,0
2308253,15926,2015-05-28,0.0,0.0,0.0,0.00038,0.000684,14.587344,4,0,...,0,0,0,1,1,0,0,1.0,8,0


In [297]:
print Y.shape
Y.head(10)

(149875, 27)


Unnamed: 0,ncodpers,fecha_dato,targets_str,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
2518926,15890,2016-05-28,000010001000100100100111,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,1,1,1
2308266,15893,2016-05-28,000000000000000000010000,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2308272,15901,2016-05-28,001000001000110000110001,0,0,1,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
2308275,15906,2016-05-28,000010011001000000000011,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1
2308278,15911,2016-05-28,000010000001100000110001,0,0,0,0,1,0,0,...,0,0,0,0,1,1,0,0,0,1
2308263,15917,2016-05-28,001000010000000000000001,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2308246,15919,2016-05-28,001000000001000001000001,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2308250,15923,2016-05-28,000010000000100001100111,0,0,0,0,1,0,0,...,0,0,0,1,1,0,0,1,1,1
2308252,15925,2016-05-28,001000010000000000100000,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2308253,15926,2016-05-28,001000010000010000010000,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [298]:
print clients_last_choice.shape
clients_last_choice.head(10)

(149875, 26)


Unnamed: 0,ncodpers,fecha_dato,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
2518926,15890,2015-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
2308266,15893,2015-05-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2308272,15901,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2308275,15906,2015-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
2308278,15911,2015-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2308263,15917,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2308246,15919,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2308250,15923,2015-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
2308252,15925,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2308253,15926,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Run KFold Cross-validation 

In [299]:
def create_model(profiles):
    """Build and compile one averaged multi-branch network.

    One small dense branch is created per entry of `profiles` (iterated in the
    dict's own order); its input width is the number of columns listed for
    that profile and its output has one sigmoid unit per entry of
    TARGET_LABELS. The branch outputs are averaged by a Merge layer.

    Returns the compiled Sequential model (binary cross-entropy, nadam).
    """
    nb_targets = len(TARGET_LABELS)

    def _build_branch(nb_inputs):
        # Dense(50) -> Dense(10 + nb_inputs) -> Dense(nb_targets), with dropout
        branch = Sequential()
        branch.add(Dense(50, init='uniform', input_shape=(nb_inputs,), activation='relu'))
        branch.add(Dropout(0.15))
        branch.add(Dense(10 + nb_inputs, activation='relu'))
        branch.add(Dropout(0.15))
        branch.add(Dense(nb_targets, activation='sigmoid'))
        return branch

    final_model = Sequential()
    branches = [_build_branch(len(profiles[key])) for key in profiles]
    final_model.add(Merge(branches, mode='ave'))
    final_model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return final_model

In [300]:
# Each entry lists the input columns of one branch of the merged model:
# create_model builds one branch per key and prepare_to_fit / prepare_to_test
# build one input array per key, all iterating this same dict.
profiles = {
    0: ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
    1: target_features,
    2: ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
    3: ['pais_residencia', 'sexo', 'age', 'renta', 'targets_logdiff', 'targets_logcount2_diff','targets_logcount2','targets_logcount1'],
    11: ['nomprov', 'ind_nuevo', 'renta', 'ind_actividad_cliente', 'canal_entrada']
}


In [301]:
# CROSS VALIDATION
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
n_highest = 7


def prepare_to_fit(X_train, Y_train, profiles):
    """Turn training frames into the arrays expected by the merged model.

    Only rows whose 'targets_diff' is neither 0 nor -99999 are kept.

    Returns `(x_train, y_train)`: a list with one array per profile (columns
    `profiles[key]`, in the dict's iteration order) and the array of the
    TARGET_LABELS columns of the matching `Y_train` rows.
    """
    # Train on "active" clients only; the mask built on X is applied to Y too
    is_active = ~X_train['targets_diff'].isin([0, -99999])
    active_inputs = X_train[is_active]
    y_train = Y_train[is_active][TARGET_LABELS].values
    x_train = [active_inputs[profiles[key]].values for key in profiles]
    return x_train, y_train


def prepare_to_test(X_val, profiles, Y_val=None):
    """Turn validation/test frames into the arrays expected by the model.

    No row filtering is applied. Returns the list of per-profile input arrays
    (columns `profiles[key]`, in the dict's iteration order); when `Y_val` is
    given, returns `(x_val, y_val)` with `y_val` the TARGET_LABELS columns of
    `Y_val` as an array.
    """
    y_val = None if Y_val is None else Y_val[TARGET_LABELS].values
    x_val = [X_val[profiles[key]].values for key in profiles]
    if Y_val is None:
        return x_val
    return x_val, y_val

    
def pred_to_targets(y_pred, n_highest=7):
    """Return, per row, the column indices of the `n_highest` largest scores.

    `y_pred` is a 2-D array of scores; the result holds, for every row, the
    column indices ordered from the highest score downwards.
    """
    # Ascending argsort flipped left-to-right gives a descending ranking
    ranking = np.fliplr(np.argsort(y_pred, axis=1))
    return ranking[:, :n_highest]
    

def cross_val_score2(data, 
                     profiles,
                     nb_folds=5, 
                     prepare_to_fit_func=prepare_to_fit, 
                     prepare_to_test_func=prepare_to_test,
                     transform_pred_func=pred_to_targets,
                     create_model_func=create_model,
                     nb_epoch=250,
                     batch_size=5000):
    """K-fold cross-validation of the profile model, scored with map7_score2.

    Parameters
    ----------
    data : tuple `(x_df, y_df, clients_last_choice_df)` of row-aligned frames.
    profiles : dict mapping a profile key to its list of input columns.
    nb_folds : number of (unshuffled) KFold splits.
    prepare_to_fit_func, prepare_to_test_func : build the model inputs/labels
        from the fold frames.
    transform_pred_func : converts raw predictions into ranked target indices.
    create_model_func : builds a fresh, compiled model for every fold.
    nb_epoch, batch_size : forwarded to the model's `fit`; the defaults are
        the values that used to be hard-coded here.

    Returns
    -------
    numpy array with one score per fold.
    """
    def _shapes(arrays):
        # Inputs are a list of arrays for the multi-branch model
        return [a.shape for a in arrays] if isinstance(arrays, list) else arrays.shape

    x_df, y_df, clients_last_choice_df = data
    kf = KFold(n_splits=nb_folds)
    scores = []

    for train_index, test_index in kf.split(range(x_df.shape[0])):
        # KFold yields positions, so select rows positionally; going through
        # index labels would duplicate rows if a label were repeated.
        X_train, X_val = x_df.iloc[train_index], x_df.iloc[test_index]
        Y_train, Y_val = y_df.iloc[train_index], y_df.iloc[test_index]
        CLC_val = clients_last_choice_df.iloc[test_index]

        x_train, y_train = prepare_to_fit_func(X_train, Y_train, profiles)
        x_val, y_val = prepare_to_test_func(X_val, profiles, Y_val)

        logging.info("- Train/Val shapes : {}, {} | {}, {}".format(
                _shapes(x_train),
                _shapes(x_val),
                y_train.shape,
                y_val.shape)
        )

        logging.info("- Create the model : ")
        estimator = create_model_func(profiles)
        logging.info("- Fit the model")
        hist = estimator.fit(x_train, y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0)
        for key in hist.history:
            logging.info("-- %s : min=%f, max=%f" % (key, np.min(hist.history[key]), np.max(hist.history[key])))

        logging.info("- Predict using trained model")
        y_pred = estimator.predict(x_val, verbose=0)
        y_pred = transform_pred_func(y_pred)
        logging.info("- Compute map7 score")
        # The last-choice values are handed to the scorer together with the
        # true targets and the ranked predictions
        scores.append(map7_score2(y_val, y_pred, CLC_val[TARGET_LABELS].values))

    return np.array(scores)

In [302]:
# Cross-validate on the X / Y / clients_last_choice frames built above.
# Printed fields: nb folds | min | mean | max | std of the per-fold scores.
nb_folds = 3
results = cross_val_score2((X, Y, clients_last_choice), 
                            profiles=profiles,
                            nb_folds=nb_folds)
print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

INFO:root:- Train/Val shapes : [(5470, 8), (5470, 6), (5470, 5), (5470, 8), (5470, 5)], [(49959, 8), (49959, 6), (49959, 5), (49959, 8), (49959, 5)] | (5470, 24), (49959, 24)
INFO:root:- Create the model : 
INFO:root:- Fit the model
INFO:root:-- acc : min=0.544317, max=0.889321
INFO:root:-- loss : min=0.240433, max=0.687929
INFO:root:- Predict using trained model
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.062641468385
INFO:root:- Train/Val shapes : [(7313, 8), (7313, 6), (7313, 5), (7313, 8), (7313, 5)], [(49958, 8), (49958, 6), (49958, 5), (49958, 8), (49958, 5)] | (7313, 24), (49958, 24)
INFO:root:- Create the model : 
INFO:root:- Fit the model
INFO:root:-- acc : min=0.552521, max=0.865838
INFO:root:-- loss : min=0.283247, max=0.688595
INFO:root:- Predict using trained model
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0508657224449
INFO:root:- Train/Val shapes : [(7505, 8), (7505, 6), (7505, 5), (7505, 8), (7505, 5)], [(49958, 8), (499

Cross-Validation 
 3 | 0.050866 | 0.060776 | 0.068821 | 0.00745 


Compute cross-validation across several months

In [361]:
# Repeat the data selection + cross-validation for every month in `yms`.
# Note: this loop rebinds the notebook-level names df1, df2, month_mask,
# next_year_month_mask, X, Y and clients_last_choice.
nb_folds = 3
#yms = [201503, 201504, 201505]
yms = [201505]

for ym in yms:
    logging.info("- Process month : %s" % ym)
    # Labels are taken from the same month one year later (YYYYMM + 100)
    ym1 = ym + 100    
    # Pick whichever frame contains each month; both must be in months_ym_map
    df1 = train_df if months_ym_map[ym] in train_months else val_df
    df2 = train_df if months_ym_map[ym1] in train_months else val_df

    month_mask = df1['fecha_dato'] == months_ym_map[ym]
    next_year_month_mask = df2['fecha_dato'] == months_ym_map[ym1]
        
    # Keep only clients present in both months
    common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)
    
    X = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + target_features + features]            
    Y = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato', 'targets_str'] + TARGET_LABELS]    
    # Last-choice columns renamed so they can be selected with TARGET_LABELS
    clients_last_choice = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + LC_TARGET_LABELS]
    clients_last_choice.columns = ['ncodpers', 'fecha_dato'] + TARGET_LABELS

    assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "There is a problem in alignment"
    # Share X's index so masks computed on X can be applied to Y
    Y.index = X.index

    results = cross_val_score2((X, Y, clients_last_choice), 
                                profiles=profiles,
                                nb_folds=nb_folds)
    # Printed fields: nb folds | min | mean | max | std of the per-fold scores
    print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

INFO:root:- Process month : 201505
INFO:root:- Train/Val shapes : [(5470, 8), (5470, 6), (5470, 5), (5470, 8), (5470, 5)], [(49959, 8), (49959, 6), (49959, 5), (49959, 8), (49959, 5)] | (5470, 24), (49959, 24)
INFO:root:- Create the model : 
INFO:root:- Fit the model
INFO:root:-- acc : min=0.490097, max=0.890532
INFO:root:-- loss : min=0.236189, max=0.710621
INFO:root:- Predict using trained model
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0601623156385
INFO:root:- Train/Val shapes : [(7313, 8), (7313, 6), (7313, 5), (7313, 8), (7313, 5)], [(49958, 8), (49958, 6), (49958, 5), (49958, 8), (49958, 5)] | (7313, 24), (49958, 24)
INFO:root:- Create the model : 
INFO:root:- Fit the model
INFO:root:-- acc : min=0.533753, max=0.862676
INFO:root:-- loss : min=0.284812, max=0.695510
INFO:root:- Predict using trained model
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0510950976598
INFO:root:- Train/Val shapes : [(7505, 8), (7505, 6), (7505, 5), (750

Cross-Validation 
 3 | 0.051095 | 0.059726 | 0.067921 | 0.00688 


### Test 1 : Train on active clients + feature engineering

#### Data: 

- train_yearmonths_list = [201503, 201504, 201505, 201506, 201603, 201604]
- val_yearmonth = [201605]
- train_nb_clients = 150000

#### Feature engineering

- age -> age group
- income -> income group
- targets logcount1, logcount2, logcount2_diff 
- target logDec, diff

#### Model

- 75 -> 50 -> 30 ->
- sigmoid, binary_crossentropy, nadam, accuracy, 

**Conf:**
- batch_size=5000

*Cross-validation results :* 

 Year-Month | Nb epoch | Nb folds | Min MAP@7 | Mean MAP@7 | Max MAP@7 | STD MAP@7
--- | --- | --- | --- | --- | --- | ---
201503 | 250 | 3 | 0.051612 | 0.064816 | 0.078062 | 0.01080 
201504 | 250 | 3 | 0.052616 | 0.063971 | 0.072449 | 0.00835 
201505 | 250 | 3 | 0.051095 | 0.059726 | 0.067921 | 0.00688 

Kaggle : 0.020662

#### Model 1 

- 50 -> 30 ->
- sigmoid, binary_crossentropy, nadam, accuracy, 

**Conf:**
- batch_size=2000

*Cross-validation results :* 

 Nb epoch | Nb folds | Min MAP@7 | Mean MAP@7 | Max MAP@7 | STD MAP@7
 --- | --- | --- | --- | --- | ---
 150 | 3 | 0.025802 | 0.030137 | 0.037217 | 0.00505 


Kaggle : 0.0197579

## Train model for predictions

In [372]:
# Same month selection as in the cross-validation section: inputs from
# `current_month`, labels from one year later (YYYYMM + 100).
current_month = 201505
next_year_month = current_month + 100

# Pick whichever frame contains each month; both must be in months_ym_map
df1 = train_df if months_ym_map[current_month] in train_months else val_df
df2 = train_df if months_ym_map[next_year_month] in train_months else val_df

month_mask = df1['fecha_dato'] == months_ym_map[current_month]
next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
        
# Keep only clients present in both months
common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)

In [373]:
# X: inputs of the common clients for the current month
X = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + target_features + features]            
# Y: targets of the same clients one year later
Y = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato', 'targets_str'] + TARGET_LABELS]    
# Last-choice columns renamed so they can be selected with TARGET_LABELS
clients_last_choice = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + LC_TARGET_LABELS]
clients_last_choice.columns = ['ncodpers', 'fecha_dato'] + TARGET_LABELS
# All three frames must list the same clients in the same order
assert (X['ncodpers'].values == Y['ncodpers'].values).all(), "There is a problem in alignment"
assert (X['ncodpers'].values == clients_last_choice['ncodpers'].values).all(), "There is a problem in alignment"
# Share X's index so masks computed on X can be applied to Y
Y.index = X.index

In [378]:
def train_model(X_train, Y_train, profiles, nb_epoch=250, batch_size=5000):
    """Fit a fresh profile model on the given training frames.

    Parameters
    ----------
    X_train, Y_train : row-aligned input and target frames; prepare_to_fit
        selects the rows and columns that are actually used.
    profiles : dict mapping a profile key to its list of input columns.
    nb_epoch, batch_size : forwarded to the model's `fit`; the defaults are
        the values that used to be hard-coded here.

    Returns
    -------
    The fitted estimator returned by create_model.
    """
    x_train, y_train = prepare_to_fit(X_train, Y_train, profiles)
    logging.info("- Train data shapes : {}, {}".format(
            [i.shape for i in x_train] if isinstance(x_train, list) else x_train.shape, 
            y_train.shape)
    )

    logging.info("- Create the model")
    estimator = create_model(profiles)

    logging.info("- Fit the model")
    hist = estimator.fit(x_train, y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0)
    # Summarise every recorded training metric by its range over the epochs
    for key in hist.history:
        logging.info("-- %s : min=%f, max=%f" % (key, np.min(hist.history[key]), np.max(hist.history[key])))
    return estimator

estimator = train_model(X, Y, profiles)

INFO:root:- Train data shapes : [(10144, 8), (10144, 6), (10144, 5), (10144, 8), (10144, 5)], (10144, 24)
INFO:root:- Create the model
INFO:root:- Fit the model
INFO:root:-- acc : min=0.606126, max=0.870129
INFO:root:-- loss : min=0.278354, max=0.679641


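Check the MAP@7 score against the 2016/05 targets. Note that `X` / `Y` here are the same frames the estimator was fitted on (restricted to active clients during fitting), so this is not a held-out score.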

In [379]:
# Score the fitted estimator on the full X / Y frames (no active-client
# filtering here, unlike in prepare_to_fit). These are the frames that were
# passed to train_model, so the value is not a held-out estimate.
x_val, y_val = prepare_to_test(X, profiles, Y)

logging.info("- Predict using trained model")
y_pred = estimator.predict(x_val, verbose=0)    
# Raw scores -> ranked target indices (top 7 by default)
y_pred = pred_to_targets(y_pred)
logging.info("- Compute map7 score")
print map7_score2(y_val, y_pred, clients_last_choice[TARGET_LABELS].values)

INFO:root:- Predict using trained model
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0617705903067


0.0617705903067


## Prediction for 2016/06

In [334]:
from common import minimal_clean_data_inplace, preprocess_data_inplace

In [335]:
# Load the test file, clean/preprocess it in place and order rows by client id.
# NOTE(review): load_data2 and TEST_FILE_PATH are not imported in any cell
# visible in this notebook -- add the import so the cell runs on a fresh kernel.
test_df = load_data2(TEST_FILE_PATH, [])
minimal_clean_data_inplace(test_df)
preprocess_data_inplace(test_df)
test_df = test_df.sort_values(['ncodpers'])


INFO:root:-- Read all data from the file : ../data/test_ver2.csv
INFO:root:- Number of lines with unknown data : 0
INFO:root:- Number of columns with nan : 10


In [339]:
print test_df.shape, 

(929615, 22)


In [383]:
# Rebinds train_df / val_df with a different month selection.
# NOTE(review): months_ym_map, train_months and val_months are not rebuilt
# here, so they still describe the previously loaded frames -- confirm intent.
train_df, val_df = load_trainval([201502], [201506], 100)

INFO:root:- Load training data : [201501, 201502]
INFO:root:-- Select 100 clients
INFO:root:- Number of lines with unknown data : 2
INFO:root:- Number of columns with nan : 5
INFO:root:-- Process date : 201502
INFO:root:- Load validation : [201505, 201506]
INFO:root:-- Select max clients
INFO:root:- Number of lines with unknown data : 3716
INFO:root:- Number of columns with nan : 10
INFO:root:-- Process date : 201506


In [338]:
# Match the clients of the reference month with the clients of the test file.
# NOTE(review): val_fe_df is not defined in any cell visible in this notebook;
# it must come from a removed cell or an earlier session -- define it
# explicitly before this cell.
current_month = 201506

month_mask = val_fe_df['fecha_dato'] == months_ym_map[current_month]
next_month_mask = test_df['fecha_dato'] == '2016-06-28'
        
# One mask per frame: clients present both in val_fe_df (reference month) and in test_df
common_clients_mask1, common_clients_mask2 = get_common_clients(val_fe_df, month_mask, next_month_mask, test_df)

In [342]:
print (common_clients_mask1 & month_mask).sum(), common_clients_mask2.sum()

925252 925252


In [343]:
# common_clients_mask1 and month_mask were computed on val_fe_df, so they must
# index that same frame (df1 may point to a different frame at this stage).
X = val_fe_df[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + target_features + features]
# For the month to predict, the "last choice" is the set of products held in
# the reference month, i.e. its TARGET_LABELS columns.
clients_last_choice = val_fe_df[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + TARGET_LABELS]

In [344]:
print X.shape

(925252, 27)


In [345]:
def get_submission(predicted_added_products, clients, clc, target_labels):
    """Assemble the submission frame from ranked predictions.

    For every client, `remove_last_choice` is applied to the predicted target
    indices and the client's last-choice row; the remaining indices are mapped
    to `target_labels` and joined with single spaces.

    Returns a DataFrame with the columns 'ncodpers' (from `clients`) and
    'added_products'.
    """
    added_products_col = []
    pairs = zip(predicted_added_products, clc)
    for count, (products, last_choice) in enumerate(pairs, 1):
        predictions = remove_last_choice(products, last_choice)
        added_products_col.append(' '.join(target_labels[i] for i in predictions))
        # Progress report every 100000 clients
        if count % 100000 == 0:
            logging.info("Elapsed : %i", count)

    return pd.DataFrame(data={'ncodpers': clients, 'added_products': added_products_col},
                        columns=['ncodpers', 'added_products'])

In [371]:
(clients_last_choice['ncodpers'].values == X['ncodpers'].values).all()

True

In [346]:
x_val = prepare_to_test(X, profiles)
logging.info("- Predict using trained model")
y_pred = estimator.predict(x_val, verbose=0)    
y_pred = pred_to_targets(y_pred)
logging.info("- Get submission dataframe:")
clients = X['ncodpers'].values
submission = get_submission(y_pred, clients, clients_last_choice[TARGET_LABELS].values, TARGET_LABELS)

INFO:root:- Predict using trained model
INFO:root:- Get submission dataframe:
INFO:root:Elapsed : 100000
INFO:root:Elapsed : 200000
INFO:root:Elapsed : 300000
INFO:root:Elapsed : 400000
INFO:root:Elapsed : 500000
INFO:root:Elapsed : 600000
INFO:root:Elapsed : 700000
INFO:root:Elapsed : 800000
INFO:root:Elapsed : 900000


In [358]:
# Every client of the test file must appear in the submission: append the
# clients for which no prediction was produced, with an empty product list.
submission_clients = set(submission['ncodpers'].unique())
test_clients = set(test_df['ncodpers'].unique())
if submission_clients != test_clients:
    missing_clients = sorted(test_clients - submission_clients)
    # 'added_products' is a space-separated string of product names, so the
    # "no prediction" value is an empty string (zeros would be written to the
    # CSV as the literal text "0.0").
    missing_added_products = [''] * len(missing_clients)
    submission = pd.concat([submission, 
                            pd.DataFrame(data={
                                'ncodpers': missing_clients, 
                                'added_products': missing_added_products
                            }, columns=['ncodpers', 'added_products'])],
                           ignore_index=True)

Get submission DataFrame and write csv file

In [359]:
print submission.shape
submission.head()

(929615, 2)


Unnamed: 0,ncodpers,added_products
0,15889,ind_recibo_ult1 ind_ecue_fin_ult1 ind_cno_fin_...
1,15890,ind_cco_fin_ult1 ind_fond_fin_ult1 ind_reca_fi...
2,15892,ind_cno_fin_ult1 ind_nom_pens_ult1 ind_fond_fi...
3,15893,ind_recibo_ult1 ind_cco_fin_ult1 ind_nom_pens_...
4,15894,ind_cno_fin_ult1 ind_fond_fin_ult1


In [360]:
from datetime import datetime
import csv

logging.info('- Generate submission')
# File name carries the generation time, down to the minute
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M")
submission_file = '../results/submission_' + timestamp + '.csv'

submission.to_csv(submission_file, index=False, index_label=False)

INFO:root:- Generate submission


In [362]:
# Print the header and the first four rows of a written submission file.
# NOTE(review): the path is hard-coded to one specific run and is not the
# `submission_file` generated above -- confirm this is the intended file.
with open('../results/submission_2016-11-17-16-37.csv', 'r') as r:
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    

ncodpers,added_products

15889,ind_ecue_fin_ult1 ind_recibo_ult1 ind_dela_fin_ult1 ind_cno_fin_ult1 ind_nom_pens_ult1 ind_ctop_fin_ult1 ind_nomina_ult1 

1170544,ind_recibo_ult1 ind_cno_fin_ult1 ind_ecue_fin_ult1 ind_ctop_fin_ult1 ind_nom_pens_ult1 ind_reca_fin_ult1 ind_ctpp_fin_ult1 

1170545,ind_recibo_ult1 ind_cno_fin_ult1 ind_ecue_fin_ult1 ind_nom_pens_ult1 ind_nomina_ult1 ind_reca_fin_ult1 ind_dela_fin_ult1 

1170547,ind_recibo_ult1 ind_cno_fin_ult1 ind_ecue_fin_ult1 ind_nom_pens_ult1 ind_nomina_ult1 ind_reca_fin_ult1 ind_dela_fin_ult1 

