# NN tryouts on SPR data, inspired by Kaggle Forum "When less is more"

Load the training and validation data, where each month is laid out as
    month : [ Features | Targets | Difference | Last Choice Targets ]
    
    - Features : client attribute columns (see the `features` list below)
    - Target labels : TARGET_LABELS(month), 'targets_str', 'targets_features'
    - Difference with the previous month
    - Last Choice Targets : LC_TARGET_LABELS(month-1), 'lc_targets_str', 'lc_targets_features'


In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.INFO)

import matplotlib.pylab as plt
%matplotlib inline



In [2]:
from dataset import load_trainval, LC_TARGET_LABELS, TARGET_LABELS_FRQ,  decimal_to_dummies, targets_str_to_indices, targets_dec_to_indices
from common import to_yearmonth, TARGET_LABELS 
from common import target_str_to_labels, TARGET_LABELS2

In [3]:
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

In [4]:
# Year-months (yyyymm) requested for training; the loader log below shows that
# the month preceding each of them is loaded too.
train_yearmonths_list = [201504, 201505, 201604]
# train_yearmonths_list = [201504]
# Year-month requested for validation.
val_yearmonth = [201605]
# Number of clients to select for the training data ("Select 150000 clients" in the log).
train_nb_clients = 150000
# train_nb_clients = 150
train_df, val_df = load_trainval(train_yearmonths_list, val_yearmonth, train_nb_clients)

INFO:root:- Load training data
INFO:root:- Load data : [201503, 201504, 201505, 201603, 201604]
INFO:root:-- Select 150000 clients
INFO:root:- Number of lines with unknown data : 75
INFO:root:- Number of columns with nan : 10
INFO:root:-- Process date : 201504
INFO:root:-- Process date : 201505
INFO:root:-- Process date : 201604
INFO:root:- Load validation data
INFO:root:- Load data : [201604, 201605]
INFO:root:-- Select max clients
INFO:root:- Number of lines with unknown data : 0
INFO:root:- Number of columns with nan : 10
INFO:root:-- Process date : 201605
INFO:root:-- Compute logCount dictionary
INFO:root:-- Add logCount columns
INFO:root:-- Process month : 2015-03-28
INFO:root:-- Process month : 2015-04-28
INFO:root:-- Process month : 2015-05-28
INFO:root:-- Process month : 2016-03-28
INFO:root:-- Process month : 2016-04-28
INFO:root:-- Process month : 2016-04-28
INFO:root:-- Process month : 2016-05-28
INFO:root:-- Add logDecimal columns
INFO:root:-- Transform age/renta/logdiff
IN

Display loaded data

In [6]:
train_df[['fecha_dato', 'ncodpers'] + TARGET_LABELS_FRQ].head(10)

Unnamed: 0,fecha_dato,ncodpers,ind_ahor_fin_ult1_frq,ind_aval_fin_ult1_frq,ind_cco_fin_ult1_frq,ind_cder_fin_ult1_frq,ind_cno_fin_ult1_frq,ind_ctju_fin_ult1_frq,ind_ctma_fin_ult1_frq,ind_ctop_fin_ult1_frq,...,ind_hip_fin_ult1_frq,ind_plan_fin_ult1_frq,ind_pres_fin_ult1_frq,ind_reca_fin_ult1_frq,ind_tjcr_fin_ult1_frq,ind_valo_fin_ult1_frq,ind_viv_fin_ult1_frq,ind_nomina_ult1_frq,ind_nom_pens_ult1_frq,ind_recibo_ult1_frq
421107,2015-03-28,15893,0.999916,0.999947,0.224104,0.999441,0.899346,0.988279,0.988851,0.832296,...,0.992017,0.987798,0.996571,0.935025,0.943944,0.033452,0.995269,0.931993,0.927308,0.84167
839330,2015-04-28,15893,0.999916,0.999947,0.224104,0.999441,0.899346,0.988279,0.988851,0.832296,...,0.992017,0.987798,0.996571,0.935025,0.943944,0.033452,0.995269,0.931993,0.927308,0.84167
1680872,2015-05-28,15893,0.999916,0.999947,0.224104,0.999441,0.899346,0.988279,0.988851,0.832296,...,0.992017,0.987798,0.996571,0.935025,0.943944,0.033452,0.995269,0.931993,0.927308,0.84167
2199895,2016-03-28,15893,0.999916,0.999947,0.224104,0.999441,0.899346,0.988279,0.988851,0.832296,...,0.992017,0.987798,0.996571,0.935025,0.943944,0.033452,0.995269,0.931993,0.927308,0.84167
3250071,2016-04-28,15893,0.999916,0.999947,0.224104,0.999441,0.899346,0.988279,0.988851,0.832296,...,0.992017,0.987798,0.996571,0.935025,0.943944,0.033452,0.995269,0.931993,0.927308,0.84167
421105,2015-03-28,15895,0.999916,0.999947,0.775896,0.999441,0.100654,0.988279,0.988851,0.832296,...,0.992017,0.012202,0.996571,0.064975,0.056056,0.033452,0.995269,0.931993,0.927308,0.15833
839328,2015-04-28,15895,0.999916,0.999947,0.775896,0.999441,0.100654,0.988279,0.988851,0.832296,...,0.992017,0.012202,0.996571,0.064975,0.056056,0.033452,0.995269,0.931993,0.927308,0.15833
1680874,2015-05-28,15895,0.999916,0.999947,0.775896,0.999441,0.100654,0.988279,0.988851,0.832296,...,0.992017,0.012202,0.996571,0.064975,0.056056,0.033452,0.995269,0.931993,0.927308,0.15833
2199893,2016-03-28,15895,0.999916,0.999947,0.775896,0.999441,0.899346,0.988279,0.988851,0.832296,...,0.992017,0.012202,0.996571,0.064975,0.056056,0.033452,0.995269,0.931993,0.927308,0.15833
3250073,2016-04-28,15895,0.999916,0.999947,0.775896,0.999441,0.899346,0.988279,0.988851,0.832296,...,0.992017,0.012202,0.996571,0.064975,0.056056,0.033452,0.995269,0.931993,0.927308,0.15833


Useful structures

In [7]:
def get_common_clients(df1, mask1, mask2, df2=None):
    """
    Flag the rows that belong to clients present in both selections.

    A client is "common" when its 'ncodpers' appears in `df1[mask1]` and in
    `df2[mask2]` (or in `df1[mask2]` when `df2` is not given).

    Returns a boolean Series aligned with `df1`, or a pair of boolean Series
    (aligned with `df1` and `df2`) when `df2` is given.
    """
    second_frame = df1 if df2 is None else df2
    first_ids = set(df1[mask1]['ncodpers'].unique())
    second_ids = set(second_frame[mask2]['ncodpers'].unique())
    shared_ids = list(first_ids & second_ids)

    in_first = df1['ncodpers'].isin(shared_ids)
    if df2 is None:
        return in_first
    return in_first, df2['ncodpers'].isin(shared_ids)

In [8]:
# 'fecha_dato' values available in each frame.
train_months = train_df['fecha_dato'].unique()
val_months = val_df['fecha_dato'].unique()

# Lookup from the `to_yearmonth` key of a month to its 'fecha_dato' value,
# covering both the training and the validation frame.
months = list(set(train_df['fecha_dato'].unique()) | set(val_df['fecha_dato'].unique()))
months_ym_map = dict((to_yearmonth(m), m) for m in months)

In [9]:
from common import get_added_products, remove_last_choice, apk, map7_score
from visualization import visualize_train_test, visualize_folds, compare_two_datasets, compare_folds, compare_folds2

Create profiles and create models for profiles

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Merge
from keras.utils import np_utils

Using Theano backend.


### Train a model

In [11]:
target_features = ['targets_diff', 'targets_logdiff', 'targets_logcount2_diff', 'targets_logcount2', 'targets_logcount1', 'targets_logDec']

In [12]:
#active_clients_mask = ~train_df['targets_diff'].isin([0, -99999])
#print active_clients_mask.shape, active_clients_mask.sum()

In [135]:
def get_XY(current_month, df1, next_year_month, df2):
    """
    Build model inputs, targets and last-choice rows for clients present in both months.

    current_month / next_year_month are keys of `months_ym_map`; inputs are read
    from `df1` at `current_month`, targets from `df2` at `next_year_month`.

    Returns (X, Y, clients_last_choice):
        X : ids + target_features + features + TARGET_LABELS_FRQ columns of df1
        Y : ids + 'targets_str' + TARGET_LABELS columns of df2, re-indexed like X
        clients_last_choice : ids + LC_TARGET_LABELS columns of df1

    Raises AssertionError when the 'ncodpers' of X and Y do not match row by row.
    """
    id_columns = ['ncodpers', 'fecha_dato']
    month_mask = df1['fecha_dato'] == months_ym_map[current_month]
    next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
    in_both1, in_both2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)

    current_rows = df1[in_both1 & month_mask]
    target_rows = df2[in_both2 & next_year_month_mask]

    X = current_rows[id_columns + target_features + features + TARGET_LABELS_FRQ.tolist()]
    Y = target_rows[id_columns + ['targets_str'] + TARGET_LABELS]
    last_choice_columns = id_columns + LC_TARGET_LABELS.tolist()
    clients_last_choice = current_rows[last_choice_columns]
    clients_last_choice.columns = last_choice_columns

    # Both selections must list the same clients in the same order, since Y is
    # re-labelled with X's index right below.
    same_clients = X['ncodpers'].values == Y['ncodpers'].values
    assert same_clients.all(), "There is a problem in alignment"
    Y.index = X.index

    return X, Y, clients_last_choice

In [136]:
# Inputs come from `current_month`; targets from the same month one year later (+100 in yyyymm).
current_month = 201505
next_year_month = current_month + 100

# Pick, for each of the two months, the frame (train or validation) that contains it.
df1 = train_df if months_ym_map[current_month] in train_months else val_df
#df1 = train_df
df2 = train_df if months_ym_map[next_year_month] in train_months else val_df
#df2 = train_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2)

In [None]:
print X.shape
X.head(10)

In [None]:
print Y.shape
Y.head(10)

In [19]:
print clients_last_choice.shape
clients_last_choice.head(10)

(149851, 26)


Unnamed: 0,ncodpers,fecha_dato,lc_ind_ahor_fin_ult1,lc_ind_aval_fin_ult1,lc_ind_cco_fin_ult1,lc_ind_cder_fin_ult1,lc_ind_cno_fin_ult1,lc_ind_ctju_fin_ult1,lc_ind_ctma_fin_ult1,lc_ind_ctop_fin_ult1,...,lc_ind_hip_fin_ult1,lc_ind_plan_fin_ult1,lc_ind_pres_fin_ult1,lc_ind_reca_fin_ult1,lc_ind_tjcr_fin_ult1,lc_ind_valo_fin_ult1,lc_ind_viv_fin_ult1,lc_ind_nomina_ult1,lc_ind_nom_pens_ult1,lc_ind_recibo_ult1
1680872,15893,2015-05-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1680874,15895,2015-05-28,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
1680875,15897,2015-05-28,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
1680880,15903,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1680881,15906,2015-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
1680883,15908,2015-05-28,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
1680869,15917,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1680856,15923,2015-05-28,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
1680857,15924,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1680861,15928,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Run KFold Cross-validation 

In [150]:
def create_model(profiles, ll):
    """
    Build one small network per profile and merge their outputs element-wise with 'max'.

    profiles : dict, profile key -> list of input column names (only the list
               length is used here, as the branch input size)
    ll       : number of output units of every branch

    Returns the compiled merged model (binary cross-entropy loss, 'nadam' optimizer).
    """
    def _make_branch(nb_inputs):
        # Dense(20, relu) -> Dropout(0.15) -> Dense(ll, sigmoid)
        # (a softmax output layer is kept in the history as a commented-out alternative)
        branch = Sequential()
        branch.add(Dense(20, init='uniform', input_shape=(nb_inputs,), activation='relu'))
        branch.add(Dropout(0.15))
        branch.add(Dense(ll, activation='sigmoid'))
        return branch

    final_model = Sequential()
    # Branch order follows the iteration order of `profiles`.
    branches = [_make_branch(len(profiles[key])) for key in profiles]
    final_model.add(Merge(branches, mode='max'))
    # An 'mae' loss was the other variant tried here.
    final_model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return final_model

In [255]:
# Positions (out of 24 targets) forming the first group; the loop below prints
# their TARGET_LABELS2 names.
common_group1 = [2, 18, 23] #range(24)
# Positions forming the second group.
common_group2 = [3, 4, 7, 8, 12] #range(24)
# Every remaining position in range(24).
others = list(set(range(24)) - set(common_group1 + common_group2))
for i, a in enumerate(TARGET_LABELS2[common_group1]):
    print i, a

0 Current Accounts
1 Credit Card
2 Direct Debit


In [256]:
target_groups = [common_group1, common_group2, others]
print sum([len(t) for t in target_groups])

24


In [257]:
# Profile keys whose inputs are NOT passed through StandardScaler
# (checked in prepare_to_fit / prepare_to_test).
not_normalized_profiles = [1, 101, 3]

# profile key -> list of input columns; create_model builds one network branch per key.
# The "## 3 | a | b | c | d" notes record earlier cross-validation runs in the format
# printed by the CV cells: nb_folds | min | mean | max | std of the map7 score.
profiles = {
    
## 3 | 0.026415 | 0.029433 | 0.033869 | 0.00320 (not normalized)
#  3 | 0.026515 | 0.032648 | 0.041429 | 0.00637 (normalized)
   0: ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
#    100: ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],

## 3 | 0.027578 | 0.030712 | 0.035311 | 0.00332 (should not be normalized)
    1: target_features,
    101: target_features,
    
    2: ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
    3: ['pais_residencia', 'sexo', 'age', 'renta', 'targets_logdiff', 'targets_logcount2_diff','targets_logcount2','targets_logcount1'],
    11: ['nomprov', 'ind_nuevo', 'renta', 'ind_actividad_cliente', 'canal_entrada'],

## 3 | 0.025410 | 0.026983 | 0.028351 | 0.00121  (normalized)
    12: TARGET_LABELS_FRQ,
#     112: TARGET_LABELS_FRQ,        
    
## 3 | 0.025081 | 0.026518 | 0.028983 | 0.00175 
##    12: TARGET_LABELS_FRQ[common_group],
##    112: TARGET_LABELS_FRQ[common_group],        
    
## 3 | 0.025158 | 0.027583 | 0.032355 | 0.00337  (normalized)
## 3 | 0.016943 | 0.020535 | 0.025388 | 0.00356  (not normalized)
#     13: list(TARGET_LABELS_FRQ[common_group]) + target_features,
#     113: list(TARGET_LABELS_FRQ[common_group]) + target_features,

#     14: ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'] + list(TARGET_LABELS_FRQ[common_group])
#     114: ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'] + list(TARGET_LABELS_FRQ[common_group])
}

In [258]:
def targets_str_to_indices(targets_str, **kwargs):
    """
    Convert rows of 0/1 flags into lists of the positions that are set.

    Note: this definition replaces the `targets_str_to_indices` imported from
    `dataset` at the top of the notebook.

    targets_str : iterable of rows; each row is a sequence (string, list or
                  array row) whose items satisfy int(item) == 1 when set
    kwargs :
        index_map : optional sequence/dict used to translate each position
                    (position -> index_map[position]); other keys are ignored

    Returns a 1-D object ndarray with one Python list per row. The array is
    always 1-D, even when all rows have the same number of set flags; building
    it with np.array(list_of_lists) would give a 2-D array in that case and
    fails on ragged rows with recent NumPy versions.
    """
    index_map = kwargs.get('index_map')
    rows = []
    for s in targets_str:
        positions = [i for i, c in enumerate(s) if int(c) == 1]
        if index_map is not None:
            positions = [index_map[i] for i in positions]
        rows.append(positions)

    # Fill element by element so NumPy never tries to broadcast the inner lists.
    out = np.empty(len(rows), dtype=np.object_)
    for k, row in enumerate(rows):
        out[k] = row
    return out

# `common_group` is not defined by any earlier cell (only common_group1 and
# common_group2 are), so this demo failed on a fresh kernel. The sorted union of
# the two groups reproduces the index list printed by the recorded run.
common_group = sorted(common_group1 + common_group2)

a = [[1, 0, 1, 0, 0, 0, 0, 0], [1, 0, 1, 1, 0, 0, 0, 0]]
print(targets_str_to_indices(a, index_map=common_group))
print(common_group)

[[2, 4] [2, 4, 7]]
[2, 3, 4, 7, 8, 12, 18, 23]


In [286]:
# CROSS VALIDATION
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
n_highest = 7


def prepare_to_fit(X_train, Y_train, profiles, scalers=None):
    """
    Build the per-profile training inputs and the training targets.

    Only "active" rows are kept: rows whose 'targets_diff' is 0 or -99999 are
    dropped from both X_train and Y_train (Y_train is filtered with the mask
    computed on X_train, so both frames must share the same index).

    X_train  : DataFrame with 'targets_diff' and every column named in `profiles`
    Y_train  : DataFrame of targets
    profiles : dict, profile key -> list of column names
    scalers  : optional dict; when given, the StandardScaler fitted for each
               normalized profile is stored in it under the profile key, so the
               caller can apply the training scaling to other data. Previously
               the fitted scalers were discarded.

    Returns (x_train, y_train): a list with one 2-D array per profile (in the
    iteration order of `profiles`) and the filtered target values.
    """
    # Select only active clients for the training part
    mask = ~X_train['targets_diff'].isin([0, -99999])
    X_train_ = X_train[mask]
    y_train = Y_train[mask].values
    x_train = []
    for key in profiles:
        x_train_ = X_train_[profiles[key]].values
        if key not in not_normalized_profiles:
            scaler = StandardScaler()
            x_train_ = scaler.fit_transform(x_train_)
            if scalers is not None:
                scalers[key] = scaler
        x_train.append(x_train_)
    return x_train, y_train


def prepare_to_test(X_val, profiles, Y_val=None, scalers=None):
    """
    Build the per-profile inputs (and optionally the target values) for prediction.

    X_val    : DataFrame with every column named in `profiles`
    profiles : dict, profile key -> list of column names
    Y_val    : optional DataFrame of targets
    scalers  : optional dict, profile key -> already fitted scaler exposing
               `transform` (e.g. one fitted on the training data). A profile
               found in this dict is scaled with that scaler.

    Without `scalers`, every normalized profile is standardized with a scaler
    fitted on X_val itself (the original behaviour), which means the data is
    not scaled with the statistics the model was trained on.

    Returns x_val (list with one 2-D array per profile, in the iteration order
    of `profiles`), or (x_val, y_val) when Y_val is given.
    """
    x_val = []
    for key in profiles:
        x_val_ = X_val[profiles[key]].values
        if scalers is not None and key in scalers:
            # Reuse the statistics supplied by the caller.
            x_val_ = scalers[key].transform(x_val_)
        elif key not in not_normalized_profiles:
            x_val_ = StandardScaler().fit_transform(x_val_)
        x_val.append(x_val_)

    if Y_val is not None:
        return x_val, Y_val.values
    return x_val

    
def pred_to_targets_indices(y_probas, **kwargs):
    """
    Turn a (nb_samples, nb_targets) array of scores into ranked index lists.

    For every row, scores below `threshold` are discarded and the positions of
    the remaining ones are returned from highest to lowest score, keeping at
    most `n_highest` of them.

    kwargs :
        threshold : minimal score to keep (default 0.5)
        n_highest : maximal number of positions per row (default 7)
        index_map : optional sequence/dict translating each position
                    (position -> index_map[position])
        other keys are ignored

    Returns a 1-D object ndarray with one Python list per row.

    The input array is left untouched: thresholding is done on a copy.
    (Thresholding in place silently zeroed the caller's probabilities, e.g. the
    ones handed back by predict_with_model(..., return_probas=True).)
    """
    threshold = kwargs.get('threshold', 0.5)
    n_highest = kwargs.get('n_highest', 7)
    index_map = kwargs.get('index_map')

    probas = np.array(y_probas)  # np.array copies, dtype is preserved
    probas[probas < threshold] = 0.0
    ranked = np.argsort(probas, axis=1)[:, ::-1][:, :n_highest]

    # Explicit 1-D object array: rows usually have different lengths, and
    # np.array(list_of_lists) is 2-D when they happen to be equal.
    out = np.empty(probas.shape[0], dtype=np.object_)
    for i, row in enumerate(ranked):
        kept = [int(j) for j in row if probas[i, j] > 0.0]
        if index_map is not None:
            kept = [index_map[j] for j in kept]
        out[i] = kept
    return out


def train_model(X_train, Y_train, profiles, 
                prepare_to_fit_func=prepare_to_fit, 
                create_model_func=create_model,
                **kwargs
               ):
    """
    Prepare the training data, create the model and fit it.

    kwargs :
        nb_epoch   : number of epochs (default 100)
        batch_size : batch size (default min(7500, number of training rows))

    Returns the fitted estimator; min / max / last values of every history
    entry are written to the log.
    """
    x_train, y_train = prepare_to_fit_func(X_train, Y_train, profiles)
    if isinstance(x_train, list):
        input_shapes = [block.shape for block in x_train]
    else:
        input_shapes = x_train.shape
    logging.info("- Train data shapes : {}, {}".format(input_shapes, y_train.shape))

    logging.info("- Create the model")
    estimator = create_model_func(profiles, y_train.shape[1])

    if 'nb_epoch' in kwargs:
        nb_epoch = kwargs['nb_epoch']
    else:
        nb_epoch = 100
    if 'batch_size' in kwargs:
        batch_size = kwargs['batch_size']
    else:
        batch_size = min(7500, x_train[0].shape[0])

    logging.info("- Fit the model : (%i, %i)" % (nb_epoch, batch_size))
    hist = estimator.fit(x_train, y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0)
    for key, values in hist.history.items():
        logging.info("-- {} : min={:5f}, max={:5f}, last={}".format(key, np.min(values), np.max(values), values[-4:]))
    return estimator


def predict_with_model(estimator, X_val, profiles, Y_val=None, 
                        prepare_to_test_func=prepare_to_test, 
                        transform_pred_func=pred_to_targets_indices, 
                       **kwargs):
    """
    Predict with a trained estimator and convert the scores to target indices.

    kwargs (forwarded to transform_pred_func and targets_str_to_indices) :
        threshold: e.g. 0.55
        return_probas: True/False
        group (list of indices)

    Returns, depending on Y_val and return_probas:
        y_pred
        (y_pred, y_probas)
        (y_pred, y_val)
        (y_pred, y_val, y_probas)
    where y_val holds the target indices computed from Y_val.
    """
    logging.info("- Predict using trained model")

    def _shapes(blocks):
        if isinstance(blocks, list):
            return [block.shape for block in blocks]
        return blocks.shape

    with_targets = Y_val is not None
    if with_targets:
        x_val, y_val = prepare_to_test_func(X_val, profiles, Y_val)
        logging.info("- Test data shapes : {}, {}".format(_shapes(x_val), y_val.shape))
    else:
        x_val = prepare_to_test_func(X_val, profiles)
        logging.info("- Test data shapes : {}".format(_shapes(x_val)))

    y_probas = estimator.predict(x_val, verbose=0)
    y_pred = transform_pred_func(y_probas, **kwargs)

    results = [y_pred]
    if with_targets:
        results.append(targets_str_to_indices(y_val, **kwargs))
    if kwargs.get('return_probas', False):
        results.append(y_probas)

    if len(results) == 1:
        return y_pred
    return tuple(results)


def merge_predictions(y_array, groups):
    """
    Merge per-group predictions into one array.

    y_array is a list of ndarrays : [y1, y2, ...]
        yi can be of shape (nb_samples, len(gi)) or (nb_samples,)
    groups is a list of indices : [g1, g2, ..., gN], len(y_array) == len(groups)

    2-D case: column k of yi is written to column gi[k] of the result, which has
    shape (nb_samples, len(g1) + ... + len(gN)). If the groups reference a
    column beyond that width the result is widened to fit, and columns no group
    covers are 0.0 (they used to hold uninitialised memory).

    1-D case: every yi is an object array of lists; the result is an object
    array of shape (nb_samples,) whose item i is the concatenation
    y1[i] + y2[i] + ...

    Raises ValueError when y_array is empty or its length differs from groups
    (extra entries used to be dropped silently).
    """
    if len(y_array) == 0:
        raise ValueError("y_array must contain at least one array")
    if len(y_array) != len(groups):
        raise ValueError("y_array and groups must have the same length")

    nb_samples = y_array[0].shape[0]
    two_d_case = len(y_array[0].shape) == 2

    if two_d_case:
        ll = int(np.sum([len(g) for g in groups]))
        highest = max([max(g) for g in groups if len(g) > 0] + [-1])
        out = np.zeros((nb_samples, max(ll, highest + 1)))
        for g, y in zip(groups, y_array):
            out[:, g] = y
        return out

    out = np.empty((nb_samples, ), dtype=np.object_)
    for i in range(nb_samples):
        # A fresh list per sample: nothing is shared between rows or with the inputs.
        merged = []
        for y in y_array:
            merged.extend(y[i])
        out[i] = merged
    return out


def fit_predict(X_train, Y_train, profiles, 
                X_val, Y_val, clc_val=None,
                train_model_func=train_model, 
                predict_with_model_func=predict_with_model,
                merge_predictions_func=merge_predictions):
    """
    Train one model on all TARGET_LABELS (300 epochs) and predict on the validation fold.

    clc_val and merge_predictions_func are accepted but not used here.

    Returns whatever predict_with_model_func returns for
    threshold=0.5, n_highest=7 and the validation targets.
    """
    train_targets = Y_train[TARGET_LABELS]
    estimator = train_model_func(X_train, train_targets, profiles, nb_epoch=300)

    val_targets = Y_val[TARGET_LABELS]
    return predict_with_model_func(estimator, X_val, profiles, val_targets,
                                   threshold=0.5, n_highest=7)


def fit_predict2(X_train, Y_train, profiles,
                X_val, Y_val, clc_val=None,
                train_model_func=train_model, 
                prepare_to_test_func=prepare_to_test,
                transform_pred_func=pred_to_targets_indices,
                merge_predictions_func=merge_predictions):
    """
    Train one model per entry of `target_groups` and merge their predictions.

    For every group a model is trained (300 epochs) on the TARGET_LABELS columns
    of that group, its map7 sub-score on the validation fold is logged, and the
    predicted probabilities of all groups are merged with
    `merge_predictions_func` before the final ranking.

    Returns (y_pred, y_val): predicted and true target indices of the validation fold.

    Fixes compared to the previous version:
      - the injected `merge_predictions_func` is actually used;
      - the group sub-score is computed on indices mapped back to the full
        target list (`index_map=group`); group-local positions 0..len(group)-1
        were compared against global target indices before;
      - the sub-prediction works on a copy, so the probabilities that get merged
        are never modified by the thresholding of the sub-score step.
    """
    x_val, y_val = prepare_to_test_func(X_val, profiles, Y_val[TARGET_LABELS])
    y_val = targets_str_to_indices(y_val)

    _y_probas = []
    for group in target_groups:
        logging.info("--- process group : {}".format(group))
        target_labels = np.array(TARGET_LABELS)[group]
        estimator = train_model_func(X_train, Y_train[target_labels], profiles, 
                                     nb_epoch=300)

        logging.info("- Test data shapes : {}, {}".format(
                [i.shape for i in x_val] if isinstance(x_val, list) else x_val.shape, 
                y_val.shape)
        )

        y_probas = estimator.predict(x_val, verbose=0)
        # Column k of y_probas is target group[k]; map positions back to global indices.
        y_subpred = transform_pred_func(y_probas.copy(), threshold=0.5, n_highest=7,
                                        index_map=group)
        subscore = map7_score(y_val, y_subpred, clc_val)
        logging.info("--- group subscore: {}".format(subscore))
        _y_probas.append(y_probas)

    y_probas = merge_predictions_func(_y_probas, target_groups)
    y_pred = transform_pred_func(y_probas, threshold=0.5, n_highest=7)

    return y_pred, y_val


def cross_val_score2(data, 
                     profiles,
                     nb_folds=5, 
                     fit_predict_func=fit_predict):
    """
    K-fold cross-validation of the map7 score.

    data     : (x_df, y_df, clients_last_choice); the two DataFrames are split
               by row position, clients_last_choice is a 2-D array indexed the same way
    profiles : forwarded to fit_predict_func
    nb_folds : number of KFold splits (folds are taken in order, no shuffling)

    Returns an ndarray with one map7 score per fold.
    """
    logging.info("- Cross validation : ")
    x_df, y_df, clients_last_choice = data
    splitter = KFold(n_splits=nb_folds)
    nb_rows = x_df.shape[0]

    scores = []
    for fold, (train_index, test_index) in enumerate(splitter.split(range(nb_rows)), 1):
        logging.info("\n\n\t\t-- Fold : %i / %i\n" % (fold, nb_folds))

        # Positions -> index labels, then label-based selection of the rows.
        X_train = x_df.loc[x_df.index[train_index], :]
        X_val = x_df.loc[x_df.index[test_index], :]
        Y_train = y_df.loc[y_df.index[train_index], :]
        Y_val = y_df.loc[y_df.index[test_index], :]
        clc_val = clients_last_choice[test_index, :]

        y_pred, y_val = fit_predict_func(X_train, Y_train, profiles,
                                         X_val, Y_val, clc_val)
        logging.info("- Compute map7 score")
        fold_score = map7_score(y_val, y_pred, clc_val)
        scores.append(fold_score)

    return np.array(scores)

In [None]:
# Cross-validate the current `profiles` on the (X, Y) pair built above;
# the last-choice rows are passed as a plain array of the LC_TARGET_LABELS columns.
nb_folds = 3
results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                            profiles=profiles,
                            nb_folds=nb_folds)
print "Profiles : ", profiles.keys()
# Summary line format: nb_folds | min | mean | max | std of the fold scores.
print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())


INFO:root:- Cross validation : 
INFO:root:

		-- Fold : 1 / 3

INFO:root:- Train data shapes : [(5720, 8), (5720, 6), (5720, 5), (5720, 8), (5720, 6), (5720, 5), (5720, 24)], (5720, 24)
INFO:root:- Create the model
INFO:root:- Fit the model : (300, 5720)
INFO:root:-- acc : min=0.864780, max=0.900903, last=[0.90067011117935181, 0.90006560087203979, 0.90058273077011108, 0.90087413787841797]
INFO:root:-- loss : min=0.424983, max=0.750873, last=[0.42642933130264282, 0.42632958292961121, 0.42498284578323364, 0.42741608619689941]
INFO:root:- Predict using trained model
INFO:root:- Test data shapes : [(49951, 8), (49951, 6), (49951, 5), (49951, 8), (49951, 6), (49951, 5), (49951, 24)], (49951, 24)
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0247544180599
INFO:root:

		-- Fold : 2 / 3

INFO:root:- Train data shapes : [(7489, 8), (7489, 6), (7489, 5), (7489, 8), (7489, 6), (7489, 5), (7489, 24)], (7489, 24)
INFO:root:- Create the model
INFO:root:- Fit the model : (300, 7


201505 -> 201605 

merge : max

Profiles :  [0, 1, 2, 3, 101, 11, 12]
Cross-Validation 
 3 | 0.029247 | 0.032420 | 0.034438 | 0.00227 


Profiles :  [0, 1, 2, 3, 101, 11, 12]
Cross-Validation 
 3 | 0.025502 | 0.028212 | 0.030811 | 0.00217 

Profiles :  [0, 1, 12]
Cross-Validation 
 3 | 0.026434 | 0.029399 | 0.033065 | 0.00275 


Profiles :  [0, 1]
Cross-Validation 
 3 | 0.027343 | 0.033443 | 0.044215 | 0.00764 

merge : ave

Profiles :  [0, 112, 12, 101]
Cross-Validation 
 3 | 0.025060 | 0.027826 | 0.031932 | 0.00296 


Profiles :  [0, 1, 12, 101, 112]
Cross-Validation 
 3 | 0.019901 | 0.022350 | 0.026742 | 0.00311 


Profiles :  [0, 12, 100, 112]
Cross-Validation 
 3 | 0.025470 | 0.027646 | 0.031973 | 0.00306 




201505 -> 201605 

Cross-Validation without scaler   
 3 | 0.045991 | 0.055948 | 0.061927 | 0.00709 
 3 | 0.052378 | 0.061900 | 0.073181 | 0.00858 


201506 -> 201605

Cross-Validation without scaler   
 3 | 0.057028 | 0.068970 | 0.077645 | 0.00873 
 3 | 0.055150 | 0.067474 | 0.078318 | 0.00952 

Kaggle : 0.0097026 <-> Many missing clients


Cross-Validation with scaler   
 3 | 0.050298 | 0.060795 | 0.066987 | 0.00746 





Compute cross-validation across several months

In [None]:
# Repeat the cross-validation for several input months; targets are taken one
# year later (+100 in yyyymm).
# Note: the loop overwrites the notebook-level X, Y, clients_last_choice, df1 and df2.
nb_folds = 3
yms = [201504, 201505]
#yms = [201505]

for ym in yms:
    logging.info("\n-------------------------")
    logging.info("- Process month : %s" % ym)
    logging.info("-------------------------\n")
    
    ym1 = ym + 100    
    # Pick the frame (train or validation) that contains each month.
    df1 = train_df if months_ym_map[ym] in train_months else val_df
    df2 = train_df if months_ym_map[ym1] in train_months else val_df
    X, Y, clients_last_choice = get_XY(ym, df1, ym1, df2) 
    results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                                profiles=profiles,
                                nb_folds=nb_folds)
    # Summary line format: nb_folds | min | mean | max | std of the fold scores.
    print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

## Train model for predictions

In [137]:
# Data for the final model: inputs from `current_month` in the training frame,
# targets one year later (+100 in yyyymm) from the validation frame.
current_month = 201505
next_year_month = current_month + 100

df1 = train_df
#df1 = val_df
#df2 = train_df if months_ym_map[next_year_month] in train_months else val_df
df2 = val_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2)

In [138]:
# `Y` (from get_XY) also carries 'ncodpers', 'fecha_dato' and 'targets_str';
# only the TARGET_LABELS columns are training targets (the recorded log shows
# 24 target columns). nb_epoch=300 matches the "Fit the model : (300, 7500)"
# line of the recorded run; train_model's own default is 100.
estimator = train_model(X, Y[TARGET_LABELS], profiles, nb_epoch=300)

INFO:root:- Train data shapes : [(10375, 8), (10375, 6), (10375, 5), (10375, 8), (10375, 6), (10375, 5), (10375, 24)], (10375, 24)
INFO:root:- Create the model
INFO:root:- Fit the model : (300, 7500)
INFO:root:-- acc : min=0.147229, max=0.940783, last=[0.94036143443670617, 0.94062649988266356, 0.94063854504780597, 0.94078316458736555]
INFO:root:-- loss : min=0.167897, max=3.495318, last=[0.16981683868959727, 0.16941485652722507, 0.16789676338793283, 0.16926698871405729]


Check score on the data 2016/05

In [139]:
# Pass only the TARGET_LABELS columns: the full `Y` also holds 'ncodpers',
# 'fecha_dato' and 'targets_str', which cannot be read as 0/1 target flags.
y_pred, y_val, y_probas = predict_with_model(estimator, X, profiles, Y[TARGET_LABELS], threshold=0.5, return_probas=True)

last_choices = clients_last_choice[LC_TARGET_LABELS].values

logging.info("- Compute map7 score")
print(map7_score(y_val, y_pred, last_choices))
# Score of the true targets against themselves, as a reference for the score above.
logging.info("- Compute max map7 score")
print(map7_score(y_val, y_val, last_choices))

INFO:root:- Predict using trained model
INFO:root:- Test data shapes : [(149851, 8), (149851, 6), (149851, 5), (149851, 8), (149851, 6), (149851, 5), (149851, 24)], (149851, 24)
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0272834242861
INFO:root:- Compute max map7 score


0.0272834242861


INFO:root:-- Predicted map7 score: 0.112865446343


0.112865446343


In [140]:
from common import targets_to_labels, targets_indices_to_labels, remove_last_choice

In [144]:
clients_last_choice[LC_TARGET_LABELS].values

limit = 100
count = 0
for last_choice, targets, products in zip(clients_last_choice[LC_TARGET_LABELS].values, y_val, y_pred):
    added_products = remove_last_choice(targets, last_choice)
    predictions = remove_last_choice(products, last_choice)
    
    if len(added_products) == 0:
        continue
    print "--- Count = ", count
    print targets_indices_to_labels(added_products, TARGET_LABELS2)
    print targets_indices_to_labels(predictions, TARGET_LABELS2), targets_indices_to_labels(products, TARGET_LABELS2)
    
    count += 1
    if count == limit:
        break

--- Count =  0
['Taxes']
[] ['e-account', 'Direct Debit', 'Payroll Account', 'Pensions', 'Payroll', 'Credit Card']
--- Count =  1
['Taxes']
['e-account'] ['Pensions (plan fin)', 'Securities', 'Direct Debit', 'Funds', 'e-account', 'Credit Card']
--- Count =  2
['e-account']
[] ['particular Account', 'Current Accounts']
--- Count =  3
['Payroll Account']
['Payroll Account', 'Pensions'] ['e-account', 'Direct Debit', 'particular Plus Account', 'Payroll Account', 'Credit Card', 'Pensions']
--- Count =  4
['Current Accounts', 'Long-term deposits', 'Securities']
['Payroll Account', 'Pensions', 'Payroll'] ['Direct Debit', 'e-account', 'Payroll Account', 'Pensions', 'Credit Card', 'particular Plus Account', 'Payroll']
--- Count =  5
['Funds']
['e-account'] ['Taxes', 'Direct Debit', 'Payroll Account', 'Pensions', 'Payroll', 'e-account', 'Current Accounts']
--- Count =  6
['Current Accounts', 'Securities', 'Direct Debit']
['Direct Debit'] ['Taxes', 'Payroll Account', 'e-account', 'Funds', 'Pensio

## Prediction for 2016/06

In [None]:
from dataset import load_train_test

In [None]:
full_train_df, test_df = load_train_test([201506])

In [None]:
# 'fecha_dato' values available in the full training frame and in the test frame.
full_train_months = full_train_df['fecha_dato'].unique()
test_months = test_df['fecha_dato'].unique()

# Rebuild the lookup from the `to_yearmonth` key of a month to its 'fecha_dato'
# value, now for the months of these two frames.
months = list(set(full_train_df['fecha_dato'].unique()) | set(test_df['fecha_dato'].unique()))
months_ym_map = dict((to_yearmonth(m), m) for m in months)

In [None]:
# Inputs for the 2016/06 prediction: rows of `current_month` for the clients
# that are also present in the test month.
current_month = 201506

month_mask = full_train_df['fecha_dato'] == months_ym_map[current_month]
next_month_mask = test_df['fecha_dato'] == '2016-06-28'
df1 = full_train_df
common_clients_mask1, common_clients_mask2 = get_common_clients(full_train_df, month_mask, next_month_mask, test_df)
selected_rows = common_clients_mask1 & month_mask
print("%i %i" % (selected_rows.sum(), common_clients_mask2.sum()))

# X needs every column referenced by `profiles`; profile 12 uses
# TARGET_LABELS_FRQ, which was missing from this selection (get_XY includes it).
# Assumes load_train_test adds the same *_frq columns as load_trainval - TODO confirm.
X = df1[selected_rows][['ncodpers', 'fecha_dato'] + target_features + features + TARGET_LABELS_FRQ.tolist()]
# Products held in `current_month` serve as the "last choice" removed from the predictions.
clients_last_choice = df1[selected_rows][['ncodpers', 'fecha_dato'] + TARGET_LABELS]

In [None]:
print X.shape, test_df.shape

In [None]:
def get_submission(predicted_added_products, clients, clc, target_labels):
    """
    Build the submission frame with columns 'ncodpers' and 'added_products'.

    predicted_added_products : per-client lists of predicted target indices
    clients       : client ids, in the same order
    clc           : per-client last-choice rows, passed to remove_last_choice
    target_labels : labels indexed by target index

    For every client the predictions left by remove_last_choice are written as
    one space-separated string of labels. Progress is logged every 100000 clients.
    """
    added_products_col = []
    for count, (products, last_choice) in enumerate(zip(predicted_added_products, clc), 1):
        predictions = remove_last_choice(products, last_choice)
        labels = [target_labels[i] for i in predictions]
        added_products_col.append(' '.join(labels))
        if count % 100000 == 0:
            logging.info("Elapsed : %i", count)

    return pd.DataFrame(data={'ncodpers': clients, 'added_products': added_products_col},
                        columns=['ncodpers', 'added_products'])

In [None]:
# No targets are passed, so predict_with_model returns only the predicted indices.
y_pred = predict_with_model(estimator, X, profiles)

logging.info("- Get submission dataframe:")
clients = X['ncodpers'].values
# The TARGET_LABELS columns of clients_last_choice are the products to drop from the predictions.
submission = get_submission(y_pred, clients, clients_last_choice[TARGET_LABELS].values, TARGET_LABELS)

In [None]:
# Add a row for every test client that received no prediction.
submission_clients = set(submission['ncodpers'].unique())
test_clients = set(test_df['ncodpers'].unique())
if submission_clients != test_clients:
    missing_clients = sorted(test_clients - submission_clients)
    # 'added_products' is a space-separated string of labels, so clients without
    # a prediction get an empty string. Filling with np.zeros wrote the float 0.0
    # into the column, which is not a product label.
    missing_rows = pd.DataFrame(data={
        'ncodpers': missing_clients,
        'added_products': [''] * len(missing_clients)
    }, columns=['ncodpers', 'added_products'])
    submission = pd.concat([submission, missing_rows], ignore_index=True)

Get submission DataFrame and write csv file

In [None]:
print submission.shape
submission.head()

In [None]:
from datetime import datetime
import csv

logging.info('- Generate submission')
# File name carries the current time down to the minute, e.g. submission_2016-11-17-16-37.csv
submission_file = '../results/submission_' + \
                  str(datetime.now().strftime("%Y-%m-%d-%H-%M")) + \
                  '.csv'

# Write the two submission columns without the DataFrame index.
submission.to_csv(submission_file, index=False, index_label=False)

In [None]:
# Show the first five lines of the file written by the previous cell. Reading
# `submission_file` instead of a hard-coded, run-specific file name keeps the
# cell working on every run.
with open(submission_file, 'r') as r:
    for _ in range(5):
        print(r.readline())
    