# NN tryouts on SPR data, inspired by Kaggle Forum "When less is more"

Load training and validation data, organised per month as:
    month : [ Features | Targets | Difference | Last Choice Targets ]
    
    - Features : the client columns listed in `features` below
    - Target labels : TARGET_LABELS(month), 'targets_str', 'targets_features'
    - Difference with the previous month
    - Last Choice Targets : LC_TARGET_LABELS(month-1), 'lc_targets_str', 'lc_targets_features'


In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.INFO)

import matplotlib.pylab as plt
%matplotlib inline

In [2]:
from dataset import load_trainval, LC_TARGET_LABELS, decimal_to_dummies, targets_str_to_indices, targets_dec_to_indices
from common import to_yearmonth, TARGET_LABELS 
from common import target_str_to_labels, TARGET_LABELS2

In [3]:
# Client columns selected as model inputs. They are combined with
# `target_features` when the X frame is built in get_XY.
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

In [4]:
# Months (YYYYMM) used for training and the month held out for validation.
train_yearmonths_list = [201504, 201505, 201604]
val_yearmonth = [201605]
# Number of clients passed to load_trainval for the training part
# (the loader logs "Select 150000 clients").
train_nb_clients = 150000
train_df, val_df = load_trainval(train_yearmonths_list, val_yearmonth, train_nb_clients)

INFO:root:- Load training data
INFO:root:- Load data : [201503, 201504, 201505, 201603, 201604]
INFO:root:-- Select 150000 clients
INFO:root:- Number of lines with unknown data : 51
INFO:root:- Number of columns with nan : 9
INFO:root:-- Process date : 201504
INFO:root:-- Process date : 201505
INFO:root:-- Process date : 201604
INFO:root:- Load validation data
INFO:root:- Load data : [201604, 201605]
INFO:root:-- Select max clients
INFO:root:- Number of lines with unknown data : 0
INFO:root:- Number of columns with nan : 10
INFO:root:-- Process date : 201605
INFO:root:-- Compute logCount dictionary
INFO:root:-- Add logCount columns
INFO:root:-- Process month : 2015-03-28
INFO:root:-- Process month : 2015-04-28
INFO:root:-- Process month : 2015-05-28
INFO:root:-- Process month : 2016-03-28
INFO:root:-- Process month : 2016-04-28
INFO:root:-- Process month : 2016-04-28
INFO:root:-- Process month : 2016-05-28
INFO:root:-- Add logDecimal columns
INFO:root:-- Transform age/renta/logdiff


In [33]:
# Scratch experiment: replace every target flag by the relative frequency of
# the value it holds within its month. Works on a copy, so train_df is untouched.
df = train_df.copy()

months = df['fecha_dato'].unique()
for m in months:
    logging.info("-- Process month : %s" % m)
    tmask = df['fecha_dato'] == m
    
    for t in TARGET_LABELS:
        # Share of each distinct value of target `t` among this month's rows.
        counts = df[tmask][t].value_counts()
        counts = counts/counts.sum()
        
#         print "\n", t
#         print counts/counts.sum()
        
        # Overwrite each flag with the frequency of its own value.
        df.loc[tmask, t] = df.loc[tmask, t].apply(lambda x: counts[x])
        
        
    # Only the first month is processed: the loop exits after one pass.
    break
    
#     df.loc[tmask, 'targets_logcount1'] = df[tmask]['targets_str'].apply(lambda x: current_logcount_dict[x])
#     df.loc[tmask, 'targets_logcount2'] = df[tmask]['targets_str'].apply(lambda x: logcount_dict[x])

#     if df[tmask]['lc_targets_str'].isnull().sum() == 0:
#         df.loc[tmask, 'lc_targets_logcount2'] = df[tmask]['lc_targets_str'].apply(lambda x: logcount_dict[x])
#         df.loc[tmask, 'targets_logcount2_diff'] = df.loc[tmask, 'targets_logcount2'] - df.loc[
#             tmask, 'lc_targets_logcount2']

# df.loc[df['targets_logcount2_diff'].isnull(), 'targets_logcount2_diff'] = -99999
# df.loc[df['lc_targets_logcount2'].isnull(), 'lc_targets_logcount2'] = -99999
df[TARGET_LABELS].head()

INFO:root:-- Process month : 2015-03-28


Unnamed: 0,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
421104,0.99982,0.999973,0.783315,0.99944,0.097291,0.987885,0.987439,0.170146,0.943107,0.996633,...,0.992526,0.012108,0.996466,0.05938,0.056013,0.031444,0.994619,0.931039,0.929432,0.154478
839327,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
1680875,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
2199891,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
3250075,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0


Display loaded data

In [None]:
#print train_df.shape, train_df.columns
#print val_df.shape, val_df.columns

In [None]:
#train_df.head(10)

Useful structures

In [5]:
def get_common_clients(df1, mask1, mask2, df2=None):
    """Flag the rows belonging to clients that are selected by both masks.

    :param df1: frame with an 'ncodpers' column; `mask1` is applied to it.
    :param mask1: boolean row mask on `df1`.
    :param mask2: boolean row mask on `df2` when given, otherwise on `df1`.
    :param df2: optional second frame with an 'ncodpers' column.
    :return: a boolean Series over `df1` when `df2` is None, otherwise a pair
        of boolean Series (over `df1`, over `df2`). A row is True when its
        'ncodpers' appears under both masks.
    """
    second_frame = df1 if df2 is None else df2
    first_ids = set(df1[mask1]['ncodpers'].unique())
    second_ids = set(second_frame[mask2]['ncodpers'].unique())
    shared_ids = list(first_ids & second_ids)

    in_first = df1['ncodpers'].isin(shared_ids)
    if df2 is None:
        return in_first
    return in_first, df2['ncodpers'].isin(shared_ids)

In [6]:
# Distinct `fecha_dato` values of each frame, and a lookup from the key
# returned by to_yearmonth to the matching `fecha_dato` value.
train_months = train_df['fecha_dato'].unique()
val_months = val_df['fecha_dato'].unique()

months = list(set(train_months) | set(val_months))
months_ym_map = dict((to_yearmonth(month), month) for month in months)

In [7]:
from common import get_added_products, remove_last_choice, apk, map7_score
from visualization import visualize_train_test, visualize_folds, compare_two_datasets, compare_folds, compare_folds2

Create profiles and create models for profiles

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Merge
from keras.utils import np_utils

Using TensorFlow backend.


### Train a model

In [9]:
# Target-derived columns fed to the model in addition to `features`
# (see get_XY, which concatenates both lists when selecting X columns).
target_features = ['targets_diff', 'targets_logdiff', 'targets_logcount2_diff', 'targets_logcount2', 'targets_logcount1', 'targets_logDec']

In [10]:
#active_clients_mask = ~train_df['targets_diff'].isin([0, -99999])
#print active_clients_mask.shape, active_clients_mask.sum()

In [11]:
def get_XY(current_month, df1, next_year_month, df2):
    """Build aligned input / target frames for clients present in both months.

    :param current_month: key of `months_ym_map` selecting the input month in `df1`.
    :param df1: frame holding the rows of `current_month`.
    :param next_year_month: key of `months_ym_map` selecting the target month in `df2`.
    :param df2: frame holding the rows of `next_year_month`.
    :return: (X, Y, clients_last_choice) where
        X holds 'ncodpers', 'fecha_dato', `target_features` and `features`
        for the current month,
        Y holds 'ncodpers', 'fecha_dato', 'targets_str' and TARGET_LABELS for
        the target month, re-indexed with X's index,
        clients_last_choice holds 'ncodpers', 'fecha_dato' and LC_TARGET_LABELS
        for the current month.
    :raises AssertionError: if X and Y do not list the same clients in the
        same row order (including when their lengths differ).
    """
    month_mask = df1['fecha_dato'] == months_ym_map[current_month]
    next_year_month_mask = df2['fecha_dato'] == months_ym_map[next_year_month]
    common_clients_mask1, common_clients_mask2 = get_common_clients(df1, month_mask, next_year_month_mask, df2)

    # Rows of the current month restricted to the shared clients; both X and
    # the last-choice frame are column subsets of this single selection.
    current_rows = df1[common_clients_mask1 & month_mask]
    X = current_rows[['ncodpers', 'fecha_dato'] + target_features + features]
    Y = df2[common_clients_mask2 & next_year_month_mask][['ncodpers', 'fecha_dato', 'targets_str'] + TARGET_LABELS]
    clients_last_choice = current_rows[['ncodpers', 'fecha_dato'] + LC_TARGET_LABELS]

    # np.array_equal also returns False on a length mismatch, so the intended
    # message is raised instead of an unrelated error from `==` / `.all()`.
    assert np.array_equal(X['ncodpers'].values, Y['ncodpers'].values), "There is a problem in alignment"
    # Row i of Y is the target of row i of X: share the index so that masks
    # computed on X can be applied to Y.
    Y.index = X.index

    return X, Y, clients_last_choice

In [12]:
# Inputs come from `current_month`; targets come from the same month one year
# later (adding 100 to a YYYYMM integer moves it forward by one year).
current_month = 201505
next_year_month = current_month + 100

# Pick whichever loaded frame contains each month.
df1 = train_df if months_ym_map[current_month] in train_months else val_df
#df1 = train_df
df2 = train_df if months_ym_map[next_year_month] in train_months else val_df
#df2 = train_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2)

In [13]:
# Shape and first rows of the input frame.
print X.shape
X.head(10)

(149845, 27)


Unnamed: 0,ncodpers,fecha_dato,targets_diff,targets_logdiff,targets_logcount2_diff,targets_logcount2,targets_logcount1,targets_logDec,ind_empleado,pais_residencia,...,tiprel_1mes,indresi,indext,conyuemp,canal_entrada,indfall,nomprov,ind_actividad_cliente,renta,segmento
1680875,15897,2015-05-28,-2.0,-1.098612,2.1e-05,5.6e-05,0.000158,14.805207,1,0,...,0,0,0,1,0,0,2,1.0,10,2
1680876,15899,2015-05-28,0.0,0.0,0.0,0.000184,0.000158,14.602025,3,0,...,0,0,0,1,0,0,2,1.0,8,2
1680880,15903,2015-05-28,0.0,0.0,0.0,0.000633,0.00123,14.586878,4,0,...,0,0,0,1,0,0,2,1.0,7,0
1680870,15914,2015-05-28,-6.0,-1.94591,-8.8e-05,3.6e-05,0.000158,14.780272,4,0,...,0,0,0,1,0,0,2,1.0,11,2
1680855,15922,2015-05-28,0.0,0.0,0.0,0.000392,0.000565,14.586867,3,0,...,0,0,0,1,0,0,2,1.0,10,0
1680859,15926,2015-05-28,0.0,0.0,0.0,0.000411,0.000732,14.587344,4,0,...,0,0,0,1,0,0,2,1.0,8,0
1680863,15930,2015-05-28,0.0,0.0,0.0,0.000316,0.000366,14.556709,0,0,...,0,0,0,1,0,0,2,1.0,11,2
1680865,15933,2015-05-28,0.0,0.0,0.0,0.000972,0.002094,14.586863,4,0,...,1,0,0,1,0,0,2,0.0,10,0
1680889,15939,2015-05-28,0.0,0.0,0.0,0.000381,0.000565,14.557312,0,0,...,0,0,0,1,0,0,2,1.0,10,0
1680909,15940,2015-05-28,0.0,0.0,0.0,0.000294,0.000366,14.589708,3,0,...,0,0,0,1,0,0,2,1.0,11,2


In [14]:
# Shape and first rows of the target frame (same index as X).
print Y.shape
Y.head(10)

(149845, 27)


Unnamed: 0,ncodpers,fecha_dato,targets_str,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
1680875,15897,2016-05-28,000010010000110101110011,0,0,0,0,1,0,0,...,0,1,0,1,1,1,0,0,1,1
1680876,15899,2016-05-28,001000011000000100010001,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
1680880,15903,2016-05-28,001000010000000000100000,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1680870,15914,2016-05-28,001010000000101010100111,0,0,1,0,1,0,0,...,1,0,1,0,1,0,0,1,1,1
1680855,15922,2016-05-28,001000010000000000001001,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1680859,15926,2016-05-28,001000010000010000010000,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680863,15930,2016-05-28,001000000000010100010000,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
1680865,15933,2016-05-28,001000010000000000000000,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1680889,15939,2016-05-28,001000000000101000000001,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1680909,15940,2016-05-28,001000010001100000010001,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [15]:
# Shape and first rows of the last-choice frame (LC_TARGET_LABELS columns).
print clients_last_choice.shape
clients_last_choice.head(10)

(149845, 26)


Unnamed: 0,ncodpers,fecha_dato,lc_ind_ahor_fin_ult1,lc_ind_aval_fin_ult1,lc_ind_cco_fin_ult1,lc_ind_cder_fin_ult1,lc_ind_cno_fin_ult1,lc_ind_ctju_fin_ult1,lc_ind_ctma_fin_ult1,lc_ind_ctop_fin_ult1,...,lc_ind_hip_fin_ult1,lc_ind_plan_fin_ult1,lc_ind_pres_fin_ult1,lc_ind_reca_fin_ult1,lc_ind_tjcr_fin_ult1,lc_ind_valo_fin_ult1,lc_ind_viv_fin_ult1,lc_ind_nomina_ult1,lc_ind_nom_pens_ult1,lc_ind_recibo_ult1
1680875,15897,2015-05-28,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
1680876,15899,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1680880,15903,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1680870,15914,2015-05-28,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
1680855,15922,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1680859,15926,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1680863,15930,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1680865,15933,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1680889,15939,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1680909,15940,2015-05-28,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


### Run KFold Cross-validation 

In [16]:
def create_model(profiles):
    """Build one small dense network per profile and average their outputs.

    :param profiles: dict mapping a profile key to a list of column names;
        only the length of each list is used here (as the input width).
    :return: a compiled Sequential model whose single layer is a Merge
        (mode='ave') of the per-profile branches. It expects one input array
        per profile, in the iteration order of `profiles`, and outputs
        len(TARGET_LABELS) sigmoid values.
    """
    nb_targets = len(TARGET_LABELS)
    final_model = Sequential()

    branches = []
    for key in profiles:
        nb_inputs = len(profiles[key])
        branch = Sequential()
        for layer in (
            Dense(50, init='uniform', input_shape=(nb_inputs,), activation='relu'),
            Dropout(0.15),
            Dense(10 + nb_inputs, activation='relu'),
            Dropout(0.15),
            Dense(nb_targets, activation='sigmoid'),
        ):
            branch.add(layer)
        branches.append(branch)

    final_model.add(Merge(branches, mode='ave'))
    # 'binary_crossentropy' was tried as an alternative loss; 'mae' is the one in use.
    final_model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])
    return final_model

In [17]:
# Feature profiles: each entry lists the X columns fed to one branch of the
# merged model built by create_model (one branch per key).
profiles = {
    0: ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
    1: target_features,
    2: ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
    3: ['pais_residencia', 'sexo', 'age', 'renta', 'targets_logdiff', 'targets_logcount2_diff','targets_logcount2','targets_logcount1'],
    11: ['nomprov', 'ind_nuevo', 'renta', 'ind_actividad_cliente', 'canal_entrada']
}


In [18]:
# CROSS VALIDATION
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
n_highest = 7


def prepare_to_fit(X_train, Y_train, profiles):
    """Turn training frames into the per-profile arrays expected by the model.

    Rows whose 'targets_diff' is 0 or -99999 are dropped from both frames.

    :param X_train: input frame; must contain 'targets_diff' and every column
        named in `profiles`.
    :param Y_train: target frame sharing X_train's index, with TARGET_LABELS columns.
    :param profiles: dict mapping a profile key to a list of X column names.
    :return: (x_train, y_train) where x_train is a list with one 2-D array per
        profile (in `profiles` iteration order) and y_train is the array of
        TARGET_LABELS values for the kept rows.
    """
    keep_rows = ~X_train['targets_diff'].isin([0, -99999])
    kept_inputs = X_train[keep_rows]
    y_train = Y_train[keep_rows][TARGET_LABELS].values
    x_train = [kept_inputs[profiles[key]].values for key in profiles]
    return x_train, y_train


def prepare_to_test(X_val, profiles, Y_val=None):
    """Turn validation / test frames into the per-profile arrays for prediction.

    Unlike prepare_to_fit, no rows are filtered out.

    :param X_val: input frame containing every column named in `profiles`.
    :param profiles: dict mapping a profile key to a list of X column names.
    :param Y_val: optional target frame with TARGET_LABELS columns.
    :return: the list of per-profile 2-D arrays when `Y_val` is None,
        otherwise the pair (that list, array of TARGET_LABELS values).
    """
    x_val = [X_val[profiles[key]].values for key in profiles]
    if Y_val is None:
        return x_val
    return x_val, Y_val[TARGET_LABELS].values

    
def pred_to_targets_indices(y_probas, threshold, n_highest=7):
    """Convert a probability matrix into ranked lists of target indices.

    For each row, the indices of the `n_highest` largest probabilities are
    taken in decreasing order, and those whose probability is below
    `threshold` (or not strictly positive) are discarded.

    The input array is not modified: thresholding is done on a copy, so the
    caller keeps the raw probabilities.

    :param y_probas: 2-D array-like, one row per sample and one column per target.
    :param threshold: probabilities strictly below this value are ignored.
    :param n_highest: maximum number of indices kept per row.
    :return: a 2-D integer array when every row keeps the same number of
        indices, otherwise a 1-D object array holding one list per row.
    """
    # Work on a copy: the original zeroed the caller's array in place.
    probas = np.array(y_probas)
    probas[probas < threshold] = 0.0

    ranked = np.argsort(probas, axis=1)[:, ::-1][:, :n_highest]
    out = [[j for j in row if probas[i, j] > 0.0] for i, row in enumerate(ranked)]

    if len(set(len(row) for row in out)) <= 1:
        return np.array(out)
    # Ragged rows: build the object array explicitly, since np.array() on
    # ragged nested lists raises on recent NumPy versions.
    result = np.empty(len(out), dtype=object)
    for i, row in enumerate(out):
        result[i] = row
    return result


def train_model(X_train, Y_train, profiles, prepare_to_fit_func=prepare_to_fit, create_model_func=create_model,
                nb_epoch=300, batch_size=7500):
    """Prepare the training arrays, build a model and fit it.

    :param X_train: input frame passed to `prepare_to_fit_func`.
    :param Y_train: target frame passed to `prepare_to_fit_func`.
    :param profiles: profile dict passed to both helper functions.
    :param prepare_to_fit_func: callable (X_train, Y_train, profiles) -> (x_train, y_train);
        x_train may be a list of arrays or a single array.
    :param create_model_func: callable (profiles) -> compiled model exposing `fit`.
    :param nb_epoch: number of training epochs (default 300, the previous fixed value).
    :param batch_size: upper bound on the batch size (default 7500, the previous
        fixed value); it is capped at the number of training samples.
    :return: the fitted model.
    :raises ValueError: if no training sample is left after preparation.
    """
    x_train, y_train = prepare_to_fit_func(X_train, Y_train, profiles)
    logging.info("- Train data shapes : {}, {}".format(
            [i.shape for i in x_train] if isinstance(x_train, list) else x_train.shape, 
            y_train.shape)
    )

    # Count samples from y_train: x_train[0].shape[0] is only the sample count
    # when x_train is a list of arrays, not when it is a single 2-D array.
    nb_samples = y_train.shape[0]
    if nb_samples == 0:
        raise ValueError("No training samples available to fit the model")

    logging.info("- Create the model")
    estimator = create_model_func(profiles)
    batch_size = min(batch_size, nb_samples)
    logging.info("- Fit the model : (%i, %i)" % (nb_epoch, batch_size))
    hist = estimator.fit(x_train, y_train, nb_epoch=nb_epoch, batch_size=batch_size, verbose=0)
    # Summarise every tracked metric: extremes over all epochs and the last four values.
    for key in hist.history:
        logging.info("-- {} : min={:5f}, max={:5f}, last={}".format(key, np.min(hist.history[key]), np.max(hist.history[key]), hist.history[key][-4:]))
    return estimator


def predict_with_model(estimator, X_val, profiles, Y_val=None, threshold=0.5, prepare_to_test_func=prepare_to_test, transform_pred_func=pred_to_targets_indices, return_probas=False):
    """Predict with a fitted model and convert the output with `transform_pred_func`.

    :param estimator: fitted model exposing `predict(x, verbose=0)`.
    :param X_val: input frame passed to `prepare_to_test_func`.
    :param profiles: profile dict passed to `prepare_to_test_func`.
    :param Y_val: optional target frame; when given, the true targets are also
        returned after conversion by `targets_str_to_indices`.
    :param threshold: second argument handed to `transform_pred_func`.
    :param prepare_to_test_func: callable building the model inputs
        (and the target array when `Y_val` is given).
    :param transform_pred_func: callable (y_probas, threshold) -> predictions.
    :param return_probas: when True, the array returned by `estimator.predict`
        is appended to the result.
    :return: `y_pred` alone, or a tuple made of `y_pred`, then the converted
        true targets (if `Y_val` was given), then `y_probas` (if `return_probas`).
    """
    logging.info("- Predict using trained model")

    def _shapes(inputs):
        # Inputs may be a list of arrays (one per profile) or a single array.
        return [i.shape for i in inputs] if isinstance(inputs, list) else inputs.shape

    has_truth = Y_val is not None
    if has_truth:
        x_val, y_val = prepare_to_test_func(X_val, profiles, Y_val)
        logging.info("- Test data shapes : {}, {}".format(_shapes(x_val), y_val.shape))
    else:
        x_val = prepare_to_test_func(X_val, profiles)
        logging.info("- Test data shapes : {}".format(_shapes(x_val)))

    y_probas = estimator.predict(x_val, verbose=0)
    outputs = [transform_pred_func(y_probas, threshold)]
    if has_truth:
        outputs.append(targets_str_to_indices(y_val))
    if return_probas:
        outputs.append(y_probas)

    return outputs[0] if len(outputs) == 1 else tuple(outputs)


def cross_val_score2(data, 
                     profiles,
                     nb_folds=5, 
                     train_model_func=train_model, 
                     predict_with_model_func=predict_with_model):
    """Run K-fold cross-validation and return the score of every fold.

    :param data: tuple (x_df, y_df, clients_last_choice); the first two are
        frames and the third is a 2-D array indexed positionally per row.
    :param profiles: profile dict forwarded to the train / predict functions.
    :param nb_folds: number of KFold splits (the splitter's default settings
        are used otherwise).
    :param train_model_func: callable (X_train, Y_train, profiles) -> estimator.
    :param predict_with_model_func: callable (estimator, X_val, profiles, Y_val)
        -> (y_pred, y_val).
    :return: numpy array with one `map7_score` value per fold.
    """
    logging.info("- Cross validation : ")
    x_df, y_df, clients_last_choice = data
    splitter = KFold(n_splits=nb_folds)

    scores = []
    positions = range(x_df.shape[0])
    for fold_no, (train_index, test_index) in enumerate(splitter.split(positions), 1):
        logging.info("-- fold : %i / %i" % (fold_no, nb_folds))

        # Translate positional fold indices into index labels of each frame.
        X_train = x_df.loc[x_df.index[train_index], :]
        X_val = x_df.loc[x_df.index[test_index], :]
        Y_train = y_df.loc[y_df.index[train_index], :]
        Y_val = y_df.loc[y_df.index[test_index], :]
        clc_val = clients_last_choice[test_index, :]

        estimator = train_model_func(X_train, Y_train, profiles)
        y_pred, y_val = predict_with_model_func(estimator, X_val, profiles, Y_val)

        logging.info("- Compute map7 score")
        scores.append(map7_score(y_val, y_pred, clc_val))

    return np.array(scores)

In [19]:
# 3-fold cross-validation on the (X, Y) pair built above; the last-choice
# frame is passed as a plain array of its LC_TARGET_LABELS columns.
nb_folds = 3
results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                            profiles=profiles,
                            nb_folds=nb_folds)
# Printed columns: nb_folds | min | mean | max | std of the fold scores.
print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

INFO:root:- Cross validation : 
INFO:root:-- fold : 1 / 3
INFO:root:- Train data shapes : [(5776, 8), (5776, 6), (5776, 5), (5776, 8), (5776, 5)], (5776, 24)
INFO:root:- Create the model
INFO:root:- Fit the model : (300, 5776)
INFO:root:-- acc : min=0.012465, max=0.516967, last=[0.51696676015853882, 0.51696676015853882, 0.51696676015853882, 0.51696676015853882]
INFO:root:-- loss : min=0.109929, max=0.497222, last=[0.11003660410642624, 0.10995077341794968, 0.10992904752492905, 0.11002188175916672]
INFO:root:- Predict using trained model
INFO:root:- Test data shapes : [(49949, 8), (49949, 6), (49949, 5), (49949, 8), (49949, 5)], (49949, 24)
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0223180977931
INFO:root:-- fold : 2 / 3
INFO:root:- Train data shapes : [(7544, 8), (7544, 6), (7544, 5), (7544, 8), (7544, 5)], (7544, 24)
INFO:root:- Create the model
INFO:root:- Fit the model : (300, 7500)
INFO:root:-- acc : min=0.005832, max=0.242842, last=[0.010604453757049739, 0

Cross-Validation 
 3 | 0.022318 | 0.031946 | 0.043563 | 0.00879 


201505 -> 201605 

Cross-Validation without scaler   
 3 | 0.045991 | 0.055948 | 0.061927 | 0.00709 
 3 | 0.052378 | 0.061900 | 0.073181 | 0.00858 


201506 -> 201605

Cross-Validation without scaler   
 3 | 0.057028 | 0.068970 | 0.077645 | 0.00873 
 3 | 0.055150 | 0.067474 | 0.078318 | 0.00952 

Kaggle score : 0.0097026 (the submission had many missing clients)


Cross-Validation with scaler   
 3 | 0.050298 | 0.060795 | 0.066987 | 0.00746 





Compute cross-validation across several months

In [None]:
# Repeat the cross-validation for several input months; each month is paired
# with the same month one year later (ym + 100).
nb_folds = 3
yms = [201504, 201505]
#yms = [201505]

for ym in yms:
    logging.info("\n-------------------------")
    logging.info("- Process month : %s" % ym)
    logging.info("-------------------------\n")
    
    ym1 = ym + 100    
    # Pick whichever loaded frame contains each month.
    df1 = train_df if months_ym_map[ym] in train_months else val_df
    df2 = train_df if months_ym_map[ym1] in train_months else val_df
    X, Y, clients_last_choice = get_XY(ym, df1, ym1, df2) 
    results = cross_val_score2((X, Y, clients_last_choice[LC_TARGET_LABELS].values), 
                                profiles=profiles,
                                nb_folds=nb_folds)
    # Printed columns: nb_folds | min | mean | max | std of the fold scores.
    print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

## Train model for predictions

In [None]:
# Build the final training pair: inputs of 201505 taken from train_df and
# targets of 201605 taken from val_df.
current_month = 201505
next_year_month = current_month + 100

df1 = train_df
#df1 = val_df
#df2 = train_df if months_ym_map[next_year_month] in train_months else val_df
df2 = val_df

X, Y, clients_last_choice = get_XY(current_month, df1, next_year_month, df2)

In [None]:
# Fit a single model on the whole (X, Y) pair; reused for the predictions below.
estimator = train_model(X, Y, profiles)

Check the score on the 2016/05 data

In [None]:
# Predict on the same X the model was fitted on, then score the predictions.
y_pred, y_val, y_probas = predict_with_model(estimator, X, profiles, Y, threshold=0.5, return_probas=True)

logging.info("- Compute map7 score")
print map7_score(y_val, y_pred, clients_last_choice[LC_TARGET_LABELS].values)
# Scoring the true targets against themselves gives the score of a perfect prediction.
logging.info("- Compute max map7 score")
print map7_score(y_val, y_val, clients_last_choice[LC_TARGET_LABELS].values)

In [None]:
from common import targets_to_labels, targets_indices_to_labels, remove_last_choice

In [None]:
# NOTE(review): this bare expression is not the last statement of the cell,
# so its value is computed and discarded.
clients_last_choice[LC_TARGET_LABELS].values

# Print true vs. predicted product labels for the first 10 clients whose
# targets are non-empty after remove_last_choice has been applied.
count = 0
for last_choice, targets, products in zip(clients_last_choice[LC_TARGET_LABELS].values, y_val, y_pred):
    added_products = remove_last_choice(targets, last_choice)
    predictions = remove_last_choice(products, last_choice)
    
    # Skip clients for whom nothing is left to predict.
    if len(added_products) == 0:
        continue
    print "--- Count = ", count
    print targets_indices_to_labels(added_products, TARGET_LABELS2)
    print targets_indices_to_labels(predictions, TARGET_LABELS2)
    
    count += 1
    if count == 10:
        break

## Prediction for 2016/06

In [None]:
from dataset import load_train_test

In [None]:
# Load the training rows requested for 201506 together with the test set.
full_train_df, test_df = load_train_test([201506])

In [None]:
# Rebuild the month lookup for the full-train / test frames: distinct
# `fecha_dato` values of each frame, and a map from the key returned by
# to_yearmonth to the matching `fecha_dato` value.
full_train_months = full_train_df['fecha_dato'].unique()
test_months = test_df['fecha_dato'].unique()

months = list(set(full_train_months) | set(test_months))
months_ym_map = dict((to_yearmonth(month), month) for month in months)

In [None]:
current_month = 201506

# Rows of the selected training month, and test rows dated 2016-06-28.
month_mask = full_train_df['fecha_dato'] == months_ym_map[current_month]
next_month_mask = test_df['fecha_dato'] == '2016-06-28'
df1 = full_train_df
# Keep only clients that appear under both masks.
common_clients_mask1, common_clients_mask2 = get_common_clients(full_train_df, month_mask, next_month_mask, test_df)
print (common_clients_mask1 & month_mask).sum(), common_clients_mask2.sum()
X = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + target_features + features]            
# NOTE(review): the "last choice" used for the submission is TARGET_LABELS of
# `current_month` (not LC_TARGET_LABELS as in get_XY) - confirm this is intended.
clients_last_choice = df1[common_clients_mask1 & month_mask][['ncodpers', 'fecha_dato'] + TARGET_LABELS]

In [None]:
# Compare the number of rows to predict with the size of the test set.
print X.shape, test_df.shape

In [None]:
def get_submission(predicted_added_products, clients, clc, target_labels):
    """Build the submission frame from per-client predicted target indices.

    :param predicted_added_products: iterable with one sequence of predicted
        target indices per client.
    :param clients: client identifiers, used as the 'ncodpers' column.
    :param clc: iterable with one last-choice row per client, handed to
        `remove_last_choice` together with that client's predictions.
    :param target_labels: sequence mapping a target index to its label.
    :return: DataFrame with columns 'ncodpers' and 'added_products', the latter
        being the space-separated labels left after `remove_last_choice`.
    """
    added_products_col = []
    for row_no, (products, last_choice) in enumerate(zip(predicted_added_products, clc), 1):
        kept_indices = remove_last_choice(products, last_choice)
        added_products_col.append(' '.join(target_labels[i] for i in kept_indices))
        # Progress trace every 100000 clients.
        if row_no % 100000 == 0:
            logging.info("Elapsed : %i", row_no)

    return pd.DataFrame(
        data={'ncodpers': clients, 'added_products': added_products_col},
        columns=['ncodpers', 'added_products'],
    )

In [None]:
# Predict without ground truth: only the predicted indices are returned.
y_pred = predict_with_model(estimator, X, profiles)

logging.info("- Get submission dataframe:")
clients = X['ncodpers'].values
# The TARGET_LABELS columns of clients_last_choice act as the "last choice"
# passed to remove_last_choice inside get_submission.
submission = get_submission(y_pred, clients, clients_last_choice[TARGET_LABELS].values, TARGET_LABELS)

In [None]:
# Every test client must have a row in the submission. Clients for which no
# prediction was produced get an EMPTY 'added_products' string; the previous
# np.zeros() filler was written to the CSV as the literal text "0.0".
submission_clients = set(submission['ncodpers'].unique())
test_clients = set(test_df['ncodpers'].unique())
missing_clients = sorted(test_clients - submission_clients)
if missing_clients:
    missing_rows = pd.DataFrame(
        data={
            'ncodpers': missing_clients,
            'added_products': [''] * len(missing_clients)
        },
        columns=['ncodpers', 'added_products'])
    # ignore_index avoids duplicated index labels in the combined frame.
    submission = pd.concat([submission, missing_rows], ignore_index=True)

Get submission DataFrame and write csv file

In [None]:
# Shape and first rows of the final submission frame.
print submission.shape
submission.head()

In [None]:
from datetime import datetime
import csv

logging.info('- Generate submission')
# Output path carries a minute-resolution timestamp: submission_YYYY-MM-DD-HH-MM.csv
submission_file = '../results/submission_{}.csv'.format(
    datetime.now().strftime("%Y-%m-%d-%H-%M"))

# Write only the two data columns; the frame index is not exported.
submission.to_csv(submission_file, index=False, index_label=False)

In [None]:
with open('../results/submission_2016-11-17-16-37.csv', 'r') as r:
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    print r.readline()
    