# NN tryouts on SPR data


- Load m months of data
- Minimal data cleaning
- Feature engineering

- Setup model



In [1]:
import os
import numpy as np
import pandas as pd

import logging
# Start from a root logger without handlers: logging.basicConfig() does nothing
# when the root logger already has handlers (for instance when this cell is re-run).
root_logger = logging.getLogger()
root_logger.handlers = []
logging.basicConfig(level=logging.DEBUG)

In [2]:
from common import load_data2, minimal_clean_data_inplace, preprocess_data_inplace, TARGET_LABELS

In [3]:
# Training CSV, addressed relative to the current working directory.
DATAFILE_PATH = os.path.join(os.pardir, "data", "train_ver2.csv")

In [4]:
# Months handed to load_data2, written as YYYYMM integers (April to June 2015).
yearmonth_list = [201504, 201505, 201506]

In [5]:
# Number of requested months; used further down to check that every client has one row per month.
nb_months = len(yearmonth_list)

In [6]:
# Number of clients requested from load_data2 (third argument of the call below).
nb_clients = 10000

In [7]:
# Load the requested months for nb_clients clients from the training CSV.
# NOTE(review): how load_data2 chooses the clients (random sample vs. first N) is not
# visible here -- confirm, and fix a seed if the selection is random.
data_df = load_data2(DATAFILE_PATH, yearmonth_list, nb_clients)

INFO:root:-- Select 10000 clients


In [8]:
# Clean data_df in place; the helper logs the number of lines with unknown data and
# the number of columns containing NaN.
# NOTE(review): because data_df is mutated in place, re-running this cell may not be
# idempotent -- re-run from the load cell when in doubt.
minimal_clean_data_inplace(data_df)

INFO:root:- Number of lines with unknown data : 105
INFO:root:- Number of columns with nan : 5


In [9]:
# Shape after cleaning, followed by the last rows as a visual sanity check.
print(data_df.shape)
data_df.tail()

(29895, 46)


Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
1894192,2015-06-28,549474,N,ES,V,48.0,2005-08-17,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
1894228,2015-06-28,549119,N,ES,H,53.0,2005-08-17,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
1894233,2015-06-28,549135,N,ES,V,42.0,2005-08-17,0,118,1.0,...,1,0,0,0,1,0,0,1.0,1.0,1
1894236,2015-06-28,549141,N,ES,V,55.0,2005-08-17,0,118,1.0,...,0,0,0,0,1,0,0,0.0,0.0,0
1894359,2015-06-28,549628,N,ES,V,60.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0


Encode non-numerical columns 

In [10]:
# Encode the non-numerical columns of data_df in place (compare the two tail() displays:
# e.g. ind_empleado 'N' becomes 2, sexo 'V'/'H' become 1/0).
# NOTE(review): the encoding is presumably fitted on this sample only -- confirm that the
# same mapping is applied to any other dataset fed to the model.
preprocess_data_inplace(data_df)

In [11]:
# Shape after encoding, followed by the last rows to inspect the encoded values.
print(data_df.shape)
data_df.tail()

(29895, 46)


Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
1894192,2015-06-28,549474,2,13,1,48.0,2005-08-17,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
1894228,2015-06-28,549119,2,13,0,53.0,2005-08-17,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
1894233,2015-06-28,549135,2,13,1,42.0,2005-08-17,0,118,1.0,...,1,0,0,0,1,0,0,1.0,1.0,1
1894236,2015-06-28,549141,2,13,1,55.0,2005-08-17,0,118,1.0,...,0,0,0,0,1,0,0,0.0,0.0,0
1894359,2015-06-28,549628,2,13,1,60.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0


In [12]:
# Every client must appear exactly once per loaded month.
clients = data_df['ncodpers'].unique()
rows_per_client = data_df['ncodpers'].value_counts()
nb_complete_clients = (rows_per_client == nb_months).sum()
print("%s %s" % (len(clients), nb_complete_clients))
assert len(clients) == nb_complete_clients

9965 9965


Setup NN model

Keras model :

Sequential
- Dense
- Activation
- Dropout

In [13]:
# Client profile columns used as model inputs (the product columns are appended
# separately via TARGET_LABELS when the input matrices are built).
features = [
    u'ind_empleado',
    u'pais_residencia',
    u'sexo',
    u'age',
    u'ind_nuevo',
    u'antiguedad',
    u'indrel',
    u'ult_fec_cli_1t',
    u'indrel_1mes',
    u'tiprel_1mes',
    u'indresi',
    u'indext',
    u'conyuemp',
    u'canal_entrada',
    u'indfall',
    u'nomprov',
    u'ind_actividad_cliente',
    u'renta',
    u'segmento',
]

In [14]:
# Distinct snapshot dates, sorted explicitly. The train/test split below slices this
# array by position (months[-2:], months[:-1]), while unique() only returns values in
# order of first appearance; the 'YYYY-MM-DD' strings sort chronologically.
months = np.sort(data_df['fecha_dato'].unique())
print(months)

['2015-04-28' '2015-05-28' '2015-06-28']


In [15]:
# Distinct client identifiers present in the loaded sample.
clients = data_df['ncodpers'].unique()
print(len(clients))

9965


In [16]:
# Each monthly snapshot must contain the full set of clients; the positional
# alignment of rows between months further down relies on it.
nb_clients_total = len(clients)
for month in months:
    in_month = data_df['fecha_dato'] == month
    nb_clients_month = len(data_df.loc[in_month, 'ncodpers'].unique())
    assert nb_clients_month == nb_clients_total, "Number of clients should be identical for all monthes. (%s, %s, %s)" % (month, nb_clients_month, nb_clients_total)

Define test dataset :

In [17]:
# Hold-out pair: the last two snapshots (inputs from the earlier one, targets from the later one).
# Rows are ordered by month, then by client, so both months list clients in the same order.
test_months = months[-2:]
print(test_months)
in_test_months = data_df['fecha_dato'].isin(test_months)
test_df = data_df[in_test_months].sort_values(['fecha_dato', 'ncodpers'])

['2015-05-28' '2015-06-28']


In [18]:
# Inputs: profile + products held in the earlier test month.
# Targets: products held in the last test month.
is_target_month = test_df['fecha_dato'] == test_months[-1]
X_test = test_df.loc[~is_target_month, features + TARGET_LABELS].values
Y_test = test_df.loc[is_target_month, TARGET_LABELS].values

In [19]:
# Both matrices must have the same number of rows (one per client).
print(X_test.shape)
print(Y_test.shape)

(9965, 43)
(9965, 24)


Define training dataset

In [None]:
# Training uses every snapshot except the last one, which is kept for test targets.
# Same ordering as the test frame: by month, then by client.
train_months = months[:-1]
print(train_months)
in_train_months = data_df['fecha_dato'].isin(train_months)
train_df = data_df[in_train_months].sort_values(['fecha_dato', 'ncodpers'])

Select only clients whose total number of products increased from one month to the next

In [None]:
# Consecutive month pairs: dates1[i] is the month immediately before dates2[i].
dates1, dates2 = train_months[:-1], train_months[1:]

In [None]:
# Show both month arrays side by side.
print("%s %s" % (dates1, dates2))

In [None]:
# Working frame holding one row per (month, client). An explicit copy is taken because
# a column is added to it afterwards; writing to a bare column selection raises
# pandas' SettingWithCopyWarning.
tmp_df = train_df[['fecha_dato', 'ncodpers']].copy()

In [None]:
# Total number of products held by each client in each month.
nb_products_held = train_df[TARGET_LABELS].sum(axis=1)
tmp_df.loc[:, 'target'] = nb_products_held

In [None]:
# Month-over-month change in the number of products held. The subtraction is positional,
# which relies on train_df being sorted by month then client and on every month holding
# the same clients. The result is attached to the rows of the earlier month of each pair;
# rows of the final training month get NaN.
totals_next = tmp_df.loc[tmp_df['fecha_dato'].isin(dates2), 'target'].values
totals_prev = tmp_df.loc[tmp_df['fecha_dato'].isin(dates1), 'target'].values
ll = min(len(totals_next), len(totals_prev))
train_df.loc[:, 'diff'] = pd.Series(totals_next - totals_prev, index=tmp_df.index[:ll])

In [None]:
# Assemble the training pairs month by month:
#   X = profile + products held in month m, for clients whose product count grew after m
#   Y = products held by those same clients in month m+1
# Row alignment between X and Y relies on train_df being sorted by month, then client.
input_columns = features + TARGET_LABELS
x_parts, y_parts = [], []
for month, next_month in zip(train_months[:-1], train_months[1:]):
    selected = (train_df['fecha_dato'] == month) & (train_df['diff'] > 0)
    selected_clients = train_df.loc[selected, 'ncodpers']
    x_parts.append(train_df.loc[selected, input_columns].values)
    followup = (train_df['fecha_dato'] == next_month) & (train_df['ncodpers'].isin(selected_clients))
    y_parts.append(train_df.loc[followup, TARGET_LABELS].values)
# Stay None when there is no month pair at all, as before.
X_train = np.concatenate(x_parts) if x_parts else None
Y_train = np.concatenate(y_parts) if y_parts else None

In [None]:
# Both matrices must have the same number of rows (one per selected client and month).
print(X_train.shape)
print(Y_train.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the inputs. The scaler is fitted on the training matrix only and the SAME
# fitted scaler is applied to the test matrix: fitting a second scaler on X_test would map
# identical raw values to different model inputs and would use test-set statistics.
# NOTE: this cell overwrites X_train / X_test, so running it twice scales them twice;
# re-run from the cells that build the matrices.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import np_utils

Setup model 1

In [None]:
# Model 1: 43 -> 32 -> 16 fully connected ReLU layers with 15% dropout after each.
# The first layer width (43) equals the number of input columns shown by X_test.shape.
model = Sequential()
model.add(Dense(43, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
# The targets are multi-hot (a client can hold several products at once, or none), so
# each output unit is an independent probability: sigmoid + binary cross-entropy.
# A softmax forces the outputs to sum to 1, which cannot represent such targets.
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
# NOTE(review): with this loss Keras presumably reports element-wise (binary) accuracy
# for metrics=['accuracy'] -- confirm this is the metric you want to compare on.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train model 1 for 500 epochs, printing one line per epoch (verbose=2).
# NOTE(review): batch_size=10000 is probably larger than the training set (about 10k
# clients are sampled), making every epoch a single full-batch update -- confirm intended.
# NOTE(review): `nb_epoch` is the Keras 1.x argument name -- confirm the installed version.
model.fit(X_train, Y_train, nb_epoch=500, batch_size=10000, verbose=2)

In [None]:
# Score model 1 on the hold-out month; scores[1] is the accuracy metric requested in compile().
scores = model.evaluate(X_test, Y_test, verbose=0)
accuracy = scores[1]
print("Model Accuracy: %.2f%%" % (accuracy*100))

Setup model 2

In [None]:
# Model 2: same 43 -> 32 -> 16 architecture as model 1, with a uniformly initialised
# first layer and a mean-squared-error loss.
model = Sequential()
model.add(Dense(43, init='uniform', input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
# Sigmoid instead of softmax: the targets are multi-hot product flags, and softmax
# outputs always sum to 1, so they could never match a row holding several (or zero) products.
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
# NOTE(review): with a non-crossentropy loss Keras presumably resolves 'accuracy' to an
# argmax comparison, which is of limited meaning for multi-hot targets -- confirm.
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train model 2 for 1000 epochs, printing one line per epoch (verbose=2).
# NOTE(review): batch_size=10000 is probably larger than the training set, making every
# epoch a single full-batch update -- confirm intended.
model.fit(X_train, Y_train, nb_epoch=1000, batch_size=10000, verbose=2)

In [None]:
# Score model 2 on the hold-out month; scores[1] is the accuracy metric requested in compile().
scores = model.evaluate(X_test, Y_test, verbose=0)
accuracy = scores[1]
print("Model Accuracy: %.2f%%" % (accuracy*100))

Setup model 3

In [None]:
# Model 3: shallower 50 -> 10 network, uniformly initialised first layer, mean-absolute-error
# loss and the nadam optimizer.
model = Sequential()
model.add(Dense(50, init='uniform', input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.15))
# Sigmoid instead of softmax: the targets are multi-hot product flags, and softmax
# outputs always sum to 1, so they could never match a row holding several (or zero) products.
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
# NOTE(review): with a non-crossentropy loss Keras presumably resolves 'accuracy' to an
# argmax comparison, which is of limited meaning for multi-hot targets -- confirm.
model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])

In [None]:
# Train model 3 for 1000 epochs, printing one line per epoch (verbose=2).
# NOTE(review): batch_size=10000 is probably larger than the training set, making every
# epoch a single full-batch update -- confirm intended.
model.fit(X_train, Y_train, nb_epoch=1000, batch_size=10000, verbose=2)

In [None]:
# Score model 3 on the hold-out month; scores[1] is the accuracy metric requested in compile().
scores = model.evaluate(X_test, Y_test, verbose=0)
accuracy = scores[1]
print("Model Accuracy: %.2f%%" % (accuracy*100))

## Prediction part

In [None]:
#del test_df, train_df, data_df

In [20]:
# Competition test CSV, addressed relative to the current working directory.
# NOTE(review): this rebinds DATAFILE_PATH, so re-running the training load cell after
# this one would read the test file; a separate name would be safer.
DATAFILE_PATH = os.path.join(os.pardir, "data", "test_ver2.csv")

In [22]:
# Load the whole test file: with an empty month list and no client count the loader logs
# "Read all data from the file".
# NOTE(review): this rebinds test_df, which previously held the hold-out split used to
# build X_test / Y_test; those cells cannot be re-run after this one.
test_df = load_data2(DATAFILE_PATH, [])

INFO:root:-- Read all data from the file : ../data/test_ver2.csv


In [23]:
# First rows of the raw test file (no product columns are shown; values are not yet encoded).
test_df.head()

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,indext,conyuemp,canal_entrada,indfall,tipodom,cod_prov,nomprov,ind_actividad_cliente,renta,segmento
0,2016-06-28,15889,F,ES,V,56,1995-01-16,0,256,1,...,N,N,KAT,N,1,28.0,MADRID,1,326124.9,01 - TOP
1,2016-06-28,1170544,N,ES,H,36,2013-08-28,0,34,1,...,N,,KAT,N,1,3.0,ALICANTE,0,,02 - PARTICULARES
2,2016-06-28,1170545,N,ES,V,22,2013-08-28,0,34,1,...,N,,KHE,N,1,15.0,"CORUÑA, A",1,,03 - UNIVERSITARIO
3,2016-06-28,1170547,N,ES,H,22,2013-08-28,0,34,1,...,N,,KHE,N,1,8.0,BARCELONA,0,148402.98,03 - UNIVERSITARIO
4,2016-06-28,1170548,N,ES,H,22,2013-08-28,0,34,1,...,N,,KHE,N,1,7.0,"BALEARS, ILLES",0,106885.8,03 - UNIVERSITARIO


In [None]:
# Clean the freshly loaded prediction data. The training frame data_df was already
# cleaned above and must not be processed a second time.
# NOTE(review): test_ver2.csv has no product columns -- confirm the helper supports that.
minimal_clean_data_inplace(test_df)

In [None]:
# Inspect the prediction data (test_df), not the training frame.
print(test_df.shape)
test_df.tail()

Encode non-numerical columns 

In [None]:
# Encode the prediction data. The training frame data_df was already encoded above and
# must not be processed a second time.
# NOTE(review): if the helper fits its encoders on the frame it receives, the codes may
# differ from those seen during training -- confirm the same mapping is reused.
preprocess_data_inplace(test_df)

In [None]:
# Inspect the encoded prediction data (test_df), not the training frame.
print(test_df.shape)
test_df.tail()

In [None]:
# Every client of the prediction data must appear once per snapshot present in that data.
# The check runs on test_df (not the training frame), and the expected row count comes
# from test_df itself, because the test file does not span the nb_months training months.
clients = test_df['ncodpers'].unique()
nb_test_months = test_df['fecha_dato'].nunique()
nb_complete_clients = (test_df['ncodpers'].value_counts() == nb_test_months).sum()
print("%s %s" % (len(clients), nb_complete_clients))
assert len(clients) == nb_complete_clients