# NN tryouts on SPR data


- Load m month of data
- Minimal data cleaning
- Feature engineering

- Setup model



In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.DEBUG)

In [2]:
from common import load_data, minimal_clean_data_inplace, preprocess_data_inplace, TARGET_LABELS

In [3]:
DATAFILE_PATH = os.path.join("..", "data", "train_ver2.csv")

In [4]:
start_month = 201503

In [5]:
nb_months = 4

In [6]:
nb_clients = 150000

In [7]:
data_df = load_data(DATAFILE_PATH, start_month, start_month + nb_months - 1, nb_clients)

In [8]:
minimal_clean_data_inplace(data_df)

INFO:root:- Number of unknown clients : 1672
INFO:root:- Number of columns with nan : 10


In [9]:
print data_df.shape
data_df.tail()

(531732, 46)


Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
2523617,2015-06-28,549743,N,ES,V,49.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2523623,2015-06-28,549683,N,ES,V,73.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2523629,2015-06-28,549670,N,ES,V,46.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,1
2523632,2015-06-28,549672,N,ES,V,59.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,1
2523639,2015-06-28,549682,N,ES,V,39.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0


Encode non-numerical columns 

In [10]:
preprocess_data_inplace(data_df)

In [11]:
print data_df.shape
data_df.tail()

(531732, 46)


Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
2523617,2015-06-28,549743,3,25,2,49.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2523623,2015-06-28,549683,3,25,2,73.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2523629,2015-06-28,549670,3,25,2,46.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,1
2523632,2015-06-28,549672,3,25,2,59.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,1
2523639,2015-06-28,549682,3,25,2,39.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0


Setup NN model

Keras model :

Sequential
- Dense
- Activation
- Dropout

In [12]:
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

In [13]:
months = data_df['fecha_dato'].unique()
print months

['2015-03-28' '2015-04-28' '2015-05-28' '2015-06-28']


In [14]:
clients = data_df['ncodpers'].unique()
print len(clients)

132933


Define test dataset :

In [15]:
test_months = months[-2:]
test_df = data_df[data_df['fecha_dato'].isin(test_months)].sort_values(['fecha_dato', 'ncodpers', ])

In [16]:
X_test = test_df[test_df['fecha_dato'] != test_months[-1]][features + TARGET_LABELS].values
Y_test = test_df[test_df['fecha_dato'] == test_months[-1]][TARGET_LABELS].values

In [17]:
print X_test.shape
print Y_test.shape

(132933, 43)
(132933, 24)


Define training dataset

In [18]:
train_months = months[:-1]
train_df = data_df[data_df['fecha_dato'].isin(train_months)].sort_values(['fecha_dato', 'ncodpers'])

Select only clients that choose new products

In [19]:
dates1 = train_months[:-1]
dates2 = train_months[1:]

In [20]:
print dates1, dates2

['2015-03-28' '2015-04-28'] ['2015-04-28' '2015-05-28']


In [21]:
tmp_df = train_df[['fecha_dato','ncodpers']]

In [24]:
tmp_df.loc[:,'target'] = train_df[TARGET_LABELS].sum(axis=1)

In [25]:
v1 = tmp_df[tmp_df['fecha_dato'].isin(dates2)]['target'].values
v2 = tmp_df[tmp_df['fecha_dato'].isin(dates1)]['target'].values
ll = min(len(v1), len(v2))
train_df.loc[:,'diff'] = pd.Series(v1 - v2, index=tmp_df.index[:ll])

In [26]:
mask = train_df['diff'] > 0

In [27]:
X_train = None
Y_train = None
for i, m in enumerate(train_months[:-1]):
    _mask = (train_df['fecha_dato'] == m) & (train_df['diff'] > 0)
    _clients = train_df[_mask]['ncodpers']
    if X_train is None:
        X_train = train_df[_mask][features + TARGET_LABELS].values
    else:
        X_train = np.concatenate((X_train, train_df[_mask][features + TARGET_LABELS].values))
    next_month = train_months[i+1]
    _mask = (train_df['fecha_dato'] == next_month) & (train_df['ncodpers'].isin(_clients))
    if Y_train is None:
        Y_train = train_df[_mask][TARGET_LABELS].values
    else:
        Y_train = np.concatenate((Y_train, train_df[_mask][TARGET_LABELS].values))

In [28]:
print X_train.shape
print Y_train.shape

(8621, 43)
(8621, 24)


In [29]:
from sklearn.preprocessing import StandardScaler
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

In [30]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import np_utils

Using TensorFlow backend.


Setup model 1

In [31]:
model = Sequential()
model.add(Dense(43, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [41]:
model.fit(X_train, Y_train, nb_epoch=500, batch_size=10000, verbose=2)

Epoch 1/500
0s - loss: 7.6432 - acc: 0.5572
Epoch 2/500
0s - loss: 7.6284 - acc: 0.5636
Epoch 3/500
0s - loss: 7.6359 - acc: 0.5662
Epoch 4/500
0s - loss: 7.6279 - acc: 0.5686
Epoch 5/500
0s - loss: 7.6465 - acc: 0.5701
Epoch 6/500
0s - loss: 7.6372 - acc: 0.5793
Epoch 7/500
0s - loss: 7.6462 - acc: 0.5778
Epoch 8/500
0s - loss: 7.6431 - acc: 0.5659
Epoch 9/500
0s - loss: 7.6351 - acc: 0.5574
Epoch 10/500
0s - loss: 7.6436 - acc: 0.5673
Epoch 11/500
0s - loss: 7.6339 - acc: 0.5655
Epoch 12/500
0s - loss: 7.6299 - acc: 0.5620
Epoch 13/500
0s - loss: 7.6489 - acc: 0.5608
Epoch 14/500
0s - loss: 7.6361 - acc: 0.5728
Epoch 15/500
0s - loss: 7.6368 - acc: 0.5619
Epoch 16/500
0s - loss: 7.6357 - acc: 0.5604
Epoch 17/500
0s - loss: 7.6421 - acc: 0.5571
Epoch 18/500
0s - loss: 7.6370 - acc: 0.5622
Epoch 19/500
0s - loss: 7.6331 - acc: 0.5654
Epoch 20/500
0s - loss: 7.6233 - acc: 0.5759
Epoch 21/500
0s - loss: 7.6179 - acc: 0.5723
Epoch 22/500
0s - loss: 7.6510 - acc: 0.5617
Epoch 23/500
0s - l

<keras.callbacks.History at 0x7fcc0742f390>

In [42]:
# summarize performance of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

Model Accuracy: 76.74%


Setup model 2

In [43]:
model = Sequential()
model.add(Dense(43, init='uniform', input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [46]:
model.fit(X_train, Y_train, nb_epoch=500, batch_size=10000, verbose=2)

Epoch 1/500
0s - loss: 0.1074 - acc: 0.6256
Epoch 2/500
0s - loss: 0.1074 - acc: 0.6143
Epoch 3/500
0s - loss: 0.1074 - acc: 0.6163
Epoch 4/500
0s - loss: 0.1074 - acc: 0.6163
Epoch 5/500
0s - loss: 0.1075 - acc: 0.6140
Epoch 6/500
0s - loss: 0.1074 - acc: 0.6133
Epoch 7/500
0s - loss: 0.1074 - acc: 0.6170
Epoch 8/500
0s - loss: 0.1074 - acc: 0.6222
Epoch 9/500
0s - loss: 0.1074 - acc: 0.6171
Epoch 10/500
0s - loss: 0.1074 - acc: 0.6155
Epoch 11/500
0s - loss: 0.1074 - acc: 0.6107
Epoch 12/500
0s - loss: 0.1074 - acc: 0.6167
Epoch 13/500
0s - loss: 0.1074 - acc: 0.6156
Epoch 14/500
0s - loss: 0.1074 - acc: 0.6194
Epoch 15/500
0s - loss: 0.1074 - acc: 0.6220
Epoch 16/500
0s - loss: 0.1074 - acc: 0.6201
Epoch 17/500
0s - loss: 0.1074 - acc: 0.6259
Epoch 18/500
0s - loss: 0.1074 - acc: 0.6193
Epoch 19/500
0s - loss: 0.1073 - acc: 0.6137
Epoch 20/500
0s - loss: 0.1074 - acc: 0.6087
Epoch 21/500
0s - loss: 0.1074 - acc: 0.6154
Epoch 22/500
0s - loss: 0.1074 - acc: 0.6317
Epoch 23/500
0s - l

<keras.callbacks.History at 0x7fcc0742f550>

In [47]:
# summarize performance of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

Model Accuracy: 69.35%


Setup model 3

In [48]:
model = Sequential()
model.add(Dense(50, init='uniform', input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])

In [49]:
model.fit(X_train, Y_train, nb_epoch=1500, batch_size=10000, verbose=2)

Epoch 1/1500
0s - loss: 0.1909 - acc: 0.0130
Epoch 2/1500
0s - loss: 0.1907 - acc: 0.0205
Epoch 3/1500
0s - loss: 0.1905 - acc: 0.0284
Epoch 4/1500
0s - loss: 0.1904 - acc: 0.0341
Epoch 5/1500
0s - loss: 0.1902 - acc: 0.0414
Epoch 6/1500
0s - loss: 0.1900 - acc: 0.0484
Epoch 7/1500
0s - loss: 0.1898 - acc: 0.0614
Epoch 8/1500
0s - loss: 0.1895 - acc: 0.0710
Epoch 9/1500
0s - loss: 0.1892 - acc: 0.0862
Epoch 10/1500
0s - loss: 0.1888 - acc: 0.1073
Epoch 11/1500
0s - loss: 0.1884 - acc: 0.1214
Epoch 12/1500
0s - loss: 0.1880 - acc: 0.1377
Epoch 13/1500
0s - loss: 0.1875 - acc: 0.1566
Epoch 14/1500
0s - loss: 0.1869 - acc: 0.1735
Epoch 15/1500
0s - loss: 0.1861 - acc: 0.1876
Epoch 16/1500
0s - loss: 0.1854 - acc: 0.1906
Epoch 17/1500
0s - loss: 0.1845 - acc: 0.2054
Epoch 18/1500
0s - loss: 0.1834 - acc: 0.2191
Epoch 19/1500
0s - loss: 0.1823 - acc: 0.2220
Epoch 20/1500
0s - loss: 0.1813 - acc: 0.2343
Epoch 21/1500
0s - loss: 0.1801 - acc: 0.2450
Epoch 22/1500
0s - loss: 0.1789 - acc: 0.25

<keras.callbacks.History at 0x7fcbed70fdd0>

In [50]:
# summarize performance of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

Model Accuracy: 78.13%
