# RNN tryouts on SPR data


- Load m months of data
- Minimal data cleaning
- Feature engineering

- Setup model



In [1]:
import os
import numpy as np
import pandas as pd

import logging
# Detach whatever handlers the root logger already has: logging.basicConfig is
# a no-op when the root logger is already configured, so without this reset
# the DEBUG level requested below would not be applied on a re-run.
root_logger = logging.getLogger()
root_logger.handlers = []
logging.basicConfig(level=logging.DEBUG)

In [2]:
from common import load_data, minimal_clean_data_inplace, preprocess_data_inplace, TARGET_LABELS

In [3]:
# Location of the raw training CSV, relative to the current working directory
# (one level up, inside "data").
DATAFILE_PATH = os.path.join(os.pardir, "data", "train_ver2.csv")

In [4]:
# Window of data to load. The two integers are presumably yyyymm bounds
# understood by common.load_data -- TODO confirm how the bounds are interpreted.
month_bounds = (201503, 201506)
data_df = load_data(DATAFILE_PATH, *month_bounds)

In [5]:
# Clean data_df (in place, going by the helper's name and the discarded return
# value). The helper logs the number of unknown clients and the number of
# columns containing NaN -- see the INFO lines in the cell output.
minimal_clean_data_inplace(data_df)

INFO:root:- Number of unknown clients : 14841
INFO:root:- Number of columns with nan : 10


In [6]:
# Sanity check after cleaning: overall shape, then the last few rows.
# print(...) with a single argument produces the same output under Python 2
# and also parses under Python 3 (the bare print statement does not).
print(data_df.shape)
data_df.tail()

(2508802, 46)


Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
2523638,2015-06-28,549679,N,ES,V,39.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2523639,2015-06-28,549682,N,ES,V,39.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2523640,2015-06-28,549673,N,ES,H,49.0,2006-10-16,0,104,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2523641,2015-06-28,889143,N,ES,H,46.0,2010-08-19,0,58,1.0,...,0,0,0,1,1,0,0,1.0,1.0,1
2523642,2015-06-28,498639,N,ES,V,65.0,2004-10-28,0,128,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0


Encode non-numerical columns 

In [7]:
# Encode the non-numerical columns of data_df (in place, going by the helper's
# name). In the tail() output that follows, e.g. `sexo` V/H shows up as 2/0.
preprocess_data_inplace(data_df)

In [8]:
# Same check as after cleaning: the shape should be unchanged by the encoding,
# only the categorical columns should now hold integer codes.
# print(...) with a single argument behaves identically under Python 2 and 3.
print(data_df.shape)
data_df.tail()

(2508802, 46)


Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
2523638,2015-06-28,549679,3,34,2,39.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2523639,2015-06-28,549682,3,34,2,39.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2523640,2015-06-28,549673,3,34,0,49.0,2006-10-16,0,104,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
2523641,2015-06-28,889143,3,34,0,46.0,2010-08-19,0,58,1.0,...,0,0,0,1,1,0,0,1.0,1.0,1
2523642,2015-06-28,498639,3,34,2,65.0,2004-10-28,0,128,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0


In [9]:
# Distinct values of the `renta` column in ascending order, to eyeball its
# range. The recorded output starts at -99, presumably a missing-value
# sentinel introduced by the cleaning step -- TODO confirm.
renta_values = data_df['renta'].unique()
np.sort(renta_values)

array([ -9.90000000e+01,   1.20273000e+03,   1.47123000e+03, ...,
         2.42532369e+07,   2.55472519e+07,   2.88943955e+07])

Setup RNN model

In [10]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils

Using Theano backend.


Keras model :

Sequential
- LSTM
- Dense

X_train.shape = (samples, time steps, features)
-> (nb_clients, nb_months, profile_length)

Y_train.shape = (samples, features)
-> (nb_clients, nb_targets)


For example, suppose the training dataset contains 3 months of data (`d1`..`d3`) on 3 users (`u1`..`u3`), each row holding 2 profile features (`f`) and 4 targets (`t`):
```
d1, u1, f11, f12, t11, t12, t13, t14
d1, u2, f21, f22, t21, t22, t23, t24
d1, u3, f31, f32, t31, t32, t33, t34

d2, u1, f11, f12, tt11, tt12, tt13, tt14
d2, u2, f21, f22, tt21, tt22, tt23, tt24
d2, u3, f31, f32, tt31, tt32, tt33, tt34

d3, u1, f11, f12, ttt11, ttt12, ttt13, ttt14
d3, u2, f21, f22, ttt21, ttt22, ttt23, ttt24
d3, u3, f31, f32, ttt31, ttt32, ttt33, ttt34
```
So then, the training dataset is :
```
X_train = [
    [
        [f11, f12, t11, t12, t13, t14], # 1st time step
        [f11, f12, tt11, tt12, tt13, tt14], # 2nd time step
    ], # 1st sample
    [
        [f21, f22, t21, t22, t23, t24], # 1st time step
        [f21, f22, tt21, tt22, tt23, tt24], # 2nd time step
    ], # 2nd sample
    [
        [f31, f32, t31, t32, t33, t34], # 1st time step
        [f31, f32, tt31, tt32, tt33, tt34], # 2nd time step        
    ], # 3rd sample
]
```
and
```
Y_train = [
    [ttt11, ttt12, ttt13, ttt14], # 1st sample
    [ttt21, ttt22, ttt23, ttt24], # 2nd sample
    [ttt31, ttt32, ttt33, ttt34], # 3rd sample
]
```



In [11]:
# Client-profile columns fed to the model at every time step, in the order
# they appear in the model input (the product columns from TARGET_LABELS are
# appended after them later on).
features = [
    u'ind_empleado',
    u'pais_residencia',
    u'sexo',
    u'age',
    u'ind_nuevo',
    u'antiguedad',
    u'indrel',
    u'ult_fec_cli_1t',
    u'indrel_1mes',
    u'tiprel_1mes',
    u'indresi',
    u'indext',
    u'conyuemp',
    u'canal_entrada',
    u'indfall',
    u'nomprov',
    u'ind_actividad_cliente',
    u'renta',
    u'segmento',
]

In [12]:
# Distinct snapshot dates, oldest first. Series.unique() returns values in
# order of appearance, while the cells below rely on months[-1] / months[-2]
# being the latest months, so the order is made explicit here instead of
# depending on the row order of the loaded file.
months = np.sort(data_df['fecha_dato'].unique())

In [13]:
# Keep only the clients that have as many rows as there are loaded months
# (4 for the window loaded above), i.e. a complete history. This assumes at
# most one row per (client, month) -- TODO confirm.
# value_counts() is computed once and reused for both the filter and the index.
rows_per_client = data_df['ncodpers'].value_counts()
clients = rows_per_client[rows_per_client == len(months)].index.values

In [14]:
# Hold out the last entry of `months` as the test month; every other month
# goes to training. Both frames are restricted to `clients` and ordered by
# client, then date, so that each client's rows are contiguous.
test_month = months[-1]
in_clients = data_df['ncodpers'].isin(clients)
is_test_month = data_df['fecha_dato'] == test_month
sort_keys = ['ncodpers', 'fecha_dato']
train_df = data_df[in_clients & ~is_test_month].sort_values(sort_keys)
test_df = data_df[in_clients & is_test_month].sort_values(sort_keys)

In [15]:
# Within the training frame, the month months[-2] supplies the labels and
# the remaining months supply the input sequence.
train_last_month = months[-2]
is_label_month = train_df['fecha_dato'] == train_last_month

# Each time step carries the profile features followed by the product columns.
train_cols = list(features) + list(TARGET_LABELS)

X_train = train_df.loc[~is_label_month, train_cols]
Y_train = train_df.loc[is_label_month, TARGET_LABELS]

In [16]:
# Fold the flat (client, month) rows into (samples, time steps, features).
# train_df is sorted by client then date, so consecutive rows belong to the
# same client. The sizes are derived instead of hard-coded:
#   - time steps: every loaded month except the label month and the test month
#   - features:   one value per entry of train_cols
n_time_steps = len(months) - 2
X_train = X_train.values.reshape((-1, n_time_steps, len(train_cols)))

In [17]:
from sklearn.preprocessing import StandardScaler

# Standardise every time step separately (zero mean, unit variance per column).
# The fitted scalers are kept, one per time step, so that exactly the same
# transformation can later be applied to held-out inputs via
# scalers[i].transform(...) instead of being lost after this cell.
scalers = []
for i in range(X_train.shape[1]):
    step_scaler = StandardScaler()
    X_train[:, i, :] = step_scaler.fit_transform(X_train[:, i, :])
    scalers.append(step_scaler)

In [18]:
# Expected layout: (samples, time steps, features) = (clients, months, columns per step).
X_train.shape

(619745, 2, 43)

In [20]:
# Peek at the labels: one 0/1 column per entry of TARGET_LABELS. Note that a
# single row can contain several 1s (see the output below).
Y_train.head()

Unnamed: 0,ind_ahor_fin_ult1,ind_aval_fin_ult1,ind_cco_fin_ult1,ind_cder_fin_ult1,ind_cno_fin_ult1,ind_ctju_fin_ult1,ind_ctma_fin_ult1,ind_ctop_fin_ult1,ind_ctpp_fin_ult1,ind_deco_fin_ult1,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
1891531,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,1,1,0,0.0,0.0,0
1891532,0,0,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,1.0,1.0,1
1680871,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,1,1,0,0.0,0.0,1
1680872,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0.0,0.0,0
1680873,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,1,0,1.0,1.0,1


In [23]:
# Test-month inputs and labels for the retained clients.
X_test = test_df[features + TARGET_LABELS]
# The original line was truncated (`Y_test = test_`), which raises NameError;
# the labels are the TARGET_LABELS columns of the test month.
Y_test = test_df[TARGET_LABELS]
# NOTE(review): X_test is 2-D (clients, columns) and still contains the
# test-month TARGET_LABELS themselves. The LSTM below is built for 3-D
# (clients, time steps, columns) windows of *preceding* months, so X_test
# has to be rebuilt as such a window (and scaled like X_train) before it can
# be used for evaluation.

In [24]:
# Expect one row per retained client and len(features) + len(TARGET_LABELS) columns.
X_test.shape

(619745, 43)

In [None]:
# Mini-batch size used when fitting the model.
batch_size = 1

# Sequence model: one LSTM layer reading (time steps, features) windows,
# followed by a dense layer with one output unit per target column.
model = Sequential()
model.add(LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2])))

# The targets are independent 0/1 indicators and a client can have several of
# them set at once (multi-label), so each output gets its own sigmoid and the
# loss is binary cross-entropy. softmax + categorical_crossentropy would force
# the outputs to sum to 1, which only fits one-hot (single-class) targets.
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the prepared training tensors. The original call referenced the
# undefined names `X` and `y`; the notebook's training data lives in
# X_train / Y_train. Y_train is a DataFrame, so its ndarray is passed.
model.fit(X_train, Y_train.values, nb_epoch=50, batch_size=batch_size, verbose=2)

# Summarize performance of the model on the held-out month.
# NOTE(review): X_test is currently a 2-D frame; it must be turned into the
# same (samples, time steps, features) layout as X_train before this call
# can succeed.
scores = model.evaluate(X_test, Y_test.values, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))