# RNN tryouts on SPR data


- Load m months of data
- Minimal data cleaning
- Feature engineering

- Setup model



In [1]:
import os
import numpy as np
import pandas as pd

import logging
# Reset the root logger's handlers first: basicConfig() does nothing when the
# root logger already has handlers (e.g. when this cell is re-run).
logging.root.handlers = list()
logging.basicConfig(level=logging.DEBUG)

In [2]:
from common import load_data, minimal_clean_data_inplace, preprocess_data_inplace, TARGET_LABELS

In [3]:
# Training CSV, located in the "data" directory one level above the notebook.
DATAFILE_PATH = os.path.join(os.pardir, "data", "train_ver2.csv")

In [4]:
# Load the training snapshots into a DataFrame.
# NOTE(review): the two integers presumably bound the period as YYYYMM
# (2015-04 .. 2015-06, consistent with the fecha_dato values displayed in the
# outputs below) - confirm against common.load_data.
data_df = load_data(DATAFILE_PATH, 201504, 201506)

In [5]:
# Minimal cleaning step; logs the number of unknown clients and the number of
# columns containing NaN (see the INFO output of this cell).
# NOTE(review): the helper's name suggests data_df is modified in place and
# nothing is returned - confirm in common.py.
minimal_clean_data_inplace(data_df)

INFO:root:- Number of unknown clients : 9750
INFO:root:- Number of columns with nan : 10


In [6]:
# Sanity check after cleaning: frame dimensions, then the last few rows.
# print(...) with a single parenthesised argument prints the same text on
# Python 2 and is also valid Python 3 (the bare print statement is not).
print(data_df.shape)
data_df.tail()

(1884684, 46)


Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
1894429,2015-06-28,549679,N,ES,V,39.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
1894430,2015-06-28,549682,N,ES,V,39.0,2005-08-18,0,118,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
1894431,2015-06-28,549673,N,ES,H,49.0,2006-10-16,0,104,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0
1894432,2015-06-28,889143,N,ES,H,46.0,2010-08-19,0,58,1.0,...,0,0,0,1,1,0,0,1.0,1.0,1
1894433,2015-06-28,498639,N,ES,V,65.0,2004-10-28,0,128,1.0,...,0,0,0,0,0,0,0,0.0,0.0,0


Setup RNN model

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils

Keras model :

Sequential
- LSTM
- Dense

X_train.shape = (samples, time steps, features)
-> (nb_clients, nb_months, profile_length)

Y_train.shape = (samples, features)
-> (nb_clients, nb_targets)


For example, suppose the training dataset contains 3 months of data (d1, d2, d3) on 3 users (u1, u2, u3), each row holding 2 profile features followed by 4 targets:
```
d1, u1, f11, f12, t11, t12, t13, t14
d1, u2, f21, f22, t21, t22, t23, t24
d1, u3, f31, f32, t31, t32, t33, t34

d2, u1, f11, f12, tt11, tt12, tt13, tt14
d2, u2, f21, f22, tt21, tt22, tt23, tt24
d2, u3, f31, f32, tt31, tt32, tt33, tt34

d3, u1, f11, f12, ttt11, ttt12, ttt13, ttt14
d3, u2, f21, f22, ttt21, ttt22, ttt23, ttt24
d3, u3, f31, f32, ttt31, ttt32, ttt33, ttt34
```
So then, the training dataset is :
```
X_train = [
    [
        [f11, f12, t11, t12, t13, t14], # 1st time step
        [f11, f12, tt11, tt12, tt13, tt14], # 2nd time step
    ], # 1st sample
    [
        [f21, f22, t21, t22, t23, t24], # 1st time step
        [f21, f22, tt21, tt22, tt23, tt24], # 2nd time step
    ], # 2nd sample
    [
        [f31, f32, t31, t32, t33, t34], # 1st time step
        [f31, f32, tt31, tt32, tt33, tt34], # 2nd time step        
    ], # 3rd sample
]
```
and
```
Y_train = [
    [ttt11, ttt12, ttt13, ttt14], # 1st sample
    [ttt21, ttt22, ttt23, ttt24], # 2nd sample
    [ttt31, ttt32, ttt33, ttt34], # 3rd sample
]
```



In [8]:
# Non-target (client profile) columns; together with TARGET_LABELS they form
# the columns selected for X_train further down.
features = [
    u'ind_empleado',
    u'pais_residencia',
    u'sexo',
    u'age',
    u'fecha_alta',
    u'ind_nuevo',
    u'antiguedad',
    u'indrel',
    u'ult_fec_cli_1t',
    u'indrel_1mes',
    u'tiprel_1mes',
    u'indresi',
    u'indext',
    u'conyuemp',
    u'canal_entrada',
    u'indfall',
    u'nomprov',
    u'ind_actividad_cliente',
    u'renta',
    u'segmento',
]

In [19]:
# Order rows by client id, then by snapshot date, so that each client's
# monthly rows sit next to each other; preview the first five rows.
res = data_df.sort_values(by=['ncodpers', 'fecha_dato'])
res.head(n=5)

Unnamed: 0,fecha_dato,ncodpers,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,...,ind_hip_fin_ult1,ind_plan_fin_ult1,ind_pres_fin_ult1,ind_reca_fin_ult1,ind_tjcr_fin_ult1,ind_valo_fin_ult1,ind_viv_fin_ult1,ind_nomina_ult1,ind_nom_pens_ult1,ind_recibo_ult1
0,2015-04-28,15889,F,ES,V,56.0,1995-01-16,0,243,1.0,...,0,0,0,0,0,1,0,0.0,0.0,0
1262322,2015-05-28,15889,F,ES,V,56.0,1995-01-16,0,244,1.0,...,0,0,0,0,1,1,0,0.0,0.0,0
1683777,2015-06-28,15889,F,ES,V,56.0,1995-01-16,0,245,1.0,...,0,0,0,0,1,1,0,0.0,0.0,0
210123,2015-04-28,15890,A,ES,V,62.0,1995-01-16,0,243,1.0,...,0,1,0,0,1,0,0,1.0,1.0,1
1262323,2015-05-28,15890,A,ES,V,62.0,1995-01-16,0,244,1.0,...,0,1,0,0,1,0,0,1.0,1.0,1


In [None]:
# Input columns: client profile features followed by the target (product) columns.
train_cols = list(features) + list(TARGET_LABELS)

# Inputs: every snapshot except 2015-06-28, ordered by client and then by date,
# restricted to the input columns.
X_train = (
    data_df[data_df['fecha_dato'] != '2015-06-28']
    .sort_values(['ncodpers', 'fecha_dato'])
    [train_cols]
)

# TODO: Y_train is not built yet (earlier draft idea: take TARGET_LABELS from data_df).

In [None]:
# NOTE(review): X_test is not assigned anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel - define it or drop the cell.
X_test

In [None]:
# LSTM over each client's monthly sequence, followed by one output unit per target.
#
# Expects (as described in the "Keras model" notes of this notebook):
#   X_train.shape == (nb_clients, nb_months, profile_length)
#   Y_train.shape == (nb_clients, nb_targets)
# NOTE(review): the cell that prepares X_train currently yields a 2-D DataFrame
# and Y_train is not built yet; both must be turned into arrays of the shapes
# above before this cell can run.

# Not used in this cell; presumably intended for model.fit later - confirm.
batch_size = 1

model = Sequential()
model.add(LSTM(32, input_shape=(X_train.shape[1], X_train.shape[2])))
# The targets are independent 0/1 flags and several can be 1 for the same
# client (multi-label). Use one sigmoid per target with binary cross-entropy;
# softmax + categorical cross-entropy would force the outputs to sum to 1.
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])