# RNN tryouts on SPR data


- Load m months of data
- Minimal data cleaning
- Feature engineering

- Setup model



In [None]:
import os
import numpy as np
import pandas as pd

import logging
# Detach whatever handlers are already installed on the root logger, then
# configure it again so that DEBUG-level messages are emitted.
root_logger = logging.getLogger()
root_logger.handlers = []
logging.basicConfig(level=logging.DEBUG)

In [None]:
from common import load_data, minimal_clean_data_inplace, preprocess_data_inplace, TARGET_LABELS

In [None]:
# Training CSV, located in the `data` directory one level above this notebook.
DATAFILE_PATH = os.path.join(os.pardir, "data", "train_ver2.csv")

In [None]:
# Number of consecutive months to load; also the number of rows a client must
# have to be kept when `clients` is built further down.
nb_months = 4

In [None]:
# Client-count argument forwarded to `load_data`
# (presumably the number of clients to sample; verify against common.load_data).
nb_clients = 100000

In [None]:
def _add_months(yearmonth, offset):
    """Return the YYYYMM integer that lies `offset` months after `yearmonth`.

    Plain integer addition breaks as soon as the window crosses December
    (e.g. 201503 + 10 gives 201513), so the month is carried into the year.
    """
    year, month = divmod(yearmonth, 100)
    year_carry, month_index = divmod(month - 1 + offset, 12)
    return (year + year_carry) * 100 + month_index + 1


# Load `nb_months` consecutive months starting at March 2015.
# NOTE(review): assumes load_data takes YYYYMM integer bounds, as the original
# call did - confirm against common.load_data.
FIRST_YEARMONTH = 201503
data_df = load_data(
    DATAFILE_PATH,
    FIRST_YEARMONTH,
    _add_months(FIRST_YEARMONTH, nb_months - 1),
    nb_clients,
)

In [None]:
# Minimal data cleaning step; the return value is not used
# (the helper presumably mutates data_df in place, as its name says - confirm).
minimal_clean_data_inplace(data_df)

In [None]:
# Show the frame size and its last rows after cleaning.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(data_df.shape)
data_df.tail()

Encode non-numerical columns 

In [None]:
# Encode the non-numerical columns; the return value is not used
# (the helper presumably mutates data_df in place, as its name says - confirm).
preprocess_data_inplace(data_df)

In [None]:
# Show the frame size and its last rows after encoding.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(data_df.shape)
data_df.tail()

Setup RNN model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, TimeDistributed, LSTM, Dropout
from keras.utils import np_utils

Keras model :

Sequential
- LSTM
- Dense

X_train.shape = (samples, time steps, features)
-> (nb_clients, nb_months, profile_length)

Y_train.shape = (samples, features)
-> (nb_clients, nb_targets)


For example, suppose the dataset contains 4 months of data (d1..d4) on 3 users (u1..u3), each with 2 profile features and 4 targets:
```
d1, u1, f11, f12, t11, t12, t13, t14
d1, u2, f21, f22, t21, t22, t23, t24
d1, u3, f31, f32, t31, t32, t33, t34

d2, u1, f11, f12, tt11, tt12, tt13, tt14
d2, u2, f21, f22, tt21, tt22, tt23, tt24
d2, u3, f31, f32, tt31, tt32, tt33, tt34

d3, u1, f11, f12, ttt11, ttt12, ttt13, ttt14
d3, u2, f21, f22, ttt21, ttt22, ttt23, ttt24
d3, u3, f31, f32, ttt31, ttt32, ttt33, ttt34

d4, u1, f11, f12, tttt11, tttt12, tttt13, tttt14
d4, u2, f21, f22, tttt21, tttt22, tttt23, tttt24
d4, u3, f31, f32, tttt31, tttt32, tttt33, tttt34
```
So then, the training dataset is :
```
X_train = [
    [
        [f11, f12, t11, t12, t13, t14], # 1st time step
        [f11, f12, tt11, tt12, tt13, tt14], # 2nd time step
    ], # 1st sample
    [
        [f21, f22, t21, t22, t23, t24], # 1st time step
        [f21, f22, tt21, tt22, tt23, tt24], # 2nd time step
    ], # 2nd sample
    [
        [f31, f32, t31, t32, t33, t34], # 1st time step
        [f31, f32, tt31, tt32, tt33, tt34], # 2nd time step        
    ], # 3rd sample
]
```
and
```
Y_train = [
    [ttt11, ttt12, ttt13, ttt14], # 1st sample
    [ttt21, ttt22, ttt23, ttt24], # 2nd sample
    [ttt31, ttt32, ttt33, ttt34], # 3rd sample
]
```
and the test dataset is :

```
X_test = [
    [
        [f11, f12, tt11, tt12, tt13, tt14], # 2nd time step
        [f11, f12, ttt11, ttt12, ttt13, ttt14], # 3rd time step
    ], # 1st sample
    [
        [f21, f22, tt21, tt22, tt23, tt24], # 2nd time step
        [f21, f22, ttt21, ttt22, ttt23, ttt24], # 3rd time step
    ], # 2nd sample
    [
        [f31, f32, tt31, tt32, tt33, tt34], # 2nd time step
        [f31, f32, ttt31, ttt32, ttt33, ttt34], # 3rd time step        
    ], # 3rd sample
]
```

and

```
Y_test = [
    [tttt11, tttt12, tttt13, tttt14], # 1st sample
    [tttt21, tttt22, tttt23, tttt24], # 2nd sample
    [tttt31, tttt32, tttt33, tttt34], # 3rd sample
]
```

In [None]:
# Client-profile columns fed to the model alongside the target columns.
features = [
    u'ind_empleado',
    u'pais_residencia',
    u'sexo',
    u'age',
    u'ind_nuevo',
    u'antiguedad',
    u'indrel',
    u'ult_fec_cli_1t',
    u'indrel_1mes',
    u'tiprel_1mes',
    u'indresi',
    u'indext',
    u'conyuemp',
    u'canal_entrada',
    u'indfall',
    u'nomprov',
    u'ind_actividad_cliente',
    u'renta',
    u'segmento',
]

In [None]:
# Distinct months present in the data, in chronological order.
# unique() returns values in order of first appearance, but the train/test
# windows below are taken with negative slices and so need the months sorted.
months = np.sort(data_df['fecha_dato'].unique())

In [None]:
# Keep only the clients that have exactly one row per loaded month, so that
# every client yields a complete sequence. Rows are counted once per client.
rows_per_client = data_df['ncodpers'].value_counts()
clients = rows_per_client[rows_per_client == nb_months].index.values
print(len(clients))

Train on X months

In [None]:
# Number of input time steps (months) per sample; the month that follows them
# supplies the targets.
nb_months_train = 2

Define test dataset :

In [None]:
# Test window: the last `nb_months_train + 1` months.
test_months = months[-(nb_months_train + 1):]
# Rows of the retained clients inside that window, ordered by client then month.
in_test_window = data_df['ncodpers'].isin(clients) & data_df['fecha_dato'].isin(test_months)
test_df = data_df[in_test_window].sort_values(['ncodpers', 'fecha_dato'])

In [None]:
# The final test month provides the targets; every earlier month provides the
# inputs (profile features plus that month's target columns).
is_last_test_month = test_df['fecha_dato'] == test_months[-1]
X_test = test_df[~is_last_test_month][features + TARGET_LABELS]
Y_test = test_df[is_last_test_month][TARGET_LABELS].values

In [None]:
# Fold the (client, month) rows into (nb_samples, nb_months_train, nb_columns).
# The column count is read from the frame instead of being hard-coded, so the
# cell keeps working when `features` or `TARGET_LABELS` changes.
X_test = X_test.values.reshape((-1, nb_months_train, X_test.shape[1]))

In [None]:
# Check the test shapes: X is (samples, time steps, columns), Y is (samples, targets).
# print(...) with a single argument behaves the same on Python 2 and 3.
print(X_test.shape)
print(Y_test.shape)

Define training dataset

In [None]:
# Training window: `nb_months_train + 1` months ending one month before the
# last available month.
train_months = months[-(nb_months_train + 2):-1]
# Rows of the retained clients inside that window, ordered by client then month.
in_train_window = data_df['ncodpers'].isin(clients) & data_df['fecha_dato'].isin(train_months)
train_df = data_df[in_train_window].sort_values(['ncodpers', 'fecha_dato'])

In [None]:
# The final training month provides the targets; every earlier month provides
# the inputs (profile features plus that month's target columns).
is_last_train_month = train_df['fecha_dato'] == train_months[-1]
X_train = train_df[~is_last_train_month][features + TARGET_LABELS]
Y_train = train_df[is_last_train_month][TARGET_LABELS].values

In [None]:
# Fold the (client, month) rows into (nb_samples, nb_months_train, nb_columns).
# The column count is read from the frame instead of being hard-coded, so the
# cell keeps working when `features` or `TARGET_LABELS` changes.
X_train = X_train.values.reshape((-1, nb_months_train, X_train.shape[1]))

In [None]:
# Check the training shapes: X is (samples, time steps, columns), Y is (samples, targets).
# print(...) with a single argument behaves the same on Python 2 and 3.
print(X_train.shape)
print(Y_train.shape)

In [None]:
# Display the reshaped training array for a visual check.
X_train

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every time step separately. The scaler is fitted on the training
# data only and then reused for the test data, so both sets go through the same
# transformation; fitting a second scaler on the test set would feed the model
# inputs on a different scale from the one it was trained on.
# NOTE(review): the arrays are overwritten in place - confirm they hold floats,
# otherwise the scaled values are truncated on assignment.
for i in range(nb_months_train):
    scaler = StandardScaler().fit(X_train[:, i, :])
    X_train[:, i, :] = scaler.transform(X_train[:, i, :])
    X_test[:, i, :] = scaler.transform(X_test[:, i, :])

Setup model

In [None]:
# Stacked LSTM that maps a sequence of monthly client vectors to one score per
# target column.
model = Sequential()
# Input data is (nb_samples, timesteps=nb_months_train, nb_features)
model.add(LSTM(30, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2]))) # output shape: (nb_samples, timesteps, 30)
model.add(TimeDistributed(Dense(45, activation='relu'))) # output shape: (nb_samples, timesteps, 45)
model.add(Dropout(0.15))
model.add(LSTM(30, return_sequences=True)) # output shape: (nb_samples, timesteps, 30)
model.add(TimeDistributed(Dense(20, activation='relu'))) # output shape: (nb_samples, timesteps, 20)
model.add(Dropout(0.15))
model.add(LSTM(10)) # only the last step is returned, output shape: (nb_samples, 10)
# One independent sigmoid per target: softmax would force the outputs to sum to
# 1, which cannot match a target row containing several 1s.
# NOTE(review): assumes TARGET_LABELS are 0/1 flags and that a client can have
# several set at once - confirm against the data.
model.add(Dense(Y_train.shape[1], activation='sigmoid')) # output shape: (nb_samples, nb_targets)
# Binary cross-entropy is the loss that pairs with per-target sigmoid outputs.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 25 epochs with batches of 10000 samples.
model.fit(X_train, Y_train, nb_epoch=25, batch_size=10000, verbose=2)
# Evaluate on the held-out test window; with the single metric requested at
# compile time, scores[0] is the loss and scores[1] the accuracy.
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
# Same call as the previous cell, run again on the same `model` object
# (presumably to continue training for 25 more epochs from the current
# weights - confirm this is intended and not a leftover duplicate).
model.fit(X_train, Y_train, nb_epoch=25, batch_size=10000, verbose=2)
# Re-evaluate on the test window; scores[0] is the loss, scores[1] the accuracy.
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))