# NN tryouts on SPR data, inspired by Kaggle Forum "When less is more"


- Load m month of data
- Minimal data cleaning
- Feature engineering

- Setup model

TRAIN : 201505, 201506

~~- Select only users that added products in 201506 comparing to 201505~~
- FEATURES <- get_profile(ALL_FEATURES) : Select some profiles

[FEATURES|TARGETS]

X_train = [FEATURES] of the training part
Y_train = [TARGETS]  of the training part

X_val = [FEATURES] of the validation part
Y_val = [TARGETS]  of the validation part

TEST :
201606
- All users
[FEATURES]
X_test = [FEATURES]

In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.DEBUG)

In [2]:
from common import load_data2, minimal_clean_data_inplace, preprocess_data_inplace, TARGET_LABELS

In [3]:
TRAIN_FILE_PATH = os.path.join("..", "data", "train_ver2.csv")
TEST_FILE_PATH = os.path.join("..", "data", "test_ver2.csv")

Load data + minimal cleaning + preprocessing

* 201505 - to get the clients last choice 
* 201506 - to train on

In [4]:
yearmonth_list = [201505, 201506] 
nb_months = len(yearmonth_list)
nb_clients = 10000

In [5]:
data_df = load_data2(TRAIN_FILE_PATH, yearmonth_list, nb_clients)
minimal_clean_data_inplace(data_df)
preprocess_data_inplace(data_df)

months = data_df['fecha_dato'].unique()
clients = data_df['ncodpers'].unique()
assert len(clients) == (data_df['ncodpers'].value_counts() == nb_months).sum()
ll = len(clients)
for m in months:
    l = len(data_df[data_df['fecha_dato'] == m]['ncodpers'].unique())
    assert l == ll, "Number of clients should be identical for all monthes. (%s, %s, %s)" % (m, l, ll)

INFO:root:-- Select 10000 clients
INFO:root:- Number of lines with unknown data : 52
INFO:root:- Number of columns with nan : 5


In [6]:
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

last_choice_mask = data_df['fecha_dato'] == months[-2]
train_month_mask = data_df['fecha_dato'] == months[-1]

Create profiles 

In [141]:
profiles = {
#     0: ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
#     1: ['pais_residencia', 'sexo', 'age', 'segmento', 'nomprov'],
#     2: ['pais_residencia', 'sexo', 'age', 'segmento', 'antiguedad'],
#     3: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_nuevo'],
#     4: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_actividad_cliente'],
#     5: ['pais_residencia', 'sexo', 'age', 'segmento', 'canal_entrada'],
#     6: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_nuevo', 'canal_entrada'],
#     7: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_empleado'],
#     8: ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
    9: ['sexo', 'age', 'segmento'],    
#     10: ['sexo', 'age', 'segmento', 'ind_actividad_cliente']
#     11: ['nomprov', 'ind_nuevo', 'antiguedad', 'renta']
    12: ['nomprov']
}

Create models for profiles

In [142]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import np_utils

In [177]:
models = {}
ll = len(TARGET_LABELS)
for key in profiles:
    length = len(profiles[key])
    model = Sequential()
    model.add(Dense(2*length + 15, init='uniform', input_shape=(length,), activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(10 + length, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(ll, activation='softmax'))
    model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])
    models[key] = model

Train and score models on X_train, Y_train, X_val, Y_val for each profile

In [178]:
clients_last_choice = data_df[last_choice_mask][['ncodpers'] + TARGET_LABELS]
X = data_df[train_month_mask][['ncodpers'] + features]
Y = data_df[train_month_mask][TARGET_LABELS].values

In [179]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.70)
print X_train.shape, X_val.shape, " | ", Y_train.shape, Y_val.shape

(6981, 20) (2993, 20)  |  (6981, 24) (2993, 24)


In [180]:
from sklearn.preprocessing import StandardScaler

In [184]:
Y_pred = {}
for key in profiles:
    print "\n - Work with profile: \n", profiles[key]
    x_train = X_train[profiles[key]].values
    x_val = X_val[profiles[key]].values    
    x_train = StandardScaler().fit_transform(x_train)
    x_val = StandardScaler().fit_transform(x_val)  

    model = models[key]
    print model, model.layers, x_train[0,:], x_val[0, :]
    model.fit(x_train, Y_train, nb_epoch=100, batch_size=10000, verbose=0)
    scores = model.evaluate(x_val, Y_val, verbose=0)
    print "Model Accuracy : %.2f%%" % (scores[1]*100)        
    y_pred = model.predict(x_val, verbose=0)
    Y_pred[key] = y_pred


 - Work with profile: 
['sexo', 'age', 'segmento']
<keras.models.Sequential object at 0x11140ff90> [<keras.layers.core.Dense object at 0x11140fed0>, <keras.layers.core.Dropout object at 0x118f4f910>, <keras.layers.core.Dense object at 0x118ed5c90>, <keras.layers.core.Dropout object at 0x110f38b10>, <keras.layers.core.Dense object at 0x118ed5e50>] [ 0.90101883 -1.0260658   1.22768036] [ 0.92309477 -0.96738977  1.21944611]
Model Accuracy : 77.61%
Model Accuracy : 77.61%
[  2.98828894e-07   1.12699482e-07   9.99952197e-01   3.11205611e-07
   2.73329852e-06   1.82325402e-06   6.08906430e-06   2.67760083e-06
   8.13910219e-07   1.82374970e-06   5.71887256e-07   4.13675053e-07
   1.19848255e-05   2.04549380e-07   3.91807816e-06   1.90901619e-06
   3.34413443e-07   1.91100980e-07   3.99482178e-06   5.87348893e-07
   4.26074081e-07   3.16459159e-06   2.50912240e-06   8.83561086e-07]

 - Work with profile: 
['nomprov']
<keras.models.Sequential object at 0x11db8dd50> [<keras.layers.core.Dense o

Score with MAP@7


In [107]:
def get_added_products(current_choice, last_choice):
    real = []
    for i, c in enumerate(current_choice):
        if c == 1:
            if last_choice[i] == 0:
                real.append(i)
    return real

def apk(actual, predicted, k=7):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0
    
    return score / min(len(actual), k)

In [103]:
predicted_added_products = np.array([set() for i in range(len(Y_val))])
for i, key in enumerate(profiles):
    profile_predictions = np.argmax(Y_pred[key], axis=1)
    predicted_added_products[:] |= set(profile_predictions[:])

In [16]:
gb = clients_last_choice.groupby('ncodpers')

In [111]:
map7 = 0.0

for i, client, targets in zip(range(len(Y_val)), X_val['ncodpers'], Y_val):
    last_choice = gb.get_group(client)[TARGET_LABELS]
    added_products = get_added_products(targets, last_choice.values[0])
    predictions = list(predicted_added_products[i])
    map7 += apk(added_products, predictions)    
    
map7 /= len(Y_val)
print('Predicted score: {}'.format(map7))

Predicted score: 0.0129747187883



```
0: ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
1: ['pais_residencia', 'sexo', 'age', 'segmento', 'nomprov'],
2: ['pais_residencia', 'sexo', 'age', 'segmento', 'antiguedad'],
```
Predicted score: 0.0129747187883


### Define train_val dataset :
- Select only clients that choose new products in 201506 comparing with 201505

In [None]:
trainval_df = data_df.sort_values(['fecha_dato', 'ncodpers'])

In [None]:
dates1 = months[:-1]
dates2 = months[1:]

In [None]:
print dates1, dates2

In [None]:
tmp_df = trainval_df[['fecha_dato','ncodpers']]
tmp_df.loc[:,'target'] = trainval_df[TARGET_LABELS].sum(axis=1)
v1 = tmp_df[tmp_df['fecha_dato'].isin(dates2)]['target'].values
v2 = tmp_df[tmp_df['fecha_dato'].isin(dates1)]['target'].values
ll = min(len(v1), len(v2))
indices = tmp_df.index[ll:]
trainval_df.loc[indices,'diff'] = pd.Series(v1 - v2, index=indices)
del tmp_df, v1, v2

In [None]:
trainval_df.sort_values(['ncodpers', 'fecha_dato']).head(10)

In [None]:
X = trainval_df[(trainval_df['fecha_dato'].isin(dates2)) & (trainval_df['diff'] > 0)][features]
Y = trainval_df[(trainval_df['fecha_dato'].isin(dates2)) & (trainval_df['diff'] > 0)][TARGET_LABELS]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X.values, Y.values, train_size=0.70)

In [None]:
from sklearn.preprocessing import StandardScaler
X_train = StandardScaler().fit_transform(X_train)
X_val = StandardScaler().fit_transform(X_val)

In [None]:
print X_train.shape, X_val.shape
print Y_train.shape, Y_val.shape

Setup NN model

Keras model :

Sequential
- Dense
- Activation
- Dropout

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import np_utils

Setup model 1

In [None]:
model = Sequential()
model.add(Dense(43, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train, Y_train, nb_epoch=1500, batch_size=10000, verbose=2)

In [None]:
# summarize performance of the model
scores = model.evaluate(X_val, Y_val, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

Setup model 2

In [None]:
model = Sequential()
model.add(Dense(43, init='uniform', input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train, Y_train, nb_epoch=1000, batch_size=10000, verbose=2)

In [None]:
# summarize performance of the model
scores = model.evaluate(X_val, Y_val, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

Setup model 3

In [None]:
model = Sequential()
model.add(Dense(50, init='uniform', input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])

In [None]:
model.fit(X_train, Y_train, nb_epoch=1000, batch_size=10000, verbose=2)

In [None]:
# summarize performance of the model
scores = model.evaluate(X_val, Y_val, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

## Prediction part

In [None]:
#del X, Y, X_train, X_val, Y_train, Y_val, trainval_df, data_df

Load the last month from the training dataset to get user last choice 

In [None]:
yearmonth_list = [201605]

In [None]:
lastmonth_df = load_data2(TRAIN_FILE_PATH, yearmonth_list)

In [None]:
minimal_clean_data_inplace(lastmonth_df)

In [None]:
clients_last_choice = lastmonth_df[['ncodpers'] + TARGET_LABELS]

In [None]:
test_df = load_data2(TEST_FILE_PATH, [])

In [None]:
test_df.head()

In [None]:
minimal_clean_data_inplace(test_df)

In [None]:
print test_df.shape
test_df.tail()

Encode non-numerical columns 

In [None]:
preprocess_data_inplace(test_df)

In [None]:
print test_df.shape
test_df.tail()

Make predictions

In [None]:
X_test = test_df[features]

In [None]:
X_test = StandardScaler().fit_transform(X_test)

In [None]:
Y_pred = model.predict(X_test)

In [None]:
Y_pred[:5]

---------------

Check multiple models training

In [191]:
data = np.random.randn(100,10)
_x_train_1 = data[:, (1,3,5,7,9)]
_x_train_2 = data[:, (0,2,4,6,8)]
_y_train = np.random.randint(10, size=100)

_x_val = np.random.randn(30,5) 
_y_val = np.random.randint(10, size=30)

In [192]:
_models = {}

for i in range(2):
#     model1 = Sequential()
#     model1.add(Dense(15, input_shape=(5,), activation='relu'))
#     model1.add(Dense(1, activation='relu'))
#     model1.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])
    length = 5
    model = Sequential()
    model.add(Dense(2*length + 15, init='uniform', input_shape=(length,), activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(10 + length, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(ll, activation='softmax'))
    model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])

    _models[i] = model1

In [193]:
for i, _x_train in zip(range(2), [_x_train_1, _x_train_2]):
    model = _models[i]
    model.fit(_x_train, _y_train, nb_epoch=1000, batch_size=100, verbose=0)
    score = model.evaluate(_x_val, _y_val, verbose=0)
    print score

[3.0700631141662598, 0.13333334028720856]
[3.3067326545715332, 0.10000000149011612]
