# NN tryouts on SPR data, inspired by Kaggle Forum "When less is more"


- Load m month of data
- Minimal data cleaning
- Feature engineering

- Setup model

TRAIN : 201505, 201506

~~- Select only users that added products in 201506 comparing to 201505~~
- FEATURES <- get_profile(ALL_FEATURES) : Select some profiles

[FEATURES|TARGETS]

X_train = [FEATURES] of the training part
Y_train = [TARGETS]  of the training part

X_val = [FEATURES] of the validation part
Y_val = [TARGETS]  of the validation part

TEST :
201606
- All users
[FEATURES]
X_test = [FEATURES]

In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.DEBUG)

In [2]:
from common import load_data2, minimal_clean_data_inplace, preprocess_data_inplace, TARGET_LABELS

In [3]:
TRAIN_FILE_PATH = os.path.join("..", "data", "train_ver2.csv")
TEST_FILE_PATH = os.path.join("..", "data", "test_ver2.csv")

Load data + minimal cleaning + preprocessing

* 201505 - to get the clients last choice 
* 201506 - to train on

In [4]:
yearmonth_list = [201505, 201506] 
nb_months = len(yearmonth_list)
nb_clients = 10000

In [5]:
data_df = load_data2(TRAIN_FILE_PATH, yearmonth_list, nb_clients)
minimal_clean_data_inplace(data_df)
preprocess_data_inplace(data_df)

months = data_df['fecha_dato'].unique()
clients = data_df['ncodpers'].unique()
assert len(clients) == (data_df['ncodpers'].value_counts() == nb_months).sum()
ll = len(clients)
for m in months:
    l = len(data_df[data_df['fecha_dato'] == m]['ncodpers'].unique())
    assert l == ll, "Number of clients should be identical for all monthes. (%s, %s, %s)" % (m, l, ll)

INFO:root:-- Select 10000 clients
INFO:root:- Number of lines with unknown data : 70
INFO:root:- Number of columns with nan : 6


In [12]:
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

last_choice_mask = data_df['fecha_dato'] == months[-2]
train_month_mask = data_df['fecha_dato'] == months[-1]

Create profiles 

In [83]:
profiles = {
    0: ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
    1: ['pais_residencia', 'sexo', 'age', 'segmento', 'nomprov'],
     2: ['pais_residencia', 'sexo', 'age', 'segmento', 'antiguedad'],
#     3: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_nuevo'],
#     4: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_actividad_cliente'],
#     5: ['pais_residencia', 'sexo', 'age', 'segmento', 'canal_entrada'],
#     6: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_nuevo', 'canal_entrada'],
#     7: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_empleado'],
#     8: ['pais_residencia', 'sexo', 'renta', 'age', 'segmento'],
#     9: ['sexo', 'age', 'segmento'],
#     10: ['sexo', 'age', 'segmento', 'ind_actividad_cliente']
}

Create models for profiles

In [84]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import np_utils

In [85]:
models = {}
ll = len(TARGET_LABELS)
for key in profiles:
    lenght = len(profiles[key])
    model = Sequential()
    model.add(Dense(50, init='uniform', input_shape=(lenght,), activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(ll, activation='softmax'))
    model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])
    models[key] = model

Train and score models on X_train, Y_train, X_val, Y_val for each profile

In [86]:
clients_last_choice = data_df[last_choice_mask][['ncodpers'] + TARGET_LABELS]
X = data_df[train_month_mask][['ncodpers'] + features]
Y = data_df[train_month_mask][TARGET_LABELS].values

In [87]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.70)
print X_train.shape, X_val.shape, " | ", Y_train.shape, Y_val.shape

In [88]:
from sklearn.preprocessing import StandardScaler

In [89]:
Y_pred = {}
for key in profiles:
    print "\n - Work with profile: \n", profiles[key]
    x_train = X_train[profiles[key]].values
    x_val = X_val[profiles[key]].values
    x_train = StandardScaler().fit_transform(x_train)
    x_val = StandardScaler().fit_transform(x_val)  
    
    model = models[key]
    model.fit(x_train, Y_train, nb_epoch=1000, batch_size=10000, verbose=0)
    scores = model.evaluate(x_val, Y_val, verbose=0)
    print "Model Accuracy : %.2f%%" % (scores[1]*100)    
    y_pred = model.predict(x_val, verbose=0)
    Y_pred[key] = y_pred


 - Work with profile: 
['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi']
(6975, 8) (2990, 8)  |  (6975, 24) (2990, 24)
Model Accuracy : 78.90%

 - Work with profile: 
['pais_residencia', 'sexo', 'age', 'segmento', 'nomprov']
(6975, 5) (2990, 5)  |  (6975, 24) (2990, 24)
Model Accuracy : 78.90%

 - Work with profile: 
['pais_residencia', 'sexo', 'age', 'segmento', 'antiguedad']
(6975, 5) (2990, 5)  |  (6975, 24) (2990, 24)
Model Accuracy : 78.90%


Score with MAP@7


In [72]:
gb = clients_last_choice.groupby('ncodpers')

In [82]:
for client, targets in zip(X_val['ncodpers'], Y_val):
    last_choice = gb.get_group(client)[TARGET_LABELS]
    added_products = get_added_products(targets, last_choice.values[0])
    print client, np.sum(targets - last_choice.values[0]), added_products
    #predicted_added_products = 
    break

436784 0.0 []
432961 0.0 []
691777 0.0 []
571445 0.0 []
224658 0.0 []
1151882 0.0 []
87516 0.0 []
274611 0.0 []
671480 0.0 []
60210 0.0 []
1212917 0.0 []
618192 0.0 []
1333383 0.0 []
1167982 0.0 []
309292 0.0 []
1169833 0.0 []
1046937 0.0 []
258629 0.0 []
135129 0.0 []
980046 0.0 []
719564 0.0 []
305500 0.0 []
481694 0.0 []
139717 0.0 []
674986 0.0 []
403957 0.0 []
896097 0.0 []
364222 0.0 []
821125 0.0 []
1014761 -1.0 []
405926 0.0 []
961827 0.0 []
736001 0.0 []
1052361 0.0 []
434567 0.0 []
735666 0.0 []
1354298 0.0 [9]
330451 0.0 []
1018925 0.0 []
969570 0.0 []
1272430 0.0 []
1050574 0.0 []
1189761 0.0 []
1279793 0.0 []
39872 0.0 []
1036938 0.0 []
958299 0.0 []
1331441 0.0 []
530701 0.0 []
251910 0.0 []
515552 0.0 []
1035551 0.0 []
341483 0.0 []
1193572 0.0 []
1296454 0.0 []
87053 0.0 []
743904 0.0 []
207203 0.0 []
581265 0.0 []
1382543 0.0 []
521838 0.0 []
990370 0.0 []
321311 1.0 [23]
473282 0.0 []
689795 0.0 []
1232287 0.0 []
68911 0.0 []
900992 0.0 []
751806 0.0 []
1304286 0.0 []

In [None]:
def apk(actual, predicted, k=7):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0
    
    return score / min(len(actual), k)

In [None]:
a = [1,2,3,4,5]
b = [6,1,7,2,3]
apk(a, b, 5)

In [58]:
def get_added_products(current_choice, last_choice):
    real = []
    for i, c in enumerate(current_choice):
        if c == 1:
            if last_choice[i] == 0:
                real.append(i)
    return real


### Define train_val dataset :
- Select only clients that choose new products in 201506 comparing with 201505

In [None]:
trainval_df = data_df.sort_values(['fecha_dato', 'ncodpers'])

In [None]:
dates1 = months[:-1]
dates2 = months[1:]

In [None]:
print dates1, dates2

In [None]:
tmp_df = trainval_df[['fecha_dato','ncodpers']]
tmp_df.loc[:,'target'] = trainval_df[TARGET_LABELS].sum(axis=1)
v1 = tmp_df[tmp_df['fecha_dato'].isin(dates2)]['target'].values
v2 = tmp_df[tmp_df['fecha_dato'].isin(dates1)]['target'].values
ll = min(len(v1), len(v2))
indices = tmp_df.index[ll:]
trainval_df.loc[indices,'diff'] = pd.Series(v1 - v2, index=indices)
del tmp_df, v1, v2

In [None]:
trainval_df.sort_values(['ncodpers', 'fecha_dato']).head(10)

In [None]:
X = trainval_df[(trainval_df['fecha_dato'].isin(dates2)) & (trainval_df['diff'] > 0)][features]
Y = trainval_df[(trainval_df['fecha_dato'].isin(dates2)) & (trainval_df['diff'] > 0)][TARGET_LABELS]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X.values, Y.values, train_size=0.70)

In [None]:
from sklearn.preprocessing import StandardScaler
X_train = StandardScaler().fit_transform(X_train)
X_val = StandardScaler().fit_transform(X_val)

In [None]:
print X_train.shape, X_val.shape
print Y_train.shape, Y_val.shape

Setup NN model

Keras model :

Sequential
- Dense
- Activation
- Dropout

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import np_utils

Setup model 1

In [None]:
model = Sequential()
model.add(Dense(43, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train, Y_train, nb_epoch=1500, batch_size=10000, verbose=2)

In [None]:
# summarize performance of the model
scores = model.evaluate(X_val, Y_val, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

Setup model 2

In [None]:
model = Sequential()
model.add(Dense(43, init='uniform', input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train, Y_train, nb_epoch=1000, batch_size=10000, verbose=2)

In [None]:
# summarize performance of the model
scores = model.evaluate(X_val, Y_val, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

Setup model 3

In [None]:
model = Sequential()
model.add(Dense(50, init='uniform', input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])

In [None]:
model.fit(X_train, Y_train, nb_epoch=1000, batch_size=10000, verbose=2)

In [None]:
# summarize performance of the model
scores = model.evaluate(X_val, Y_val, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

## Prediction part

In [None]:
#del X, Y, X_train, X_val, Y_train, Y_val, trainval_df, data_df

Load the last month from the training dataset to get user last choice 

In [None]:
yearmonth_list = [201605]

In [None]:
lastmonth_df = load_data2(TRAIN_FILE_PATH, yearmonth_list)

In [None]:
minimal_clean_data_inplace(lastmonth_df)

In [None]:
clients_last_choice = lastmonth_df[['ncodpers'] + TARGET_LABELS]

In [None]:
test_df = load_data2(TEST_FILE_PATH, [])

In [None]:
test_df.head()

In [None]:
minimal_clean_data_inplace(test_df)

In [None]:
print test_df.shape
test_df.tail()

Encode non-numerical columns 

In [None]:
preprocess_data_inplace(test_df)

In [None]:
print test_df.shape
test_df.tail()

Make predictions

In [None]:
X_test = test_df[features]

In [None]:
X_test = StandardScaler().fit_transform(X_test)

In [None]:
Y_pred = model.predict(X_test)

In [None]:
Y_pred[:5]