# NN tryouts on SPR data, inspired by Kaggle Forum "When less is more"


- Load m month of data
- Minimal data cleaning
- Feature engineering

- Setup model

TRAIN : 201505, 201506

- FEATURES <- get_profile(ALL_FEATURES) : Select some profiles

- Train on all users
- Select only users that added products in 201506 comparing to 201505


month data is like [FEATURES|TARGETS]

X_train = [FEATURES] of the training part
Y_train = [TARGETS]  of the training part

X_val = [FEATURES] of the validation part
Y_val = [TARGETS]  of the validation part

TEST :
201606
- All users
[FEATURES]
X_test = [FEATURES]

In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.DEBUG)

In [2]:
from common import load_data2, minimal_clean_data_inplace, preprocess_data_inplace, TARGET_LABELS

In [3]:
TRAIN_FILE_PATH = os.path.join("..", "data", "train_ver2.csv")
TEST_FILE_PATH = os.path.join("..", "data", "test_ver2.csv")

Load data + minimal cleaning + preprocessing

* 201505 - to get the clients last choice 
* 201506 - to train on

In [4]:
yearmonth_list = [201505, 201506] 
nb_months = len(yearmonth_list)
nb_clients = 100000

In [5]:
data_df = load_data2(TRAIN_FILE_PATH, yearmonth_list, nb_clients)
minimal_clean_data_inplace(data_df)
preprocess_data_inplace(data_df)

months = data_df['fecha_dato'].unique()
clients = data_df['ncodpers'].unique()
assert len(clients) == (data_df['ncodpers'].value_counts() == nb_months).sum()
ll = len(clients)
for m in months:
    l = len(data_df[data_df['fecha_dato'] == m]['ncodpers'].unique())
    assert l == ll, "Number of clients should be identical for all monthes. (%s, %s, %s)" % (m, l, ll)

INFO:root:-- Select 100000 clients
INFO:root:- Number of lines with unknown data : 568
INFO:root:- Number of columns with nan : 9


In [6]:
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

last_choice_mask = data_df['fecha_dato'] == months[-2]
train_month_mask = data_df['fecha_dato'] == months[-1]

Create profiles 

In [42]:
profiles = {
     0: ['pais_residencia', 'sexo', 'age', 'ind_nuevo', 'segmento', 'ind_empleado', 'ind_actividad_cliente', 'indresi'],
     1: ['pais_residencia', 'sexo', 'age', 'segmento', 'nomprov'],
#      2: ['pais_residencia', 'sexo', 'age', 'segmento', 'antiguedad'],
#      3: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_nuevo'],
#      4: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_actividad_cliente'],
#      5: ['pais_residencia', 'sexo', 'age', 'segmento', 'canal_entrada'],
#      6: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_nuevo', 'canal_entrada'],
#      7: ['pais_residencia', 'sexo', 'age', 'segmento', 'ind_empleado'],
#      8: ['pais_residencia', 'sexo', 'age', 'segmento', 'renta'],
#      9: ['sexo', 'age', 'segmento'],    
#      10: ['sexo', 'age', 'segmento', 'ind_actividad_cliente']
#      11: ['nomprov', 'ind_nuevo', 'antiguedad', 'renta', 'ind_actividad_cliente', 'canal_entrada']
}

Create models for profiles

In [43]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Merge
from keras.utils import np_utils


In [61]:
def create_model():
    ll = len(TARGET_LABELS)
    final_model = Sequential()
    models = []
    for key in profiles:
        length = len(profiles[key])
        model = Sequential()
        model.add(Dense(2*length + 15, init='uniform', input_shape=(length,), activation='relu'))
        model.add(Dropout(0.15))
        model.add(Dense(10 + length, activation='relu'))
        model.add(Dropout(0.15))
        model.add(Dense(ll, activation='sigmoid'))
        models.append(model)

    merged = Merge(models, mode='ave')
    final_model.add(merged)
    final_model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])
    return final_model

Train and score models on X_train, Y_train, X_val, Y_val for each profile

In [62]:
clients_last_choice = data_df[last_choice_mask][['ncodpers'] + TARGET_LABELS]
X = data_df[train_month_mask][['ncodpers'] + features]
Y = data_df[train_month_mask][TARGET_LABELS].values

In [63]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, train_size=0.70)
print X_train.shape, X_val.shape, " | ", Y_train.shape, Y_val.shape

(69801, 20) (29915, 20)  |  (69801, 24) (29915, 24)


In [64]:
from sklearn.preprocessing import StandardScaler

In [65]:
x_train = []
x_val = []

for key in profiles:
    x_train_ = X_train[profiles[key]].values
    x_val_ = X_val[profiles[key]].values    
    x_train_ = StandardScaler().fit_transform(x_train_)
    x_val_ = StandardScaler().fit_transform(x_val_)  

    x_train.append(x_train_)
    x_val.append(x_val_)

logging.info("- Create the model")
model = create_model()
logging.info("- Fit the model")
model.fit(x_train, Y_train, nb_epoch=100, batch_size=10000, verbose=0)
logging.info("- Evaluate the model")
scores = model.evaluate(x_val, Y_val, verbose=0)
logging.info("-- Model Accuracy : %.2f%%" % (scores[1]*100))        
logging.info("- Predict using trained model")
Y_pred = model.predict(x_val, verbose=0)

INFO:root:- Create the model
INFO:root:- Fit the model
INFO:root:- Evaluate the model
INFO:root:-- Model Accuracy : 78.74%
INFO:root:- Predict using trained model


Score with MAP@7


In [58]:
def get_added_products(current_choice, last_choice):
    """
    current_choice is e.g. [0, 0, 1, 0, ..., 1], of length 24
    """
    real = []
    for i, c in enumerate(current_choice):
        if c == 1:
            if last_choice[i] == 0:
                real.append(i)
    return real

def remove_last_choice(predictions, last_choice):
    """
    predictions is a list of product indices
    """
    out = list(predictions)
    for i, c in enumerate(last_choice):
        if c == 1 and i in out:
            out.remove(i)
    return out
    

def apk(actual, predicted, k=7):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0
    
    return score / min(len(actual), k)

Structure `Y_pred` is a dictionary with predictions :
```
Y_pred = {
   [24 target probas], # client 1
   [24 target probas], # client 2
   ...
}
```

Merge predictions from profiles

In [69]:
n_highest = 5
predicted_added_products = np.argsort(Y_pred, axis=1)
predicted_added_products = predicted_added_products[:,::-1][:,:n_highest]

gb = clients_last_choice.groupby('ncodpers')

map7 = 0.0

for i, client, targets in zip(range(len(Y_val)), X_val['ncodpers'], Y_val):
    last_choice = gb.get_group(client)[TARGET_LABELS].values[0].astype(np.uint)
    added_products = get_added_products(targets, last_choice)
    predictions = remove_last_choice(predicted_added_products[i], last_choice)
    score = apk(added_products, predictions)    
    map7 += score    
    
map7 /= len(Y_val)
print('Predicted score: {}'.format(map7))

Predicted score: 0.012727217858



On 10000 client, 5 profiles -> map7 = 0.028

On 100000 client, 5 profiles -> map7 = 0.0190395538682, 0.0220090653677



### Define train_val dataset :
- Select only clients that choose new products in 201506 comparing with 201505

In [None]:
trainval_df = data_df.sort_values(['fecha_dato', 'ncodpers'])

In [None]:
dates1 = months[:-1]
dates2 = months[1:]

In [None]:
print dates1, dates2

In [None]:
tmp_df = trainval_df[['fecha_dato','ncodpers']]
tmp_df.loc[:,'target'] = trainval_df[TARGET_LABELS].sum(axis=1)
v1 = tmp_df[tmp_df['fecha_dato'].isin(dates2)]['target'].values
v2 = tmp_df[tmp_df['fecha_dato'].isin(dates1)]['target'].values
ll = min(len(v1), len(v2))
indices = tmp_df.index[ll:]
trainval_df.loc[indices,'diff'] = pd.Series(v1 - v2, index=indices)
del tmp_df, v1, v2

In [None]:
trainval_df.sort_values(['ncodpers', 'fecha_dato']).head(10)

In [None]:
X = trainval_df[(trainval_df['fecha_dato'].isin(dates2)) & (trainval_df['diff'] > 0)][features]
Y = trainval_df[(trainval_df['fecha_dato'].isin(dates2)) & (trainval_df['diff'] > 0)][TARGET_LABELS]

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(X.values, Y.values, train_size=0.70)

In [None]:
from sklearn.preprocessing import StandardScaler
X_train = StandardScaler().fit_transform(X_train)
X_val = StandardScaler().fit_transform(X_val)

In [None]:
print X_train.shape, X_val.shape
print Y_train.shape, Y_val.shape

Setup NN model

Keras model :

Sequential
- Dense
- Activation
- Dropout

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import np_utils

Setup model 1

In [None]:
model = Sequential()
model.add(Dense(43, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train, Y_train, nb_epoch=1500, batch_size=10000, verbose=2)

In [None]:
# summarize performance of the model
scores = model.evaluate(X_val, Y_val, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

Setup model 2

In [None]:
model = Sequential()
model.add(Dense(43, init='uniform', input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='sigmoid'))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X_train, Y_train, nb_epoch=1000, batch_size=10000, verbose=2)

In [None]:
# summarize performance of the model
scores = model.evaluate(X_val, Y_val, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

Setup model 3

In [None]:
model = Sequential()
model.add(Dense(50, init='uniform', input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(10, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])

In [None]:
model.fit(X_train, Y_train, nb_epoch=1000, batch_size=10000, verbose=2)

In [None]:
# summarize performance of the model
scores = model.evaluate(X_val, Y_val, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))

## Prediction part

In [None]:
#del X, Y, X_train, X_val, Y_train, Y_val, trainval_df, data_df

Load the last month from the training dataset to get user last choice 

In [None]:
yearmonth_list = [201605]

In [None]:
lastmonth_df = load_data2(TRAIN_FILE_PATH, yearmonth_list)

In [None]:
minimal_clean_data_inplace(lastmonth_df)

In [None]:
clients_last_choice = lastmonth_df[['ncodpers'] + TARGET_LABELS]

In [None]:
test_df = load_data2(TEST_FILE_PATH, [])

In [None]:
test_df.head()

In [None]:
minimal_clean_data_inplace(test_df)

In [None]:
print test_df.shape
test_df.tail()

Encode non-numerical columns 

In [None]:
preprocess_data_inplace(test_df)

In [None]:
print test_df.shape
test_df.tail()

Make predictions

In [None]:
X_test = test_df[features]

In [None]:
X_test = StandardScaler().fit_transform(X_test)

In [None]:
Y_pred = model.predict(X_test)

In [None]:
Y_pred[:5]

---------------

Check multiple models training

In [None]:
data = np.random.randn(100,10)
_x_train_1 = data[:, (1,3,5,7,9)]
_x_train_2 = data[:, (0,2,4,6,8)]
_y_train = np.random.randint(10, size=100)

_x_val = np.random.randn(30,5) 
_y_val = np.random.randint(10, size=30)

In [None]:
_models = {}

for i in range(2):
#     model1 = Sequential()
#     model1.add(Dense(15, input_shape=(5,), activation='relu'))
#     model1.add(Dense(1, activation='relu'))
#     model1.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])
    length = 5
    model = Sequential()
    model.add(Dense(2*length + 15, init='uniform', input_shape=(length,), activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(10 + length, activation='relu'))
    model.add(Dropout(0.15))
    model.add(Dense(ll, activation='softmax'))
    model.compile(loss='mae', optimizer='nadam', metrics=['accuracy'])

    _models[i] = model1

In [None]:
for i, _x_train in zip(range(2), [_x_train_1, _x_train_2]):
    model = _models[i]
    model.fit(_x_train, _y_train, nb_epoch=1000, batch_size=100, verbose=0)
    score = model.evaluate(_x_val, _y_val, verbose=0)
    print score