# NN tryouts on SPR data, inspired by Kaggle Forum "When less is more"


- Load m month of data
- Minimal data cleaning
- Feature engineering

- Setup model

TRAIN : 201505, 201506

- FEATURES <- get_profile(ALL_FEATURES) : Select some profiles

- Train on all users
- Select only users that added products in 201506 comparing to 201505


month data is like [FEATURES|TARGETS]

X_train = [FEATURES] of the training part
Y_train = [TARGETS]  of the training part

X_val = [FEATURES] of the validation part
Y_val = [TARGETS]  of the validation part

TEST : 201606

- All users

[FEATURES]

X_test = [FEATURES]

In [1]:
import os
import numpy as np
import pandas as pd

import logging
logging.getLogger().handlers = []
logging.basicConfig(level=logging.INFO)

import matplotlib.pylab as plt
%matplotlib inline



In [2]:
from common import load_data2, minimal_clean_data_inplace, preprocess_data_inplace, TARGET_LABELS
from common import get_added_products, remove_last_choice, apk, map7_score2

from visualization import visualize_train_test, visualize_folds, compare_two_datasets, compare_folds, compare_folds2

In [3]:
TRAIN_FILE_PATH = os.path.join("..", "data", "train_ver2.csv")
TEST_FILE_PATH = os.path.join("..", "data", "test_ver2.csv")

Load data + minimal cleaning + preprocessing

* 201505 - to get the clients last choice 
* 201506 - to train on

In [4]:
yearmonth_list = [201504, 201505] 
nb_months = len(yearmonth_list)
nb_clients = 250000
# nb_clients = 'max'

In [5]:
data_df = load_data2(TRAIN_FILE_PATH, yearmonth_list, nb_clients)
minimal_clean_data_inplace(data_df)
preprocess_data_inplace(data_df)

months = data_df['fecha_dato'].unique()
clients = data_df['ncodpers'].unique()
assert len(clients) == (data_df['ncodpers'].value_counts() == nb_months).sum()
ll = len(clients)
for m in months:
    l = len(data_df[data_df['fecha_dato'] == m]['ncodpers'].unique())
    assert l == ll, "Number of clients should be identical for all monthes. (%s, %s, %s)" % (m, l, ll)

INFO:root:-- Select 250000 clients
INFO:root:- Number of lines with unknown data : 2804
INFO:root:- Number of columns with nan : 9


In [6]:
features = [
    u'ind_empleado', u'pais_residencia',
    u'sexo', u'age', u'ind_nuevo', u'antiguedad', u'indrel',
    u'ult_fec_cli_1t', u'indrel_1mes', u'tiprel_1mes', u'indresi',
    u'indext', u'conyuemp', u'canal_entrada', u'indfall', u'nomprov',
    u'ind_actividad_cliente', u'renta', u'segmento'    
]

last_choice_mask = data_df['fecha_dato'] == months[-2]
train_month_mask = data_df['fecha_dato'] == months[-1]

In [7]:
from common import PREPROCESS_LABEL_ENCODERS
#PREPROCESS_LABEL_ENCODERS

Create profiles and create models for profiles

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Merge
from keras.utils import np_utils

Using Theano backend.


## Select only clients that choose new products in 201506 comparing with 201505

201505 [FEATURES|TARGETS]  
201506 [FEATURES|TARGETS]  

TARGETS_201505 -> clients_last_choice
TARGETS_201506 - TARGETS_201505 -> active_clients

Cross-validation : 
FEATURES_201506 -> (Split train/val) :
  --> (0.75*FEATURES) -> (select only active clients) -> X_train
  --> 0.25*FEATURES -> X_val



In [9]:
trainval_df = data_df.sort_values(['fecha_dato', 'ncodpers'])
last_choice_mask = trainval_df['fecha_dato'] == months[-2]
train_month_mask = trainval_df['fecha_dato'] == months[-1]

In [10]:
def add_diff_inplace(df, last_choice_mask, train_month_mask):
    tmp_df = df[['fecha_dato','ncodpers']]
    tmp_df.loc[:,'target'] = df[TARGET_LABELS].sum(axis=1)
    v1 = tmp_df[train_month_mask]['target'].values
    v2 = tmp_df[last_choice_mask]['target'].values
    ll = min(len(v1), len(v2))
    indices = tmp_df.index[ll:]
    df.loc[indices,'diff'] = pd.Series(v1 - v2, index=indices)
    del tmp_df, v1, v2

In [11]:
add_diff_inplace(trainval_df, last_choice_mask, train_month_mask)
print trainval_df.shape

(497196, 47)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [12]:
clients_last_choice = trainval_df[last_choice_mask][['ncodpers'] + TARGET_LABELS].sort_values(['ncodpers'])
X = trainval_df[train_month_mask][['ncodpers', 'diff'] + features]
Y = trainval_df[train_month_mask][['ncodpers'] + TARGET_LABELS]

In [13]:
# Select only clients from test data
clients_last_choice = clients_last_choice[clients_last_choice['ncodpers'].isin(X['ncodpers'])]
assert (X['ncodpers'].values == clients_last_choice['ncodpers'].values).all(), "WTF"

### Feature engineering

In [14]:
def get_age_group_index(age):
    if age < 10:
        return -3
    elif age < 15:
        return -2        
    elif age < 18:
        return -1    
    elif age < 23:
        return 0
    elif age < 25:
        return 1
    elif age < 27:
        return 2
    elif age < 28:
        return 3    
    elif age < 32:
        return 4
    elif age < 37:
        return 5    
    elif age < 42:
        return 6
    elif age < 47:
        return 7
    elif age < 52:
        return 8
    elif age < 57:
        return 9
    elif age < 60:
        return 10
    elif age < 65:
        return 11
    elif age < 70:
        return 12
    elif age < 75:
        return 13
    elif age < 80:
        return 14
    else:
        return 15

def get_income_group_index(income):
    if income < 0:
        return -1
    elif income < 45542.97:
        return 1
    elif income < 57629.67:
        return 2
    elif income < 68211.78:
        return 3
    elif income < 78852.39:
        return 4
    elif income < 90461.97:
        return 5
    elif income < 103855.23:
        return 6
    elif income < 120063.00:
        return 7
    elif income < 141347.49:
        return 8
    elif income < 173418.36:
        return 9
    elif income < 234687.12:
        return 10
    else:
        return 11

In [15]:
X.loc[:,'age'] = X['age'].apply(get_age_group_index)
X.loc[:,'renta'] = X['renta'].apply(get_income_group_index)

In [None]:
# X_countries = pd.get_dummies(X['pais_residencia'])
# X_countries.head()

In [None]:
# def drop_const_cols(df):
#     """
#     Method to remove constant columns
#     """
#     indices = df.index
#     return df.loc[:, (df != df.ix[indices[0]]).any()]

# X = drop_const_cols(X)

In [None]:
# X_dummy = pd.DataFrame()

# X_dummy = pd.concat([X_dummy, X['diff']], axis=1)
# ff = list(features)
# ff.remove('indrel_1mes')
# ff.remove('tiprel_1mes')
# ff = list(set(ff) & set(X.columns))

# for f in ff:
#     print "Process dummification: ", f
#     X_dummy = pd.concat([X_dummy, pd.get_dummies(X[f])], axis=1)
    
# print X_dummy.shape
# X_dummy.head()

In [20]:
X.head()

Unnamed: 0,ncodpers,diff,ind_empleado,pais_residencia,sexo,age,ind_nuevo,antiguedad,indrel,ult_fec_cli_1t,...,tiprel_1mes,indresi,indext,conyuemp,canal_entrada,indfall,nomprov,ind_actividad_cliente,renta,segmento
1262323,15890,0.0,1,0,1,11,0,244,1.0,0,...,0,0,0,1,2,0,1,1.0,4,1
1051663,15893,0.0,0,0,1,11,0,211,1.0,0,...,0,0,0,1,2,0,1,1.0,11,1
1051665,15895,0.0,1,0,0,8,0,244,1.0,0,...,0,0,0,1,2,0,1,1.0,10,1
1051668,15900,0.0,2,0,1,8,0,244,1.0,0,...,0,0,0,1,2,0,1,1.0,7,1
1051672,15906,-1.0,0,0,0,9,0,171,1.0,0,...,0,0,0,1,2,0,1,1.0,5,2


### Run KFold Cross-validation 

In [21]:
def create_model(input_dim, output_dim):
        
    model = Sequential()
    model.add(Dense(input_dim, init='uniform', input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.30))
    model.add(Dense(50, activation='relu'))
    model.add(Dropout(0.20))
    model.add(Dense(30, activation='relu'))
    model.add(Dropout(0.10))
    model.add(Dense(output_dim, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return model

In [22]:
from sklearn.model_selection import KFold

nb_folds=3

def max_map7_score_per_fold(x_df, y_df, clients_last_choice_df, nb_folds):    
    kf = KFold(n_splits=nb_folds)
    scores = []
    for train_index, test_index in kf.split(range(x_df.shape[0])):
        X_val = x_df.loc[x_df.index[test_index], :]
        Y_val = y_df.loc[y_df.index[test_index], :]
        CLC_val = clients_last_choice_df.loc[clients_last_choice_df.index[test_index], :]                
        x_val, y_val = prepare_to_test(X_val, Y_val)
        scores.append(map7_score2(y_val, y_val, CLC_val[TARGET_LABELS].values))  
    return scores        

In [23]:
# CROSS VALIDATION
from sklearn.preprocessing import StandardScaler
n_highest = 7

def prepare_to_fit(X_train, Y_train):
    # Select only active client for the training part
    mask = X_train['diff'] > 0
    x_train = X_train[mask].drop(['diff'], axis=1).values
    y_train = Y_train[mask][TARGET_LABELS].values
    x_train = StandardScaler().fit_transform(x_train)
    return x_train, y_train

def prepare_to_fit_nomask(X_train, Y_train):
    y_train = Y_train[TARGET_LABELS].values
    x_train = X_train.drop(['diff'], axis=1).values
    x_train = StandardScaler().fit_transform(x_train)
    return x_train, y_train

def prepare_to_test(X_val, Y_val=None):
    if Y_val is not None:
        y_val = Y_val[TARGET_LABELS].values
    
    x_val = []
    x_val = X_val.drop(['diff'], axis=1).values    
    x_val = StandardScaler().fit_transform(x_val)  
    
    if Y_val is not None:
        return x_val, y_val
    else:
        return x_val

def cross_val_score2(data, 
                     nb_folds=5, 
                     prepare_to_fit_func=prepare_to_fit, 
                     prepare_to_test_func=prepare_to_test, 
                     create_model_func=create_model):
    
    x_df, y_df, clients_last_choice_df = data
    kf = KFold(n_splits=nb_folds)
    scores = []
    
    for train_index, test_index in kf.split(range(x_df.shape[0])):
        X_train, X_val = x_df.loc[x_df.index[train_index], :], x_df.loc[x_df.index[test_index], :]
        Y_train, Y_val = y_df.loc[y_df.index[train_index], :], y_df.loc[y_df.index[test_index], :]
        CLC_val = clients_last_choice_df.loc[clients_last_choice_df.index[test_index], :]
                
        x_train, y_train = prepare_to_fit_func(X_train, Y_train)
        x_val, y_val = prepare_to_test_func(X_val, Y_val)
        
        logging.info("- Train/Val shapes : {}, {} | {}, {}".format(
                [i.shape for i in x_train] if isinstance(x_train, list) else x_train.shape, 
                [i.shape for i in x_val] if isinstance(x_val, list) else x_val.shape, 
                y_train.shape, 
                y_val.shape)
        )                
            
        logging.info("- Create the model : ")
        estimator = create_model_func(x_train.shape[1], len(TARGET_LABELS))
        logging.info("- Fit the model")
        hist = estimator.fit(x_train, y_train, nb_epoch=250, batch_size=5000, verbose=0)        
        for key in hist.history:            
            logging.info("-- %s : min=%f, max=%f" % (key, np.min(hist.history[key]), np.max(hist.history[key])))
        
        logging.info("- Predict using trained model")
        y_pred = estimator.predict(x_val, verbose=0)
        logging.info("- Compute map7 score")
        scores.append(map7_score2(y_val, y_pred, CLC_val[TARGET_LABELS].values))        
    
    return np.array(scores)

In [25]:
results = cross_val_score2((X.drop(['ncodpers'], axis=1), Y, clients_last_choice), 
                           nb_folds=nb_folds)
print "Cross-Validation \n %i | %f | %f | %f | %.5f " % (nb_folds, results.min(), results.mean(), results.max(), results.std())

INFO:root:- Train/Val shapes : (4267, 19), (82866, 19) | (4267, 24), (82866, 24)
INFO:root:- Create the model : 
INFO:root:- Fit the model
INFO:root:-- acc : min=0.505585, max=0.892567
INFO:root:-- loss : min=0.233187, max=0.693873
INFO:root:- Predict using trained model
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0226522433043
INFO:root:- Train/Val shapes : (5165, 19), (82866, 19) | (5165, 24), (82866, 24)
INFO:root:- Create the model : 
INFO:root:- Fit the model
INFO:root:-- acc : min=0.552331, max=0.867554
INFO:root:-- loss : min=0.271221, max=0.690441
INFO:root:- Predict using trained model
INFO:root:- Compute map7 score
INFO:root:-- Predicted map7 score: 0.0168355073347
INFO:root:- Train/Val shapes : (4844, 19), (82866, 19) | (4844, 24), (82866, 24)
INFO:root:- Create the model : 
INFO:root:- Fit the model
INFO:root:-- acc : min=0.509152, max=0.850029
INFO:root:-- loss : min=0.310334, max=0.692694
INFO:root:- Predict using trained model
INFO:root:- Compute 

Cross-Validation 
 3 | 0.016836 | 0.020387 | 0.022652 | 0.00254 


### Test 1 : Train on active clients + feature engineering

#### Data: 

- yearmonth_list = [201505, 201506]
- nb_clients = 250000

#### Feature engineering

- age -> age group
- income -> income group


#### Model

- 75 -> 50 -> 30 ->
- sigmoid, binary_crossentropy, nadam, accuracy, 

**Conf:**
- batch_size=2000

*Cross-validation results :* 

 Nb epoch | Nb folds | Min MAP@7 | Mean MAP@7 | Max MAP@7 | STD MAP@7
 --- | --- | --- | --- | --- | ---
 150 | 3 | 0.025659 | 0.029959 | 0.037639 | 0.00544 


#### Model 1 

- 50 -> 30 ->
- sigmoid, binary_crossentropy, nadam, accuracy, 

**Conf:**
- batch_size=2000

*Cross-validation results :* 

 Nb epoch | Nb folds | Min MAP@7 | Mean MAP@7 | Max MAP@7 | STD MAP@7
 --- | --- | --- | --- | --- | ---
 150 | 3 | 0.025802 | 0.030137 | 0.037217 | 0.00505 


Kaggle : 0.0197579

## Train model for predictions

In [26]:
def train_model(X_train, Y_train):

    x_train, y_train = prepare_to_fit(X_train, Y_train)
    logging.info("- Train data shapes : {}, {}".format(
            [i.shape for i in x_train] if isinstance(x_train, list) else x_train.shape, 
            y_train.shape)
    )                

    logging.info("- Create the model")
    
    estimator = create_model(x_train.shape[1], len(TARGET_LABELS))
    logging.info("- Fit the model")
    hist = estimator.fit(x_train, y_train, nb_epoch=150, batch_size=2000, verbose=0)        
    for key in hist.history:            
        logging.info("-- %s : min=%f, max=%f" % (key, np.min(hist.history[key]), np.max(hist.history[key])))
    return estimator

estimator = train_model(X[['diff'] + features], Y)

INFO:root:- Train data shapes : (7138, 19), (7138, 24)
INFO:root:- Create the model
INFO:root:- Fit the model
INFO:root:-- acc : min=0.591360, max=0.869682
INFO:root:-- loss : min=0.271273, max=0.689413


Check score on the data 2016

In [28]:
yearmonth_list = [201604, 201605] 
nb_months = len(yearmonth_list)
nb_clients = 'max'

In [29]:
val_df = load_data2(TRAIN_FILE_PATH, yearmonth_list, nb_clients)
minimal_clean_data_inplace(val_df)
preprocess_data_inplace(val_df)

months = val_df['fecha_dato'].unique()
clients = val_df['ncodpers'].unique()
assert len(clients) == (val_df['ncodpers'].value_counts() == nb_months).sum()
ll = len(clients)
for m in months:
    l = len(val_df[val_df['fecha_dato'] == m]['ncodpers'].unique())
    assert l == ll, "Number of clients should be identical for all monthes. (%s, %s, %s)" % (m, l, ll)

INFO:root:-- Select max clients
INFO:root:- Number of lines with unknown data : 0
INFO:root:- Number of columns with nan : 10


In [9]:
val_df = val_df.sort_values(['fecha_dato', 'ncodpers'])
last_choice_mask = val_df['fecha_dato'] == months[-2]
train_month_mask = trainval_df['fecha_dato'] == months[-1]

In [10]:
def add_diff_inplace(df, last_choice_mask, train_month_mask):
    tmp_df = df[['fecha_dato','ncodpers']]
    tmp_df.loc[:,'target'] = df[TARGET_LABELS].sum(axis=1)
    v1 = tmp_df[train_month_mask]['target'].values
    v2 = tmp_df[last_choice_mask]['target'].values
    ll = min(len(v1), len(v2))
    indices = tmp_df.index[ll:]
    df.loc[indices,'diff'] = pd.Series(v1 - v2, index=indices)
    del tmp_df, v1, v2

In [11]:
add_diff_inplace(trainval_df, last_choice_mask, train_month_mask)
print trainval_df.shape

(497196, 47)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [12]:
clients_last_choice = trainval_df[last_choice_mask][['ncodpers'] + TARGET_LABELS].sort_values(['ncodpers'])
X = trainval_df[train_month_mask][['ncodpers', 'diff'] + features]
Y = trainval_df[train_month_mask][['ncodpers'] + TARGET_LABELS]

In [13]:
# Select only clients from test data
clients_last_choice = clients_last_choice[clients_last_choice['ncodpers'].isin(X['ncodpers'])]
assert (X['ncodpers'].values == clients_last_choice['ncodpers'].values).all(), "WTF"

## Prediction on test data

Load the last month from the training dataset to get user last choice and remove it from predictions

In [None]:
yearmonth_list = [201605]
lastmonth_df = load_data2(TRAIN_FILE_PATH, yearmonth_list)
minimal_clean_data_inplace(lastmonth_df)

test_df = load_data2(TEST_FILE_PATH, [])
minimal_clean_data_inplace(test_df)
preprocess_data_inplace(test_df)
test_df = test_df.sort_values(['ncodpers'])

clients_last_choice_test = lastmonth_df[['ncodpers'] + TARGET_LABELS].sort_values(['ncodpers'])
# Select only clients from test data
clients_last_choice_test = clients_last_choice_test[clients_last_choice_test['ncodpers'].isin(test_df['ncodpers'])]
assert (test_df['ncodpers'].values == clients_last_choice_test['ncodpers'].values).all(), "WTF"

In [None]:
X_test = test_df[['ncodpers'] + features]

In [None]:
X_test.loc[:,'age'] = X_test['age'].apply(get_age_group_index)
X_test.loc[:,'renta'] = X_test['renta'].apply(get_income_group_index)

Make predictions

In [None]:
def get_submission(y_pred, clients, clc, target_labels, n_highest=7):
    predicted_added_products = np.argsort(y_pred, axis=1)
    predicted_added_products = predicted_added_products[:,::-1][:,:n_highest]
    added_products_col = []
    count = 0 
    for products, last_choice in zip(predicted_added_products, clc):
        predictions = remove_last_choice(products, last_choice)
        added_products_col.append(' '.join([target_labels[i] for i in predictions]))
        count+=1
        if count % 100000 == 0:
            logging.info("Elapsed : %i", count)
            
    out = pd.DataFrame(data={'ncodpers': clients, 'added_products': added_products_col}, columns=['ncodpers', 'added_products'])
    return out

In [None]:

data = {
    0: {'x': X_test, 'clc': clients_last_choice_test},
}

submissions = []
for i in data.keys():
    x = data[i]['x']
    clc = data[i]['clc']
    x_test = prepare_to_test(x)
    y_pred = estimator.predict(x_test, verbose=0)

    clients = x['ncodpers']
    submission = get_submission(y_pred, clients, clc[TARGET_LABELS].values, TARGET_LABELS, 7)
    print submission.head()
    submissions.append(submission)

submission = pd.concat(submissions)

In [None]:
submission.head()

Get submission DataFrame and write csv file

In [None]:
from datetime import datetime
import csv

logging.info('- Generate submission')
submission_file = '../results/submission_' + \
                  str(datetime.now().strftime("%Y-%m-%d-%H-%M")) + \
                  '.csv'

submission.to_csv(submission_file, index=False, index_label=False)