# RecSys Challenge: A two-step approach
Alexis Forest (01948163), Harry Walker (01867526), William Flynn (01997714), Coraline Duval (01930178)


In [6]:
#data wrangling library
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime

#visuals
import matplotlib.pyplot as plt

# Machine Learning - Data & Metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsRegressor

#other
import time
import warnings
warnings.filterwarnings("ignore")
#library for deleting cache memory (when your computer is going to do a big task)
import gc

## Importing data and basic wrangling

In [7]:
buy = pd.read_csv('yoochoose-buys.dat', sep=',', names = ['session_id', 'time', 'item_id', 'price', 'quantity'])
click = pd.read_csv('yoochoose-clicks.dat', sep=',', names = ['session_id', 'time', 'item_id', 'category'])

In [8]:
click['time'] = pd.to_datetime(click['time'])
buy['time'] = pd.to_datetime(buy['time'])

In [9]:
buy.head()

Unnamed: 0,session_id,time,item_id,price,quantity
0,420374,2014-04-06 18:44:58.314000+00:00,214537888,12462,1
1,420374,2014-04-06 18:44:58.325000+00:00,214537850,10471,1
2,281626,2014-04-06 09:40:13.032000+00:00,214535653,1883,1
3,420368,2014-04-04 06:13:28.848000+00:00,214530572,6073,1
4,420368,2014-04-04 06:13:28.858000+00:00,214835025,2617,1


In [10]:
click.head()

Unnamed: 0,session_id,time,item_id,category
0,1,2014-04-07 10:51:09.277000+00:00,214536502,0
1,1,2014-04-07 10:54:09.868000+00:00,214536500,0
2,1,2014-04-07 10:54:46.998000+00:00,214536506,0
3,1,2014-04-07 10:57:00.306000+00:00,214577561,0
4,2,2014-04-07 13:56:37.614000+00:00,214662742,0


### Product and Category Figures

There are 52,000 different products shared in 340 categories. Only around 20,000 products have ever been bought.

In [11]:
len(click.item_id.unique())

52739

In [12]:
len(click.category.unique())

340

In [13]:
len(buy.item_id.unique())

19949

The number of items among the categories is really sparse, as a lot of categories have a small number of items. For instance, 60 categories only have 1 item. A lot of items have several categories.

In [14]:
category_item = click.groupby('item_id').agg({'category' : "nunique"})
category_item = category_item.sort_values('category', ascending = False)

In [15]:
len(category_item.loc[category_item.category > 1])

36706

## 1. First Classifier - Predicting buy events for test sessions

### 1.1. Creation of training set - Session-specific Features

### Time spent on item

To have a deeper understanding of each session, we need to calculate the time spent on each item by the user (time of each click). This baseline feature will be used to extract a lot of other features such as total duration of a session, max time spent on an object, etc. To calculate it, we shift the dataset by -1 row, and we concatenate the two datasets. Now, on one single row, we have the time of the click plus the time of the following click. The difference of both dates gives us the time spent on the item.

In [16]:
col = ['session_id', 'time'] 
click_shifted = click.shift(-1).loc[:,col]
click_shifted.columns = ['session_id_1', 'time_1'] 

click = pd.concat([click, click_shifted], axis = 1, join = 'inner')
click

Unnamed: 0,session_id,time,item_id,category,session_id_1,time_1
0,1,2014-04-07 10:51:09.277000+00:00,214536502,0,1.0,2014-04-07 10:54:09.868000+00:00
1,1,2014-04-07 10:54:09.868000+00:00,214536500,0,1.0,2014-04-07 10:54:46.998000+00:00
2,1,2014-04-07 10:54:46.998000+00:00,214536506,0,1.0,2014-04-07 10:57:00.306000+00:00
3,1,2014-04-07 10:57:00.306000+00:00,214577561,0,2.0,2014-04-07 13:56:37.614000+00:00
4,2,2014-04-07 13:56:37.614000+00:00,214662742,0,2.0,2014-04-07 13:57:19.373000+00:00
...,...,...,...,...,...,...
33003939,11299809,2014-09-25 09:33:22.412000+00:00,214819412,S,11299809.0,2014-09-25 09:43:52.821000+00:00
33003940,11299809,2014-09-25 09:43:52.821000+00:00,214830939,S,11299811.0,2014-09-24 19:02:09.741000+00:00
33003941,11299811,2014-09-24 19:02:09.741000+00:00,214854855,S,11299811.0,2014-09-24 19:02:11.894000+00:00
33003942,11299811,2014-09-24 19:02:11.894000+00:00,214854838,S,11299811.0,2014-09-24 19:02:25.146000+00:00


In [17]:
# we add the time between two clicks from the same session

click['time_diff_clic'] = click.time_1 - click.time 
click['time_diff_clic'] = click['time_diff_clic'].dt.total_seconds()
click.loc[33003943, 'time_diff_clic'] = 0

# we suppress wrong numbers occured because of shift
# when two session id are different, it means we are starting a new session

click['diff_session_tosuppress'] = click.session_id_1 - click.session_id 
mask = click['diff_session_tosuppress'] != 0
click.loc[mask, 'time_diff_clic'] = 0

In [18]:
click = click.drop(columns = ['session_id_1', 'time_1', 'diff_session_tosuppress'])

In [19]:
click

Unnamed: 0,session_id,time,item_id,category,time_diff_clic
0,1,2014-04-07 10:51:09.277000+00:00,214536502,0,180.591
1,1,2014-04-07 10:54:09.868000+00:00,214536500,0,37.130
2,1,2014-04-07 10:54:46.998000+00:00,214536506,0,133.308
3,1,2014-04-07 10:57:00.306000+00:00,214577561,0,0.000
4,2,2014-04-07 13:56:37.614000+00:00,214662742,0,41.759
...,...,...,...,...,...
33003939,11299809,2014-09-25 09:33:22.412000+00:00,214819412,S,630.409
33003940,11299809,2014-09-25 09:43:52.821000+00:00,214830939,S,0.000
33003941,11299811,2014-09-24 19:02:09.741000+00:00,214854855,S,2.153
33003942,11299811,2014-09-24 19:02:11.894000+00:00,214854838,S,13.252


### Number of items in the session with 2 or more clicks

In [20]:
# if item has 2 clicks

def item_with_2c(list_item):
    list_occurence = list(dict(Counter(list_item)).values())
    list_occurence = np.array(list_occurence)
    item_with_2c = sum(list_occurence ==2)
    return item_with_2c

# if item has 3 or more clicks

def item_with_3c(list_item):
    list_occurence = list(dict(Counter(list_item)).values())
    list_occurence = np.array(list_occurence)
    item_with_3c = sum(list_occurence > 2)
    return item_with_3c

#test of aggregators

tmp_click = click[1:2000]
tmp_click.groupby('session_id').agg({'item_id':item_with_2c}).head()

Unnamed: 0_level_0,item_id
session_id,Unnamed: 1_level_1
1,0
2,1
3,0
4,0
6,0


### Adding features to matrix

Along with the features mentioned, we create several additional features for each session:

 - total time user spends on the session
 - median time spent on each item
 - total number of clicks in the session
 - number of unique items in the session
 - max time spent on an item in the session
 

In [21]:
t = time.time()
gc.collect()

matrix = click.groupby('session_id').agg(start = ('time', 'min'), 
                                         end=('time','max'), 
                                         total_time=('time_diff_clic','sum'), 
                                         median_time_clic=('time_diff_clic','median'), 
                                         nb_clic=('time_diff_clic','count'),
                                         nb_2clic=('item_id',item_with_2c), 
                                         nb_3clic=('item_id',item_with_3c),
                                         unique_item_count=('item_id','nunique'),
                                         max_time_on_item=('time_diff_clic','max')                                    
                                        )

print(time.time()-t)
matrix.head()

346.01015973091125


Unnamed: 0_level_0,start,end,total_time,median_time_clic,nb_clic,nb_2clic,nb_3clic,unique_item_count,max_time_on_item
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2014-04-07 10:51:09.277000+00:00,2014-04-07 10:57:00.306000+00:00,351.029,85.219,4,0,0,4,180.591
2,2014-04-07 13:56:37.614000+00:00,2014-04-07 14:02:36.889000+00:00,359.275,60.4005,6,1,0,5,118.642
3,2014-04-02 13:17:46.940000+00:00,2014-04-02 13:30:12.318000+00:00,745.378,249.803,3,0,0,3,495.575
4,2014-04-07 12:09:10.948000+00:00,2014-04-07 12:26:25.416000+00:00,1034.468,517.234,2,0,0,2,1034.468
6,2014-04-06 16:58:20.848000+00:00,2014-04-06 17:02:26.976000+00:00,246.128,123.064,2,0,0,2,246.128


Now we had the objective function to the dataset

In [22]:
session_with_buy = buy.session_id.unique()
session_with_buy

array([  420374,   281626,   420368, ..., 11368691, 11523941, 11423202],
      dtype=int64)

In [23]:
# how many purchases have been made

purchase_mask = matrix.index.isin(session_with_buy)
matrix['purchase'] = 0
matrix.loc[purchase_mask, 'purchase'] = 1

In [24]:
matrix.purchase.sum() / matrix.purchase.count()

0.05510388466516154

We have a dataset containing 9.2 million sessions from which 5.5% have resulted in a purchase. 

### 1.2. Aditional Feature Engineering

It is useful to downcast the memory of some columns. For instance, a string takes much more memory than an integer.
Furthermore, an integer is basically stored as an int64, an int32 or even int16 is enough (divide memory by 2 or 4).

It will be very important for the computer which only has 8GB of RAM as the dataset may become very heavy. The current dataset without additional feature is taking 635MB of RAM non optimized, and it decreases to 400MB better optimized.

In [25]:
#matrix.info()

In [26]:
matrix['purchase'] = matrix['purchase'].astype('int8')
matrix['median_time_clic'] = matrix['median_time_clic'].astype('float32')
matrix['total_time'] = matrix['total_time'].astype('float32')
matrix['nb_clic'] = matrix['nb_clic'].astype('int32')
matrix['nb_3clic'] = matrix['nb_2clic'].astype('int16')
matrix['nb_3clic'] = matrix['nb_3clic'].astype('int16')
matrix['unique_item_count'] = matrix['unique_item_count'].astype('int16')
matrix['max_time_on_item'] = matrix['max_time_on_item'].astype('float32')

In [27]:
#matrix.info()

### Day and Month Features

In [28]:
# day column

matrix['day'] = matrix.start.dt.strftime('%w')
matrix['day'] = matrix['day'].astype('int8')

# month column

matrix['month'] = matrix.start.dt.month
matrix['month'] = matrix['month'].astype('int8')

matrix.head()

Unnamed: 0_level_0,start,end,total_time,median_time_clic,nb_clic,nb_2clic,nb_3clic,unique_item_count,max_time_on_item,purchase,day,month
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,2014-04-07 10:51:09.277000+00:00,2014-04-07 10:57:00.306000+00:00,351.028992,85.219002,4,0,0,4,180.591003,0,1,4
2,2014-04-07 13:56:37.614000+00:00,2014-04-07 14:02:36.889000+00:00,359.274994,60.400501,6,1,1,5,118.641998,0,1,4
3,2014-04-02 13:17:46.940000+00:00,2014-04-02 13:30:12.318000+00:00,745.377991,249.802994,3,0,0,3,495.575012,0,3,4
4,2014-04-07 12:09:10.948000+00:00,2014-04-07 12:26:25.416000+00:00,1034.468018,517.234009,2,0,0,2,1034.468018,0,1,4
6,2014-04-06 16:58:20.848000+00:00,2014-04-06 17:02:26.976000+00:00,246.128006,123.064003,2,0,0,2,246.128006,0,0,4


In [29]:
click.head()

Unnamed: 0,session_id,time,item_id,category,time_diff_clic
0,1,2014-04-07 10:51:09.277000+00:00,214536502,0,180.591
1,1,2014-04-07 10:54:09.868000+00:00,214536500,0,37.13
2,1,2014-04-07 10:54:46.998000+00:00,214536506,0,133.308
3,1,2014-04-07 10:57:00.306000+00:00,214577561,0,0.0
4,2,2014-04-07 13:56:37.614000+00:00,214662742,0,41.759


### Top Item Count

An item which appears in the top 25% of items purchased is classified as popular. The top_item_count feature represents how many popular items the customer clicked on in the session. Items which are in the top 25% should be more likely to be recommended.

In [30]:
# create top 25% quartile

quartile_item = list(buy.item_id.value_counts()[0:5000].index)
click['top_item'] = click.item_id.apply(lambda x: 1 if x in quartile_item else 0) # value of 1 assigned if item falls in top 25%

# count total for each session

matrix = matrix.merge(click.groupby('session_id').agg({'top_item':'sum'}).rename(
    columns = {'top_item': 'top_item_count'}), on = 'session_id', how = 'inner')

In [31]:
print('Purchase and feature correlation:',matrix['purchase'].corr(matrix['top_item_count']))
matrix.head(3)

Purchase and feature correlation: 0.20934347029857694


Unnamed: 0_level_0,start,end,total_time,median_time_clic,nb_clic,nb_2clic,nb_3clic,unique_item_count,max_time_on_item,purchase,day,month,top_item_count
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2014-04-07 10:51:09.277000+00:00,2014-04-07 10:57:00.306000+00:00,351.028992,85.219002,4,0,0,4,180.591003,0,1,4,1
2,2014-04-07 13:56:37.614000+00:00,2014-04-07 14:02:36.889000+00:00,359.274994,60.400501,6,1,1,5,118.641998,0,1,4,1
3,2014-04-02 13:17:46.940000+00:00,2014-04-02 13:30:12.318000+00:00,745.377991,249.802994,3,0,0,3,495.575012,0,3,4,3


## 1.3 Model Training

In [32]:
# add purchase column to end of matrix

matrix = matrix[[c for c in matrix if c not in ['purchase']] + ['purchase']]
matrix.head(3)

Unnamed: 0_level_0,start,end,total_time,median_time_clic,nb_clic,nb_2clic,nb_3clic,unique_item_count,max_time_on_item,day,month,top_item_count,purchase
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2014-04-07 10:51:09.277000+00:00,2014-04-07 10:57:00.306000+00:00,351.028992,85.219002,4,0,0,4,180.591003,1,4,1,0
2,2014-04-07 13:56:37.614000+00:00,2014-04-07 14:02:36.889000+00:00,359.274994,60.400501,6,1,1,5,118.641998,1,4,1,0
3,2014-04-02 13:17:46.940000+00:00,2014-04-02 13:30:12.318000+00:00,745.377991,249.802994,3,0,0,3,495.575012,3,4,3,0


### Data split

All data is for 2014. We have data from months April to September. Use 60% training, 20% validation, 20% testing. We drop those with negative values in total_time (outliers) which are 26%.

In [33]:
ml_matrix = matrix.sort_values('month')
ml_matrix = ml_matrix[ml_matrix.total_time >= 0]

In [34]:
ml_matrix.head()

Unnamed: 0_level_0,start,end,total_time,median_time_clic,nb_clic,nb_2clic,nb_3clic,unique_item_count,max_time_on_item,day,month,top_item_count,purchase
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,2014-04-07 10:51:09.277000+00:00,2014-04-07 10:57:00.306000+00:00,351.028992,85.219002,4,0,0,4,180.591003,1,4,1,0
1274674,2014-04-20 15:14:10.360000+00:00,2014-04-20 15:14:10.360000+00:00,0.0,0.0,1,0,0,1,0.0,0,4,1,0
1274673,2014-04-19 05:49:25.680000+00:00,2014-04-19 05:57:18.139000+00:00,472.459015,144.705994,4,1,1,3,183.046997,6,4,4,0
1274672,2014-04-16 06:01:33.771000+00:00,2014-04-16 06:01:35.469000+00:00,1.698,0.849,2,0,0,2,1.698,3,4,2,0
1274671,2014-04-16 17:28:52.869000+00:00,2014-04-16 18:04:53.976000+00:00,2161.106934,77.750504,8,0,0,8,1594.262939,3,4,3,0


In [35]:
## Train-validation-test split

# get indices for data split

print(round(len(ml_matrix)))
print(round(len(ml_matrix)*0.6)) # train
print(round(len(ml_matrix)*0.6 + len(ml_matrix)*0.2)) # validation
print(round(len(ml_matrix)*0.6 + len(ml_matrix)*0.4)) # test

X = ml_matrix.drop(['purchase', 'start','end','day'],axis=1)
Y = ml_matrix["purchase"]

x_train = X.iloc[0:4096005]
x_validate = X.iloc[4096006:5461341]
x_test = X.iloc[5461342:6826676]

y_train = Y.iloc[0:4096005]
y_validate = Y.iloc[4096006:5461341]
y_test = Y.iloc[5461342:6826676]

9249729
5549837
7399783
9249729


In [36]:
X.head()

Unnamed: 0_level_0,total_time,median_time_clic,nb_clic,nb_2clic,nb_3clic,unique_item_count,max_time_on_item,month,top_item_count
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,351.028992,85.219002,4,0,0,4,180.591003,4,1
1274674,0.0,0.0,1,0,0,1,0.0,4,1
1274673,472.459015,144.705994,4,1,1,3,183.046997,4,4
1274672,1.698,0.849,2,0,0,2,1.698,4,2
1274671,2161.106934,77.750504,8,0,0,8,1594.262939,4,3


## XGBoost

In [37]:
XGB = XGBClassifier(n_estimators = 2000, learning_rate = 0.05, use_label_encoder = False, eval_metric = 'logloss',
                   scale_pos_weight = 4)
XGB.fit(x_train, y_train, early_stopping_rounds=5, 
         eval_set=[(x_validate, y_validate)], verbose=False) 

predictions = XGB.predict(x_test)
print('Test Accuracy:', metrics.accuracy_score(y_test, predictions) , 
      'Optimal Estimators:', XGB.best_ntree_limit)

Test Accuracy: 0.9293938333037923 Optimal Estimators: 174


### Confusion Matrix

In [38]:
pd.DataFrame(confusion_matrix(y_test, predictions))

Unnamed: 0,0,1
0,1252775,39546
1,56855,16158


### Creating a dataframe with our predictions

predictions column: equal to 1 if it is a buy session (user purchases something), 0 otherwise

In [39]:
X_TEST = x_test.reset_index().merge(pd.DataFrame(predictions).rename(columns = {0:'predictions'}), left_index = True, how = 'inner',
                          right_index = True)

In [40]:
# select all buy sessions

X_TEST[X_TEST['predictions'] == 1]

Unnamed: 0,session_id,total_time,median_time_clic,nb_clic,nb_2clic,nb_3clic,unique_item_count,max_time_on_item,month,top_item_count,predictions
20,6191541,2242.589111,102.387001,13,2,2,11,643.125977,7,13,1
32,6191672,4093.089111,44.681999,11,4,4,7,3309.589111,7,11,1
49,6191699,2221.966064,71.151001,12,3,3,9,1113.128052,7,12,1
51,6191662,7319.076172,46.391998,13,1,1,7,3498.378906,7,12,1
105,6191384,1927.862061,61.725498,26,0,0,26,286.098999,7,24,1
...,...,...,...,...,...,...,...,...,...,...,...
1365170,7556018,1074.215942,39.229000,15,1,1,14,290.481995,8,14,1
1365178,7556008,1685.748047,48.247002,13,4,4,6,989.705994,8,10,1
1365245,7555923,3713.665039,85.455498,14,3,3,9,2362.972900,8,8,1
1365247,7555921,1967.156982,107.564003,13,1,1,9,506.709991,8,12,1


## 2. Second Classifier - Given a buy session, what items will they purchase?

## 2.1. Creation of training set - Item-specific Features

### Category Features

A user which views a lot of items in a session of the same category could be using targeted browsing (looking for something in particular) and more likely to buy something.

In [41]:
# we have some 0s as integers and strings in the category column but they must be the same

def correct_zeroes(row):
    
    if row == '0':
        return int(row)
    else:
        return row
        
click.category = click.category.apply(lambda x: correct_zeroes(x))

In [42]:
cat_list = 'S 0 1 2 3 4 5 6 7 8 9 10 11 12'.split()
for c in cat_list:
    click[c] = np.nan

In [43]:
# creating dummy variables for each category, equal to 1 if item falls into said category, 0 otherwise

click['0'] = click.category.apply(lambda x: 1 if x == 0 else 0)
click['1'] = click.category.apply(lambda x: 1 if x == 1 else 0)
click['2'] = click.category.apply(lambda x: 1 if x == 2 else 0)
click['3'] = click.category.apply(lambda x: 1 if x == 3 else 0)
click['4'] = click.category.apply(lambda x: 1 if x == 4 else 0)
click['5'] = click.category.apply(lambda x: 1 if x == 5 else 0)
click['6'] = click.category.apply(lambda x: 1 if x == 6 else 0)
click['7'] = click.category.apply(lambda x: 1 if x == 7 else 0)
click['8'] = click.category.apply(lambda x: 1 if x == 8 else 0)
click['9'] = click.category.apply(lambda x: 1 if x == 9 else 0)
click['10'] = click.category.apply(lambda x: 1 if x == 10 else 0)
click['11'] = click.category.apply(lambda x: 1 if x == 11 else 0)
click['12'] = click.category.apply(lambda x: 1 if x == 12 else 0)
click['S'] = click.category.apply(lambda x: 1 if x == 'S' else 0)

In [44]:
# add new category features to first matrix and sum

matrix_q2 = matrix.copy()
matrix_q2 = matrix_q2.merge(click.groupby('session_id').agg({'0':'sum','1':'sum','2':'sum','3':'sum','4':'sum',
                                                       '5':'sum','6':'sum','7':'sum',
                                                       '8':'sum','9':'sum','10':'sum','11':'sum','12':'sum','S':'sum'}),
                                                        on = 'session_id', how = 'inner')

In [45]:
pd.set_option('display.max_columns', None)

In [46]:
# item list by session

matrix_q2 = matrix_q2.merge(pd.DataFrame(click.groupby(['session_id'])['item_id'].apply(list)), how = 'inner', on = 'session_id')
matrix_q2 = matrix_q2.rename(columns = {'item_id': 'item_list'})

In [47]:
click_q2 = click.copy()
click_q2 = click_q2[['session_id','item_id','top_item', 'category']].drop_duplicates().merge(click_q2.groupby(['session_id', 'item_id']).agg(click_count = ('time','count'),
                                                click_time = ('time_diff_clic', 'sum')), left_on = ['session_id', 'item_id'],
                                                right_on = ['session_id', 'item_id'], how = 'inner')

### Ratio time and last item features

 - ratio time: amount of time spent on item compared to total session time
 - last item: whether item was last that appeared in the session

In [48]:
Q2_MATRIX = click_q2.merge(matrix_q2['total_time median_time_clic nb_clic nb_2clic nb_3clic day month unique_item_count max_time_on_item top_item_count 0 1 2 3 4 5 6 7 8 9 10 11 12 S item_list'.split()],
              on = 'session_id', how = 'inner')

# ratio time feature

Q2_MATRIX['ratio_time'] = Q2_MATRIX['click_time']/Q2_MATRIX['total_time']

# we have total time 0 when 1 click, creates NA values that should be 0

Q2_MATRIX['ratio_time']=Q2_MATRIX['ratio_time'].fillna(0)

# last item feature

Q2_MATRIX['last_item'] = Q2_MATRIX.item_list.apply(lambda x: x[-1])
Q2_MATRIX=Q2_MATRIX.drop(['day','category','item_list','total_time'], axis = 1) 

In [49]:
Q2_MATRIX.head()

Unnamed: 0,session_id,item_id,top_item,click_count,click_time,median_time_clic,nb_clic,nb_2clic,nb_3clic,month,unique_item_count,max_time_on_item,top_item_count,0,1,2,3,4,5,6,7,8,9,10,11,12,S,ratio_time,last_item
0,1,214536502,1,1,180.591,85.219002,4,0,0,4,4,180.591003,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0.514462,214577561
1,1,214536500,0,1,37.13,85.219002,4,0,0,4,4,180.591003,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0.105775,214577561
2,1,214536506,0,1,133.308,85.219002,4,0,0,4,4,180.591003,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0.379764,214577561
3,1,214577561,0,1,0.0,85.219002,4,0,0,4,4,180.591003,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,214577561
4,2,214662742,0,2,119.832,60.400501,6,1,1,4,5,118.641998,1,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0.333538,214551617


## Custom Distances for kNN Regressor

### Jaccard Distance and Discrete Metric

In [50]:
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

def dicrete_metric(item1,item2):
    if item1==item2:
        return 0
    return 1

### Custom distance metrics for each feature 

In [51]:
def custom_distance(x1, x2, weight=[1]*31):
    
    #distance between 2 rows of Q2_MATRIX
    
    distance=0
    
    #session_id 
    #distance+=weight[0] * dicrete_metric(x1.session_id,x2.session_id)
    distance+=weight[0] * dicrete_metric(x1[0],x2[0])
    
    #item_id 
    distance+=weight[1] * dicrete_metric(x1[1],x2[1])
    
    #top_item
    distance+=weight[2] * dicrete_metric(x1[2],x2[2])

    #category
    #distance+=weight[3] * dicrete_metric(x1.category,x2.category)

    #click_count
    distance+=weight[4] * abs(x1[3]-x2[3])

    #click_time 
    distance+=weight[5] * abs(x1[4]-x2[4])

    #median_time_clic 
    distance+=weight[6] * abs(x1[5]-x2[5])

    #nb_clic
    distance+=weight[7] * abs(x1[6]-x2[6])

    #nb_2clic 
    distance+=weight[8] * abs(x1[7] -x2[7])

    #nb_3clic 
    distance+=weight[9] * abs(x1[8]-x2[8])

    #month 
    distance+=weight[10] * abs(x1[9]-x2[9])

    #unique_item_count 
    distance+=weight[11] * abs(x1[10]-x2[10])

    #max_time_on_item 
    distance+=weight[12] * abs(x1[11]-x2[11])

    #top_item_count 
    distance+=weight[13] * abs(x1[12]-x2[12])

    #0 1 2 3 4 5 6 7 8 9 10 11 12
    #category_list=[0,1,2,3,4,5,6,7,8,9,10,11,12,'S'] 
    #for i,c in enumerate(category_list):
    for i in range (13,27):
        distance+=weight[i] * abs(x1[i]-x2[i])

    #item_list
    #distance+=weight[28] * (1-jaccard(x1.item_list,x2.item_list)) #jaccard distance is 1-jaccard score

    #ratio_time 
    distance+=weight[29] * abs(x1[27]-x2[27])

    #last_item
    distance+=weight[30] * dicrete_metric(x1[28],x2[28])

    return distance/30

## Data split - train and test sets for kNN

In [52]:
x_train_knn=Q2_MATRIX[(Q2_MATRIX.session_id.isin(x_train.index)) &(Q2_MATRIX.session_id.isin(buy.session_id))]

y_train_knn=x_train_knn[['session_id','item_id']]
temp=buy[buy.session_id.isin(x_train.index)][['session_id','item_id']].drop_duplicates()
temp['bought']=1
y_train_knn=y_train_knn.merge(temp, on=['session_id','item_id'], how='left')
y_train_knn['bought']=[0 if i!=1.0 else 1 for i in y_train_knn.bought]

x_test_knn=Q2_MATRIX[Q2_MATRIX.session_id.isin(X_TEST[X_TEST['predictions'] == 1].session_id)]

In [53]:
x_test_knn[x_test_knn.nb_clic>1]

Unnamed: 0,session_id,item_id,top_item,click_count,click_time,median_time_clic,nb_clic,nb_2clic,nb_3clic,month,unique_item_count,max_time_on_item,top_item_count,0,1,2,3,4,5,6,7,8,9,10,11,12,S,ratio_time,last_item
13885330,6121933,214850715,1,4,1049.592,43.688000,4,0,0,7,1,962.216003,4,0,0,0,0,0,0,0,0,0,0,0,0,0,4,1.000000,214850715
13885331,6121922,214716984,1,1,89.317,91.875000,17,2,2,7,11,1912.878052,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.020680,214716982
13885332,6121922,214716965,1,3,385.290,91.875000,17,2,2,7,11,1912.878052,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.089207,214716982
13885333,6121922,214716999,1,1,302.684,91.875000,17,2,2,7,11,1912.878052,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.070081,214716982
13885334,6121922,214748327,1,1,68.237,91.875000,17,2,2,7,11,1912.878052,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.015799,214716982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22593346,9737848,214853105,1,1,42.441,36.073502,14,2,2,8,7,334.358002,14,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0.043133,214853094
22593347,9737848,214853220,1,1,334.358,36.073502,14,2,2,8,7,334.358002,14,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0.339813,214853094
22593348,9737848,214852978,1,2,131.056,36.073502,14,2,2,8,7,334.358002,14,0,0,0,0,0,0,0,0,0,0,0,0,0,14,0.133194,214853094
22593358,9737843,214852987,1,2,469.935,234.967499,4,2,2,8,2,977.674988,4,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0.324628,214853850


## 2.3. Model Training

## kNN Regressor

The kNN model is run without the custom distance (due to computational errors)

In [54]:
# k nearest kneighbors with own distance

col=list(x_test_knn.columns)
col.remove('session_id')
col.remove('item_id')
col.remove('last_item')
k = 10

# kNNreg = KNeighborsRegressor(n_neighbors=k, weights='distance', metric=custom_distance)

kNNreg = KNeighborsRegressor(n_neighbors=k)
kNNreg.fit(x_train_knn[col], y_train_knn.bought)

KNeighborsRegressor(n_neighbors=10)

In [55]:
# prediction using the test of Q1

pred=kNNreg.predict(x_test_knn[col])

## Test sessions and items bought

In [56]:
knn_res=x_test_knn

#knn_res['proba']=pred

knn_res['prediction']=[1 if p>=0.5 else 0 for p in pred]
df=knn_res[knn_res.prediction==1].groupby(['session_id'])['item_id'].apply(set)
df=df.reset_index()
df=df.rename(columns={'item_id':'buy_item'})
df.head()

Unnamed: 0,session_id,buy_item
0,5967731,"{214844953, 214829325, 214839742}"
1,5967737,{214845380}
2,5967767,"{214744796, 214847797}"
3,5967773,"{214557701, 214845958, 214846110, 214821024, 2..."
4,5967794,"{214845577, 214829878}"


In [57]:
test=buy[buy.session_id.isin(x_test_knn.session_id)].groupby(['session_id'])['item_id'].apply(set)
test=pd.DataFrame(test)
test=test.reset_index()
test=test.rename(columns={'item_id':'buy_item'})
test['buy']=True
test=test.merge(x_test_knn.session_id.drop_duplicates(), on='session_id', how='right')
test.buy=test.buy.fillna(False)
test.buy_item=test.buy_item.fillna(0)
test.head()

Unnamed: 0,session_id,buy_item,buy
0,6121933,0,False
1,6121922,0,False
2,6121969,0,False
3,6121823,{214712237},True
4,6121749,0,False


### Score Function

In the function below: 

- df denotes the matrix of the prediction based on the test set of items bought, it has columns 'session_id' and 'buy_item' which has the set of item we predicted bought
- test is the matrix of the test set, it has columns 'session_id', 'buy' a boolean that indicate if any item was bought and 'buy_item' which has the set of item that were bought (actual)

In [58]:
def score(df, test):
    
    score=0
    
    # first need to compute Sb/S, on the test set
    
    sb_s=test[test==True].buy.count()/len(test.buy)
    for s in df.session_id:
        if test[test.session_id==s].buy.values[0]==True: 
            
            # inter = count(df[df.session_id==s].buy_item & test[test.session_id==s].buy_item)
            # union = count(df[df.session_id==s].buy_item | test[test.session_id==s].buy_item)
            
            item1=test[test.session_id==s].buy_item.values[0]
            if item1==0:
                item1=set()
            item2=df[df.session_id==s].buy_item.values[0]
            if item2==0:
                item2=set()
            jac=jaccard(item1,item2)
            score+=sb_s+jac
            
        else:
            score-=sb_s
    return score

In [59]:
score(df, test)

166.838565562067

## Appendix: Markov Chain Modelling

**Warning: takes a long time to run**

In [60]:
#data wrangling library
import pandas as pd
import numpy as np
from collections import Counter

from markovclick.models import MarkovClickstream
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

ModuleNotFoundError: No module named 'markovclick'

In [None]:
## For category nbr and then predict buy on item nbr
#takes 3-5 min to run FYI

#dividing the set into training / test based on sessions
sessions=[i for i in click.session_id.unique()]
train,test=train_test_split(sessions,test_size=0.20,shuffle=True,random_state=1)

#preparations : 
#classifier 1
dict_cat={}
for a, b in enumerate(click.category.unique()):
    dict_cat[b] = "P"+str(a)

click['page_cl1']=click['category'].map(dict_cat)
buy['page_cl1']="P"+str(len(click.category.unique())) #for the first question, this will be the page 'Bought'

#classifier 2
# for the training, the data need to result in buy. For the test, not necessary.
click_train_cl2=click[(click.session_id.isin(buy.session_id)) & (click.session_id.isin(train))].drop_duplicates()
click_test_cl2=click[click.session_id.isin(test)].drop_duplicates()

dict_item_b={}
dict_item={}
l=100000
for a, b in enumerate(set(click_train_cl2.item_id)|set(click_test_cl2.item_id)):
    dict_item[b] = "P"+str(a)
    dict_item_b[b] = "P"+str(l+a) 

click_train_cl2['page_cl2']=click_train_cl2['item_id'].map(dict_item)
click_test_cl2['page_cl2']=click_test_cl2['item_id'].map(dict_item)
buy['page_cl2']=buy['item_id'].map(dict_item_b)

## Classifier 1
#for cat, predict if buy or not
columns=['session_id','time','page_cl1']
c=click[columns]
b=buy[columns]
click_buy=c.append(b, ignore_index=True)
click_buy=click_buy.sort_values(by=['session_id','time'],ascending=True)

#preparing the training set for classifier 1
click_buy_train=click_buy[click_buy.session_id.isin(train)]
train_cl1=list(click_buy_train.groupby('session_id')['page_cl1'].apply(list))

#preparing the test set for classifier 1, into X and Y, so keeping click and buy separate
click_test=click[click.session_id.isin(test)]
click_test=click_test.sort_values(by=['session_id','time'],ascending=True)
buy_test=buy[buy.session_id.isin(test)]
buy_test=buy_test.sort_values(by=['session_id','time'],ascending=True)

x_test_cl1=click_test.groupby('session_id')['page_cl1'].apply(list)
y_test_cl1=buy_test.groupby('session_id')['page_cl1'].apply(list) 

## Classifier 2
#get the subset of data where a purchase was made
columns=['session_id','time','page_cl2']
c=click_train_cl2[columns]
b=buy[buy.session_id.isin(train)][columns]
click_buy_train_cl2=c.append(b, ignore_index=True)
click_buy_train_cl2=click_buy_train_cl2.sort_values(by=['session_id','time'],ascending=True)

#preparing the training test (same as classifier 1, but only when there was a purchase)
train_cl2=list(click_buy_train_cl2.groupby('session_id')['page_cl2'].apply(list))

#test set for classifier 2 will depend on results of prediction of classifier 1
# will be based on click_tesr_cl2

In [None]:
m = MarkovClickstream(train_cl1)
m2 = MarkovClickstream(train_cl2)

In [None]:
threshold=0.5
pred=pd.DataFrame(x_test_cl1.index,columns=['session_id'])
pred['pred_cl1']=False
pred['actual_cl1']=False

for i,s in enumerate(pred.session_id): 
    l=x_test_cl1[s]
    try:
        p=m.calc_prob_to_page(l, verbose=False)
        if p>=threshold: #then predict buy
            pred.loc[i,'pred_cl1']=True
        if s in list(y_test_cl1.index): #then something was bought
            pred.loc[i,'actual_cl1']=True
    except:
        print(i)
    

In [None]:
accuracy_score(pred.actual_cl1, pred.pred_cl1)

In [None]:
click_test_cl2=click_test_cl2[click_test_cl2.session_id.isin(pred[pred.pred_cl1==True].session_id)]
click_test_cl2=click_test_cl2.sort_values(by=['session_id','time'],ascending=True)

buy_test_cl2=buy_test[buy_test.session_id.isin(pred[pred.pred_cl1==True].session_id)]

x_test_cl2=click_test_cl2.groupby('session_id')['page_cl2'].apply(list)
y_test_cl2=buy_test_cl2.groupby('session_id')['page_cl2'].apply(list) 


In [None]:
l=100000
pred=pred.merge(y_test_cl2, left_on='session_id',right_index=True,how='left')
pred=pred.rename(columns={"page_cl2": "actual"})

sol_file=pd.DataFrame(pred[pred.pred_cl1==True].session_id,columns=['session_id'])
prediction=[]

for s in sol_file.session_id:
    x=x_test_cl2[s]
    # for all the items looked at, knowing we think they will be bought
    proba=[]
    for j in set(x):
        #print(j)
        b=str(l+int(j.replace('P','')))
        try :
            proba.append(m2.calc_prob_to_page(x_test+[b], verbose=False))
            print(1)
        except: #we have a problem that many purchases were not made in the train test so are not 'possible purchases here'
            proba.append(0)
            #print(s,j)
    prediction.append(set([x[k] for k,p in enumerate(proba) if p>=0.5]))

sol_file['bought']=prediction