# Recommender Systems and Choice: A Reproducibility Study

*by Adeline Liem*


## Data Wrangling

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import SnowballStemmer
from scipy import stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
import os

In [2]:
# Load the raw search-log exports for both retailers.
# File names are relative to the notebook's working directory.
kilands_df, cyberfoto_df = (
    pd.read_csv(csv_name)
    for csv_name in ('kilands_20201103.csv', 'cyberfoto_20210121.csv')
)

In [3]:
# Sanity check: (rows, columns) of the Kilands data as loaded
kilands_df.shape

(14832, 26)

In [4]:
# Sanity check: (rows, columns) of the Cyberfoto data as loaded
cyberfoto_df.shape

(37005, 24)

In [5]:
# Summary statistics of the numeric Kilands columns; the mean/std shown here
# are the quantities the outlier bounds below are derived from
kilands_df.describe()

Unnamed: 0,number of add-to-carts,number of clicks,first click rank,first purchase rank,order value,number of purchases,number of searches in current session,search session length (seconds),number of products displayed to user,number of sessions,whole session length,time to add-to-cart (seconds),time to first click (seconds),time to purchase
count,14832.0,14832.0,7422.0,249.0,249.0,14832.0,14832.0,14832.0,14832.0,14832.0,14832.0,657.0,7422.0,249.0
mean,0.050499,0.98247,15.801536,12.88755,1638.630602,0.018406,2.158306,151.272923,42.654396,4.500202,514.805218,266.042618,36.330369,471.072289
std,0.258518,1.763807,40.195333,34.877369,2144.580567,0.146424,1.798311,1828.612696,66.212976,13.663486,6762.696042,289.368074,79.857679,360.449835
min,0.0,0.0,0.0,0.0,39.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,1.0,40.0
25%,0.0,0.0,1.0,0.0,409.5,0.0,1.0,0.0,5.0,1.0,18.0,71.0,8.0,195.0
50%,0.0,1.0,4.0,1.0,985.0,0.0,2.0,20.0,24.0,1.0,96.0,157.0,17.0,369.0
75%,0.0,1.0,15.0,7.0,2090.0,0.0,3.0,92.0,48.0,2.0,344.0,338.0,36.0,655.0
max,6.0,50.0,1634.0,321.0,16490.0,3.0,14.0,155064.0,1416.0,97.0,336633.0,1747.0,1507.0,1777.0


In [11]:
# Outlier removal, step 1: search session length
#
# A search counts as an outlier when its session length lies more than
# 2.5 standard deviations above the mean of the full Kilands dataset.

kilands_session_length = kilands_df['search session length (seconds)']

# Upper exclusion bound: mean + 2.5 * std
upper_bound_length_kilands = (
    kilands_session_length.mean() + 2.5 * kilands_session_length.std()
)
print(upper_bound_length_kilands)

# How many observations exceed the bound
num_outliers_search = int((kilands_session_length > upper_bound_length_kilands).sum())
print(num_outliers_search)

# New dataset keeping only rows at or below the bound
kilands_no_out = kilands_df.loc[kilands_session_length <= upper_bound_length_kilands]

4722.8046632183505
27


In [14]:
# Outlier removal, step 2: number of clicks
#
# The bound is mean + 2.5 * std of the click counts in the full Kilands
# dataset (same rule as for search session length).
upper_bound_clicks_kilands = kilands_df['number of clicks'].mean() + \
                             2.5 * kilands_df['number of clicks'].std()
print(upper_bound_clicks_kilands)

# Row masks over the raw frame. Building both from kilands_df (rather than
# re-filtering kilands_no_out in place) keeps this cell idempotent on re-run.
session_length_ok = (
    kilands_df['search session length (seconds)'] <= upper_bound_length_kilands
)
clicks_too_high = kilands_df['number of clicks'] > upper_bound_clicks_kilands
clicks_ok = kilands_df['number of clicks'] <= upper_bound_clicks_kilands

# Count the click outliers among the rows that survived the search-session filter
num_outliers_clicks = int((session_length_ok & clicks_too_high).sum())
print(num_outliers_clicks)

# Final dataset with BOTH filters applied. The earlier version filtered
# kilands_df on clicks alone, which silently re-admitted the search-session
# outliers removed in the previous step.
kilands_no_out = kilands_df[session_length_ok & clicks_ok]

5.3919867279927525
374


In [15]:
# Inspect the outlier-filtered Kilands frame (the notebook renders a truncated preview)
kilands_no_out

Unnamed: 0,search id,user,session,segment,query,slot,platform,search start,search end time,number of add-to-carts,...,search session length (seconds),number of products displayed to user,number of sessions,whole session length,time to add-to-cart (seconds),time to first click (seconds),time to purchase,toggled filters,click positions,purchase positions
0,u:004f5XJdPvnue61i/s:IFEonOw2XywDPmFc/q:q=sisa...,004f5XJdPvnue61i,IFEonOw2XywDPmFc,202004_kilands_shuffle7,sisal,,mobile,2020-05-11T13:48:33.391,2020-05-11T13:48:33.391,0,...,32,24,1,32,,32.0,,,23,
1,u:00U825g1ZwBVX8UW/s:tQCDykE6URsuP1t7/q:q=bilm...,00U825g1ZwBVX8UW,tQCDykE6URsuP1t7,202004_kilands_shuffle7,Bilmatta,,mobile,2020-04-26T06:38:01.135,2020-04-26T06:39:13.998,0,...,73,8,1,73,,13.0,,,0;0,
2,u:01U04AG9RtpnKveG/s:r0I5bjEYaljUF8Ff/q:q=tras...,01U04AG9RtpnKveG,r0I5bjEYaljUF8Ff,202004_kilands_shuffle7,trasmatt,,desktop,2020-05-03T21:18:20.127,2020-05-03T21:18:20.127,0,...,17,1,1,17,,17.0,,,0,
3,u:01WAyTwhvMkQVpSv/s:HIrTC5hOFuzum34m/q:q=matt...,01WAyTwhvMkQVpSv,HIrTC5hOFuzum34m,202004_gandalf_rel,runda mattor,,mobile,2020-04-18T15:28:23.350,2020-04-18T15:28:23.350,0,...,12,24,1,12,,12.0,,,8,
4,u:022YZKPnqb7KERVw/s:53Z43z9957vgQKIY/q:q=plas...,022YZKPnqb7KERVw,53Z43z9957vgQKIY,202004_gandalf_rel,plastmattor,,mobile,2020-04-29T23:18:04.324,2020-04-29T23:18:33.885,0,...,39,72,1,39,,39.0,,,53,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14827,u:zxw5bXtZLcAzBAPs/s:SVKHkRkMD5Z5dKMh/q:q=rund...,zxw5bXtZLcAzBAPs,SVKHkRkMD5Z5dKMh,202004_gandalf_rel,Rund,,tablet,2020-05-08T22:48:41.826,2020-05-08T22:49:42.753,0,...,61,120,1,71,,,,,,
14828,u:zxw5bXtZLcAzBAPs/s:SVKHkRkMD5Z5dKMh/q:q=matt...,zxw5bXtZLcAzBAPs,SVKHkRkMD5Z5dKMh,202004_gandalf_rel,rund matta,,tablet,2020-05-08T22:49:49.275,2020-05-08T22:49:52.962,0,...,4,48,1,71,,,,,,
14829,u:zyfLPL38d2wolWFK/s:iKyf7aO0Ta2asuqP/q:q=274 ...,zyfLPL38d2wolWFK,iKyf7aO0Ta2asuqP,202004_gandalf_rel,bristol 274,,desktop,2020-04-19T19:57:44.909,2020-04-19T19:57:44.909,0,...,33,1,1,33,,33.0,,,0,
14830,u:zz4XT5jMM6FMGos0/s:ClDePgytzZ4L4MGK/q:q=rio/...,zz4XT5jMM6FMGos0,ClDePgytzZ4L4MGK,202004_kilands_shuffle7,rio,,tablet,2020-04-29T01:51:45.688,2020-04-29T01:52:03.500,1,...,18,4,1,33,5.0,5.0,,,2,


## Project Progress Check 2
