# Recommender Systems and Choice: A Reproducibility Study

*by Adeline Liem*


## Data Wrangling

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import SnowballStemmer
from scipy import stats
from statsmodels.formula.api import ols
import statsmodels.api as sm
import os

In [2]:
# Read datasets

kilands_df = pd.read_csv('kilands_20201103.csv')
cyberfoto_df = pd.read_csv('cyberfoto_20210121.csv')

In [3]:
kilands_df.shape

(14832, 26)

In [4]:
cyberfoto_df.shape

(37005, 24)

In [5]:
kilands_df.describe()

Unnamed: 0,number of add-to-carts,number of clicks,first click rank,first purchase rank,order value,number of purchases,number of searches in current session,search session length (seconds),number of products displayed to user,number of sessions,whole session length,time to add-to-cart (seconds),time to first click (seconds),time to purchase
count,14832.0,14832.0,7422.0,249.0,249.0,14832.0,14832.0,14832.0,14832.0,14832.0,14832.0,657.0,7422.0,249.0
mean,0.050499,0.98247,15.801536,12.88755,1638.630602,0.018406,2.158306,151.272923,42.654396,4.500202,514.805218,266.042618,36.330369,471.072289
std,0.258518,1.763807,40.195333,34.877369,2144.580567,0.146424,1.798311,1828.612696,66.212976,13.663486,6762.696042,289.368074,79.857679,360.449835
min,0.0,0.0,0.0,0.0,39.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,1.0,40.0
25%,0.0,0.0,1.0,0.0,409.5,0.0,1.0,0.0,5.0,1.0,18.0,71.0,8.0,195.0
50%,0.0,1.0,4.0,1.0,985.0,0.0,2.0,20.0,24.0,1.0,96.0,157.0,17.0,369.0
75%,0.0,1.0,15.0,7.0,2090.0,0.0,3.0,92.0,48.0,2.0,344.0,338.0,36.0,655.0
max,6.0,50.0,1634.0,321.0,16490.0,3.0,14.0,155064.0,1416.0,97.0,336633.0,1747.0,1507.0,1777.0


In [6]:
cyberfoto_df.describe()

Unnamed: 0,slot,number of add-to-carts,number of clicks,first click rank,first purchase rank,order value,number of purchases,number of searches in current session,search session length (seconds),number of products displayed to user,number of sessions,whole session length,time to add-to-cart (seconds),time to first click (seconds),time to purchase
count,0.0,37005.0,37005.0,13999.0,323.0,354.0,37005.0,37005.0,37005.0,37005.0,37005.0,37005.0,0.0,15498.0,354.0
mean,,0.0,0.598379,3.747339,3.752322,3093.827684,0.010026,2.389542,42.753844,14.566059,3.152709,534.569734,,47.214737,501.254237
std,,0.0,0.976304,5.64446,5.44144,5348.201642,0.105169,2.208083,150.492475,9.440833,3.931055,7095.224791,,123.183844,353.054687
min,,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,,2.0,27.0
25%,,0.0,0.0,0.0,0.0,391.25,0.0,1.0,0.0,5.0,1.0,6.0,,9.0,240.25
50%,,0.0,0.0,1.0,1.0,964.5,0.0,2.0,0.0,16.0,2.0,53.0,,19.0,407.5
75%,,0.0,1.0,5.0,5.0,2990.0,0.0,3.0,18.0,24.0,4.0,305.0,,40.0,654.25
max,,0.0,18.0,85.0,40.0,33079.0,3.0,36.0,1789.0,101.0,37.0,355827.0,,1756.0,1742.0


In [7]:
# Removing outliers

# Defining bounds for exclusion: search session length

upper_bound_length_kilands = kilands_df['search session length (seconds)'].mean() + \
                             2.5 * kilands_df['search session length (seconds)'].std()
print(upper_bound_length_kilands)

# Count the number of observations with search session length exceeding the upper bound
num_outliers_search = len(kilands_df[kilands_df['search session length (seconds)'] > upper_bound_length_kilands])
print(num_outliers_search)

# Creating new dataset with search session outliers removed
kilands_no_out = kilands_df[kilands_df['search session length (seconds)'] <= upper_bound_length_kilands]

4722.8046632183505
27


In [8]:
# Define bounds for exclusion: clicks
upper_bound_clicks_kilands = kilands_no_out['number of clicks'].mean() + \
                             2.5 * kilands_no_out['number of clicks'].std()
print(upper_bound_clicks_kilands)

# Count the number of observations with clicks exceeding the upper bound
num_outliers_clicks = len(kilands_no_out[kilands_no_out['number of clicks'] > upper_bound_clicks_kilands])
print(num_outliers_clicks)

5.389793656358605
374


In [9]:
kilands_no_out.shape

(14805, 26)

In [10]:
# Creating new dataset with clicks outliers removed
kilands_no_out1 = kilands_no_out[kilands_no_out['number of clicks'] <= upper_bound_clicks_kilands]

In [11]:
kilands_no_out1.shape

(14431, 26)

In [12]:
# Recode High Attractiveness (HA) and Low Attractiveness (LA)
kilands_no_out1.loc[kilands_no_out1['segment'] == "202004_gandalf_rel", 'segment'] = "HA"
kilands_no_out1.loc[kilands_no_out1['segment'] == "202004_kilands_shuffle7", 'segment'] = "LA"

# Keep only certain cols: search ID (1), user (2), segment (4), platform (7), 15, 10, 11, 12, 13,
##, 14, 17, 20, 21, 22, 23, 25, 26, 18
columns_to_keep = [
    kilands_no_out.columns[i - 1] for i in 
    [1, 2, 4, 7, 15, 10, 11, 17, 12, 13, 14, 20, 21, 23, 22, 25, 26, 18]
]
kilands_no_out1 = kilands_no_out[columns_to_keep]

In [13]:
kilands_no_out1.shape

(14805, 18)

In [14]:
# Recode attention_click
kilands_no_out1['attention_click'] = kilands_no_out1['first click rank'].apply(
    lambda x: "Top" if x <= 6 else ("Bottom" if x > 6 else x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kilands_no_out1['attention_click'] = kilands_no_out1['first click rank'].apply(


In [15]:
kilands_no_out1.shape

(14805, 19)

In [16]:
## Recode attention_purchase
kilands_no_out1['attention_purchase'] = kilands_no_out1['first purchase rank'].apply(
    lambda x: "Top" if x <= 6 else ("Bottom" if x > 6 else x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kilands_no_out1['attention_purchase'] = kilands_no_out1['first purchase rank'].apply(


In [17]:
kilands_no_out1

Unnamed: 0,search id,user,segment,platform,number of purchases,number of add-to-carts,number of clicks,search session length (seconds),first click rank,first purchase rank,order value,whole session length,time to add-to-cart (seconds),time to purchase,time to first click (seconds),click positions,purchase positions,number of products displayed to user,attention_click,attention_purchase
0,u:004f5XJdPvnue61i/s:IFEonOw2XywDPmFc/q:q=sisa...,004f5XJdPvnue61i,202004_kilands_shuffle7,mobile,0,0,1,32,23.0,,,32,,,32.0,23,,24,Bottom,
1,u:00U825g1ZwBVX8UW/s:tQCDykE6URsuP1t7/q:q=bilm...,00U825g1ZwBVX8UW,202004_kilands_shuffle7,mobile,0,0,2,73,0.0,,,73,,,13.0,0;0,,8,Top,
2,u:01U04AG9RtpnKveG/s:r0I5bjEYaljUF8Ff/q:q=tras...,01U04AG9RtpnKveG,202004_kilands_shuffle7,desktop,0,0,1,17,0.0,,,17,,,17.0,0,,1,Top,
3,u:01WAyTwhvMkQVpSv/s:HIrTC5hOFuzum34m/q:q=matt...,01WAyTwhvMkQVpSv,202004_gandalf_rel,mobile,0,0,1,12,8.0,,,12,,,12.0,8,,24,Bottom,
4,u:022YZKPnqb7KERVw/s:53Z43z9957vgQKIY/q:q=plas...,022YZKPnqb7KERVw,202004_gandalf_rel,mobile,0,0,1,39,53.0,,,39,,,39.0,53,,72,Bottom,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14827,u:zxw5bXtZLcAzBAPs/s:SVKHkRkMD5Z5dKMh/q:q=rund...,zxw5bXtZLcAzBAPs,202004_gandalf_rel,tablet,0,0,0,61,,,,71,,,,,,120,,
14828,u:zxw5bXtZLcAzBAPs/s:SVKHkRkMD5Z5dKMh/q:q=matt...,zxw5bXtZLcAzBAPs,202004_gandalf_rel,tablet,0,0,0,4,,,,71,,,,,,48,,
14829,u:zyfLPL38d2wolWFK/s:iKyf7aO0Ta2asuqP/q:q=274 ...,zyfLPL38d2wolWFK,202004_gandalf_rel,desktop,0,0,1,33,0.0,,,33,,,33.0,0,,1,Top,
14830,u:zz4XT5jMM6FMGos0/s:ClDePgytzZ4L4MGK/q:q=rio/...,zz4XT5jMM6FMGos0,202004_kilands_shuffle7,tablet,0,1,1,18,2.0,,,33,5.0,,5.0,2,,4,Top,


In [18]:
# Replace semicolons in 'click positions' and 'purchase positions' with ' '
kilands_no_out1['clicks_position'] = kilands_no_out1['click positions'].str.replace(";", " ")
kilands_no_out1['purchase_position'] = kilands_no_out1['purchase positions'].str.replace(";", " ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kilands_no_out1['clicks_position'] = kilands_no_out1['click positions'].str.replace(";", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kilands_no_out1['purchase_position'] = kilands_no_out1['purchase positions'].str.replace(";", " ")


In [19]:
kilands_no_out1.shape

(14805, 22)

In [20]:
# Recode top_clicks and bottom_clicks
kilands_no_out1['top_clicks'] = kilands_no_out1['clicks_position'].apply(
    lambda x: sum(int(pos) <= 6 for pos in x.split() if pos.isdigit()) if pd.notna(x) else 0
)
kilands_no_out1['bottom_clicks'] = kilands_no_out1['clicks_position'].apply(
    lambda x: sum(int(pos) > 6 for pos in x.split() if pos.isdigit()) if pd.notna(x) else 0
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kilands_no_out1['top_clicks'] = kilands_no_out1['clicks_position'].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kilands_no_out1['bottom_clicks'] = kilands_no_out1['clicks_position'].apply(


In [21]:
kilands_no_out1.shape

(14805, 24)

In [22]:
# Recode top_purch and bottom_purch
kilands_no_out1['top_purch'] = kilands_no_out1['purchase_position'].apply(
    lambda x: sum(int(pos) <= 6 for pos in x.split() if pos.isdigit()) if pd.notna(x) else 0
)

kilands_no_out1['bottom_purch'] = kilands_no_out1['purchase_position'].apply(
    lambda x: sum(int(pos) > 6 for pos in x.split() if pos.isdigit()) if pd.notna(x) else 0
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kilands_no_out1['top_purch'] = kilands_no_out1['purchase_position'].apply(


In [23]:
kilands_no_out1.shape

(14805, 26)

In [24]:
# Convert `attention_click` and `attention_purchase` to category
kilands_no_out1['attention_click'] = kilands_no_out1['attention_click'].astype('category')
print(kilands_no_out1['attention_click'].describe())
print("\n")

kilands_no_out1['attention_purchase'] = kilands_no_out1['attention_purchase'].astype('category')
print(kilands_no_out1['attention_purchase'].describe())
print("\n")

# Value counts for attention_click
print(kilands_no_out1['attention_click'].value_counts())

count     7411
unique       2
top        Top
freq      4503
Name: attention_click, dtype: object


count     249
unique      2
top       Top
freq      184
Name: attention_purchase, dtype: object


attention_click
Top       4503
Bottom    2908
Name: count, dtype: int64


In [25]:
kilands_no_out1

Unnamed: 0,search id,user,segment,platform,number of purchases,number of add-to-carts,number of clicks,search session length (seconds),first click rank,first purchase rank,...,purchase positions,number of products displayed to user,attention_click,attention_purchase,clicks_position,purchase_position,top_clicks,bottom_clicks,top_purch,bottom_purch
0,u:004f5XJdPvnue61i/s:IFEonOw2XywDPmFc/q:q=sisa...,004f5XJdPvnue61i,202004_kilands_shuffle7,mobile,0,0,1,32,23.0,,...,,24,Bottom,,23,,0,1,0,0
1,u:00U825g1ZwBVX8UW/s:tQCDykE6URsuP1t7/q:q=bilm...,00U825g1ZwBVX8UW,202004_kilands_shuffle7,mobile,0,0,2,73,0.0,,...,,8,Top,,0 0,,2,0,0,0
2,u:01U04AG9RtpnKveG/s:r0I5bjEYaljUF8Ff/q:q=tras...,01U04AG9RtpnKveG,202004_kilands_shuffle7,desktop,0,0,1,17,0.0,,...,,1,Top,,0,,1,0,0,0
3,u:01WAyTwhvMkQVpSv/s:HIrTC5hOFuzum34m/q:q=matt...,01WAyTwhvMkQVpSv,202004_gandalf_rel,mobile,0,0,1,12,8.0,,...,,24,Bottom,,8,,0,1,0,0
4,u:022YZKPnqb7KERVw/s:53Z43z9957vgQKIY/q:q=plas...,022YZKPnqb7KERVw,202004_gandalf_rel,mobile,0,0,1,39,53.0,,...,,72,Bottom,,53,,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14827,u:zxw5bXtZLcAzBAPs/s:SVKHkRkMD5Z5dKMh/q:q=rund...,zxw5bXtZLcAzBAPs,202004_gandalf_rel,tablet,0,0,0,61,,,...,,120,,,,,0,0,0,0
14828,u:zxw5bXtZLcAzBAPs/s:SVKHkRkMD5Z5dKMh/q:q=matt...,zxw5bXtZLcAzBAPs,202004_gandalf_rel,tablet,0,0,0,4,,,...,,48,,,,,0,0,0,0
14829,u:zyfLPL38d2wolWFK/s:iKyf7aO0Ta2asuqP/q:q=274 ...,zyfLPL38d2wolWFK,202004_gandalf_rel,desktop,0,0,1,33,0.0,,...,,1,Top,,0,,1,0,0,0
14830,u:zz4XT5jMM6FMGos0/s:ClDePgytzZ4L4MGK/q:q=rio/...,zz4XT5jMM6FMGos0,202004_kilands_shuffle7,tablet,0,1,1,18,2.0,,...,,4,Top,,2,,1,0,0,0


In [26]:
# Create dummy binary variables for add-to-carts and purchases
kilands_no_out1['carts'] = kilands_no_out1['number of add-to-carts'].apply(
    lambda x: 1 if x > 0 else 0
)
kilands_no_out1['purch'] = kilands_no_out1['number of purchases'].apply(
    lambda x: 1 if x > 0 else 0
)

In [27]:
kilands_no_out1.shape

(14805, 28)

In [28]:
#### HYPOTHESIS TESTING ####

# * Hypotheses a: carts (Model 2) and b: purchase (Model 1) are now modeled as `Multilevel Logistic Regression`, i.e., logistic regression with random intercept per participant.
# * Hypothesis c: Products viewed (Model 3) is now modeled as `Multilevel Negative Binomial Regression`, i.e, negative binomial regression (count-model) with random intercept per participant.
# * Hypothesis d: Session Length (Model 4) is now modeled as `Multilevel Linear Regression`, i.e, just a regular linear mixed model with random intercept per participant.

#### Hypothesis a: carts ####


## Project Progress Check 3

For this project progress check, I am 99% finished cleaning one out of two datasets and will proceed with the regression tables soon. I struggled for awhile because I couldn't figure out why the shape of my datasets were different than the original paper's, only to realize that I think the author made a small mistake that caused them to remove outliers from the dataset, then apply a function that caused the dataset to revert back. After I ask the staff and receive feedback about what to do about this, I will be done with the first dataset. In the meantime, I can continue work by working on cleaning the second dataset. 