# INTRODUCTION
In this notebook we will be using the following packages:
  * [pyspark](https://spark.apache.org/docs/latest/api/python/pyspark.html)
  * [pandas](https://pandas.pydata.org/pandas-docs/stable/index.html)
  * [pyarrow](https://arrow.apache.org/docs/python/index.html)

We will be using the following data:
  * dressipi_recsys2022

The dataset is split in 4 csv files:
   * candidate_items : contains the candidate items for each user
   * item_features : contains the features for each item (items can have multiple features)
   * train_purchases : contains the purchases for each user
   * train_sessions : contains the sessions for each user



In [1]:
# Uncomment to install the required packages
# %pip install pyspark
# %pip install pyarrow


## Loading the dataset

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np
import os

# start spark session
os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=2g  pyspark-shell"
# spark = SparkSession.builder.appName("Python Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()
spark = SparkSession \
    .builder \
    .master("local[2]") \
    .appName("AppName") \
    .config('spark.ui.port', '4050')\
    .getOrCreate()

candidate_items, item_features, train_purchases, train_sessions = [None] * 4
datasets = [candidate_items, item_features, train_purchases, train_sessions]

#load all datasets
def load_candidate_items():
    global candidate_items
    candidate_items = spark.read.csv("dressipi_recsys2022/candidate_items.csv", header=True)
    candidate_items = candidate_items.withColumn("item_id", candidate_items["item_id"].cast("int"))

def load_item_features():
    global item_features
    item_features = spark.read.csv("dressipi_recsys2022/item_features.csv", header=True)
    item_features = item_features.withColumn("item_id", item_features["item_id"].cast("int"))
    item_features = item_features.withColumn("feature_category_id", item_features["feature_category_id"].cast("int"))
    item_features = item_features.withColumn("feature_value_id", item_features["feature_value_id"].cast("int"))

def load_train_purchases():
    global train_purchases
    train_purchases = spark.read.csv("dressipi_recsys2022/train_purchases.csv", header=True)
    train_purchases = train_purchases.withColumn("session_id", train_purchases["session_id"].cast("int"))
    train_purchases = train_purchases.withColumn("item_id", train_purchases["item_id"].cast("int"))
    train_purchases = train_purchases.withColumn("date", train_purchases["date"].cast("timestamp"))

def load_train_sessions():
    global train_sessions
    train_sessions = spark.read.csv("dressipi_recsys2022/train_sessions.csv", header=True)
    train_sessions = train_sessions.withColumn("session_id", train_sessions["session_id"].cast("int"))
    train_sessions = train_sessions.withColumn("item_id", train_sessions["item_id"].cast("int"))
    train_sessions = train_sessions.withColumn("date", train_sessions["date"].cast("timestamp"))

def load_datasets():
    global datasets
    load_candidate_items()
    load_item_features()
    load_train_purchases()
    load_train_sessions()
    datasets = [candidate_items, item_features, train_purchases, train_sessions]

load_datasets()


## Quick look at the data

### train_sessions

In [3]:
train_sessions.printSchema()
train_sessions.show(5)

root
 |-- session_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- date: timestamp (nullable = true)

+----------+-------+--------------------+
|session_id|item_id|                date|
+----------+-------+--------------------+
|         3|   9655|2020-12-18 21:25:...|
|         3|   9655|2020-12-18 21:19:...|
|        13|  15654|2020-03-13 19:35:...|
|        18|  18316|2020-08-26 19:18:...|
|        18|   2507|2020-08-26 19:16:...|
+----------+-------+--------------------+
only showing top 5 rows



### train_purchases

In [4]:
train_purchases.show(5)

+----------+-------+--------------------+
|session_id|item_id|                date|
+----------+-------+--------------------+
|         3|  15085|2020-12-18 21:26:...|
|        13|  18626|2020-03-13 19:36:...|
|        18|  24911|2020-08-26 19:20:...|
|        19|  12534|2020-11-02 17:16:...|
|        24|  13226|2020-02-26 18:27:...|
+----------+-------+--------------------+
only showing top 5 rows



### item_features

In [5]:
item_features.show(5)

+-------+-------------------+----------------+
|item_id|feature_category_id|feature_value_id|
+-------+-------------------+----------------+
|      2|                 56|             365|
|      2|                 62|             801|
|      2|                 68|             351|
|      2|                 33|             802|
|      2|                 72|              75|
+-------+-------------------+----------------+
only showing top 5 rows



### candidate_items

In [6]:
candidate_items.show(5)

+-------+
|item_id|
+-------+
|      4|
|      8|
|      9|
|     19|
|     20|
+-------+
only showing top 5 rows



# Part 1 : Pipeline

## Checking for null values

In [7]:
# Are there missing values (na or null) in any of those dataframes ? print the number of missing values
print("candidate_items:", candidate_items.filter(candidate_items["item_id"].isNull()).count())
print("item_features:", item_features.filter(item_features["item_id"].isNull()).count())
print("train_purchases:", train_purchases.filter(train_purchases["item_id"].isNull()).count())
print("train_sessions:", train_sessions.filter(train_sessions["item_id"].isNull()).count())

candidate_items: 0
item_features: 0
train_purchases: 0
train_sessions: 0


## Feature engineering

In [76]:
session_date_rdd = train_sessions.rdd.map(lambda x: (x["session_id"], x["date"])).cache()

1) Month

In [77]:
month = session_date_rdd.mapValues(lambda x: int(x.strftime("%m"))).reduceByKey(min)
print(month.take(5))

[(18, 8), (24, 2), (28, 5), (36, 6), (42, 3)]


2) Season (Meteorological)

In [12]:
def get_season(month):
    if month == 12 or month <= 2: return 0
    elif 2 < month <= 5: return 1
    elif 5 < month <= 8: return 2
    elif 8 < month <= 11: return 3

season = month.mapValues(get_season).reduceByKey(min)
print(season.take(5))

[(18, 2), (24, 0), (28, 1), (36, 2), (42, 1)]


3) Day of month

In [78]:
day_of_month = session_date_rdd.mapValues(lambda x: int(x.strftime("%d"))).reduceByKey(min)
print(day_of_month.take(5))

[(18, 26), (24, 26), (28, 18), (36, 21), (42, 1)]


4) Weekday

In [79]:
weekday = session_date_rdd.mapValues(lambda x: int(x.strftime("%w"))).reduceByKey(min)
print(weekday.take(5))

[(18, 3), (24, 3), (28, 1), (36, 0), (42, 1)]


5) Weekend

In [75]:
weekend = weekday.mapValues(lambda x: int(x in (5, 6)))
print(weekend.take(5))

[(18, 0), (24, 0), (28, 0), (36, 0), (42, 0)]


6) Hour

In [98]:
hour = session_date_rdd.mapValues(lambda x: int(x.strftime("%H"))).reduceByKey(min)
print(hour_period.take(5))

[(18, 19), (24, 17), (28, 12), (36, 10), (42, 15)]


In [103]:
hour.map(lambda x: [int(x[1] == i) for i in range(24)]).reduce(np.add)

array([22561, 10068,  6930,  5703,  6069, 11125, 25801, 43049, 55713,
       60538, 59598, 58602, 58549, 56888, 54420, 55018, 55260, 56836,
       62549, 69150, 67415, 53219, 33136, 11803])

7) Day period

In [106]:
def get_day(x):
    if 6 < x < 12: return 0
    elif 12 < x < 18: return 1
    elif 18 < x < 22: return 2
    else: return 3
    
day_period = hour.mapValues(get_day)
print(day_period.take(5))

[(18, 2), (24, 1), (28, 3), (36, 0), (42, 1)]


8) Night

In [108]:
night = day_period.mapValues(lambda x: int(x == 3))
print(night.take(5))

[(18, 0), (24, 0), (28, 1), (36, 0), (42, 0)]


9) Duration of the session

In [89]:
def get_session_duration(dates):
    dates = list(dates)
    dates.sort()
    return (dates[-1] - dates[0]).total_seconds() if len(dates) >= 2 else 0

duration = session_date_rdd.groupByKey().mapValues(get_session_duration)
print(duration.take(5))

[(18, 163.601), (24, 3703.867), (28, 87.513), (36, 43.074), (42, 120.348)]

10) Average time between consecutive item views

In [81]:
def get_average_time(dates):
    import datetime
    dates = sorted(list(dates))
    avgs = [dates[i+1] - dates[i] for i in range(len(dates)-1)]
    return (sum(avgs, datetime.timedelta())/len(avgs)).total_seconds() if len(avgs) > 0 else 0

average_time = session_date_rdd.groupByKey().mapValues(get_average_time)

print(average_time.take(5))

[(18, 81.8005), (24, 462.983375), (28, 29.171), (36, 43.074), (42, 40.116)]

In [82]:
session_item_rdd = train_sessions.rdd.map(lambda x: (x["session_id"], x["item_id"])).cache()

11) Number of distinct items

In [83]:
distinct_nb = session_item_rdd.groupByKey().mapValues(lambda x: len(set(x)))
print(distinct_nb.take(5))

[(18, 3), (24, 8), (28, 3), (36, 2), (42, 4)]

12) Number of repetitive items

In [84]:
repetitive_nb = session_item_rdd.groupByKey().mapValues(lambda x: len(list(x)) - len(set(x)))
print(repetitive_nb.take(5))

[(18, 0), (24, 1), (28, 1), (36, 0), (42, 0)]

13) Same category

In [16]:
item_features_rdd = item_features.rdd.map(lambda x: (x["item_id"], (x["feature_category_id"], x["feature_value_id"]))).groupByKey().mapValues(lambda x: [(a,b) for a, b in x])
print(item_features_rdd.take(1))

[(2,
  [(56, 365),
   (62, 801),
   (68, 351),
   (33, 802),
   (72, 75),
   (29, 123),
   (16, 38),
   (50, 76),
   (61, 462),
   (53, 6),
   (7, 394),
   (69, 885),
   (47, 123)])]

In [17]:
def get_same_category(x):
    dico = dict()
    for item in x:
        for cat in item:
            if cat in dico:
                dico[cat] += 1
            else:
                dico[cat] = 0

    res = 0
    for val in dico.values():
        if val > 0:
            res += 1

    return res

same_category = item_features_rdd.join(train_sessions.rdd.map(lambda x: (x["item_id"], x["session_id"]))).map(lambda x: (x[1][1], x[1][0])).groupByKey().mapValues(get_same_category)
print(same_category.take(5))

[(809760, 44), (2177144, 35), (2618892, 30), (4350700, 33), (2560368, 59)]

14) Different category

In [18]:
def get_different_category(x):
    dico = dict()
    for item in x:
        for cat in item:
            if cat in dico:
                dico[cat] += 1
            else:
                dico[cat] = 0

    res = 0
    for val in dico.values():
        if val == 0:
            res += 1

    return res

diff_category = item_features_rdd.join(train_sessions.rdd.map(lambda x: (x["item_id"], x["session_id"]))).map(lambda x: (x[1][1], x[1][0])).groupByKey().mapValues(get_different_category)
print(diff_category.take(5))

[(2044264, 47), (118976, 37), (252868, 6), (2082460, 29), (2761440, 30)]

15) Last item

In [19]:
def get_last_item(items):
    return max(list(items), key=lambda i: i[1])[0]

last_item = train_sessions.rdd.map(lambda x: (x["session_id"], (x["item_id"], x["date"]))).groupByKey().mapValues(lambda x: max(x, key=lambda i: i[1])[0])
print(last_item.take(5))

[(496, 26180), (1074, 6107), (1740, 13117), (2010, 1344), (2934, 20666)]

16) Most present category

In [20]:
def get_most_present_category(categories_i):
    categories = [cat for cat_i in categories_i for cat in cat_i]    
    categories.sort()
    
    most_viewed = (None, -1)
    last_viewed = categories[0]
    cnt = 0
    
    for category in categories:
        if last_viewed != category:
            if cnt > most_viewed[1]:
                most_viewed = (last_viewed, cnt)
                cnt = 1
                last_viewed = category
        else:
            cnt += 1
            
    if cnt > most_viewed[1]:
        most_viewed = (last_viewed, cnt)
    
    return most_viewed[0]
    
most_present_category = item_features_rdd.join(train_sessions.rdd.map(lambda x: (x["item_id"], x["session_id"]))).map(lambda x: (x[1][1], x[1][0])).groupByKey().mapValues(get_most_present_category)
print(most_present_category.take(5))

[(1287288, (4, 618)),
 (2642212, (4, 618)),
 (2927608, (3, 793)),
 (2060688, (3, 793)),
 (1518656, (3, 793))]

17) Most viewed item

In [87]:
def get_most_viewed_item(items):
    items = list(items)
    items.sort()
    most_viewed = (None, -1)
    
    last_viewed = items[0]
    cnt = 0
    
    for item in items:
        if last_viewed != item:
            if cnt > most_viewed[1]:
                most_viewed = (last_viewed, cnt)
                cnt = 1
                last_viewed = item
        else:
            cnt += 1
            
    if cnt > most_viewed[1]:
        most_viewed = (last_viewed, cnt)
    
    return most_viewed[0]
    
most_viewed_item = session_item_rdd.groupByKey().mapValues(get_most_viewed_item)
print(most_viewed_item.take(5))

[(18, 2507), (24, 2927), (28, 11529), (36, 25417), (42, 10395)]

18) Length of the session

In [91]:
length = session_item_rdd.groupByKey().mapValues(len)
print(length.take(5))

[(18, 3), (24, 9), (28, 4), (36, 2), (42, 4)]

19) Longest viewed item

In [115]:
def get_longest_item(items):
    items = list(items)
    t = [items[i+1][1] - items[i][1] for i in range(len(items)-1)]
    return items[np.argmax(t)][0] if len(t) > 0 else items[0][0]

longest_item = train_sessions.rdd.map(lambda x: (x["session_id"], (x["item_id"], x["date"]))).\
                    groupByKey().\
                    mapValues(get_longest_item)
print(longest_item.take(5))

[(18, 2507), (24, 2927), (28, 16895), (36, 26536), (42, 20523)]

20) Median number of categories

In [110]:
def get_categories_nb(cat):
    from statistics import median
    nb = [len(c) for c in cat]
    return median(nb)
    
categories_nb = item_features_rdd.join(train_sessions.rdd.map(lambda x: (x["item_id"], x["session_id"]))).map(lambda x: (x[1][1], x[1][0])).groupByKey().mapValues(get_categories_nb)
print(categories_nb.take(5))

[(7588, 24), (9396, 21.0), (28268, 18), (59544, 25.0), (61536, 24)]


21) Hash

In [120]:
hash_items = session_item_rdd.groupByKey().mapValues(lambda x: hash(str(list(x))))
print(hash_items.take(5))

[(18, -4836734901686818560), (24, -5971015408950527618), (28, -6017024854127706481), (36, 8998393808888012635), (42, 6148542230826542357)]


BIG RDD

In [124]:
session_item_id = train_purchases.rdd.map(lambda x: (x["session_id"], x["item_id"]))
# features = [month, season, day_of_month, weekday, weekend, hour, day_period, night, duration, average_time, distinct_nb, repetitive_nb, same_category, diff_category, last_item, most_present_category, most_viewed_item, length, longest_item, categories_nb, hash_items]
# features_name = ["month", "season", "day_of_month", "weekday", "weekend", "hour", "day_period", "night", "duration", "average_time", "distinct_nb", "repetitive_nb", "same_category", "diff_category", "last_item", "most_present_category", "most_viewed_item", "length", "longest_item", "categories_nb", "hash_items"]

features = [month, season, day_of_month, weekday, weekend, hour, day_period, night, duration, average_time, distinct_nb, repetitive_nb, same_category, diff_category, last_item, most_viewed_item, length, longest_item, categories_nb, hash_items]
features_name = ["month", "season", "day_of_month", "weekday", "weekend", "hour", "day_period", "night", "duration", "average_time", "distinct_nb", "repetitive_nb", "same_category", "diff_category", "last_item", "most_viewed_item", "length", "longest_item", "categories_nb", "hash_items"]


def get_BIG_RDD():
    temp = session_item_id
    for feature in features:
        temp = temp.join(feature).mapValues(lambda x: tuple(list(x[0])+[x[1]]) if isinstance(x[0], tuple) else x)
    return temp

BIG_RDD = get_BIG_RDD().cache()

In [125]:
print(BIG_RDD.take(1))

[(966000, (24101, 5, 1, 14, 5, 1, 7, 0, 0, 136.702, 68.351, 3, 0, 15, 21, 23875, 1808, 3, 18876, 19, 7261787585238309112))]


# Part 2 : Feature selection


## Ranking algorithm

In [26]:
def pearson_reduce(a, b):
    n = a[0] + b[0]
    x = a[1] + b[1]
    y = a[2] + b[2]
    x2 = a[3] + b[3]
    y2 = a[4] + b[4]
    xy = a[5] + b[5]

    return n, x, y, x2, y2, xy

def calculate_pearson(a):
    import math

    n = a[0]
    x = a[1]
    y = a[2]
    x2 = a[3]
    y2 = a[4]
    xy = a[5]

    return (n*xy - x * y) / (math.sqrt((n*x2 - (x**2)) * (n*y2 - (y**2))))

def compute_pearson(rdd):
    temp = rdd.flatMap(lambda x: [(i, (x[1][i+1], x[1][0])) for i in range(len(x[1])-1)]).\
        mapValues(lambda a: (1, a[0], a[1], a[0]**2, a[1]**2, a[0]*a[1])).\
        reduceByKey(pearson_reduce).\
        mapValues(calculate_pearson).collect()
    return temp

In [126]:
features_score = compute_pearson(BIG_RDD)
features_score.sort(key=lambda x: abs(x[1]), reverse=True)

print("Features ranking:")
for feat in features_score:
    print(f"\t- {features_name[feat[0]]}: {feat[1]}")

Features ranking:
	- categories_nb: -0.007708906072711242
	- month: -0.0071614357553861745
	- day_of_month: -0.006264159623711706
	- season: -0.004767394584903107
	- length: 0.002930539306274257
	- distinct_nb: 0.0027738942053009237
	- same_category: 0.0025279332476597477
	- most_viewed_item: -0.0024521005141891605
	- weekend: -0.0022594336768649668
	- repetitive_nb: 0.002235108828861097
	- hash_items: -0.0021588631256048373
	- diff_category: -0.0020495355809238697
	- weekday: -0.001951938823723691
	- duration: 0.0016147845578619738
	- longest_item: -0.00159280811850096
	- last_item: -0.0014335263673236035
	- night: -0.0006003490621040677
	- average_time: 0.0005275049188534646
	- hour: 0.00014790122764263172
	- day_period: -0.00013857527522151224


TEST Covariance

TEST Cramer's V

In [129]:
from pyspark.sql.types import Row

def f(x):
    dico = {}
    dico["session_id"] = x[0]
    dico["item_id"] = x[1][0]
    for i in range(len(features_name)):
        dico[features_name[i]] = x[1][i+1]
    return dico
        
        
df = BIG_RDD.map(lambda x: Row(**f(x))).toDF().toPandas()


In [132]:
df["hash_items"] = np.abs(df["hash_items"])

In [133]:
import scipy.stats as stats
import numpy as np

for fe in features_name:
    if fe not in ("lol"):
        data = df[[fe, "item_id"]].to_numpy()

        #Chi-squared test statistic, sample size, and minimum of rows and columns
        X2 = stats.chi2_contingency(data, correction=False)[0]
        n = np.sum(data)
        minDim = min(data.shape)-1

        #calculate Cramer's V 
        V = np.sqrt((X2/n) / minDim)

        #display Cramer's V
        print(fe, ":", V)

month : 0.037410992802345576
season : 0.020655312197883156
day_of_month : 0.06079113289868695
weekday : 0.029771706492512037
weekend : 0.01539462099643287
hour : 0.05166155392811931
day_period : 0.022634575436653575
night : 0.01596522430855064
duration : nan
average_time : nan
distinct_nb : 0.046137172607572434
repetitive_nb : 0.03641206907862046
same_category : 0.08184998398318331
diff_category : 0.0704978594689984
last_item : 0.4308699309693907
most_viewed_item : 0.4957581805865106
length : 0.05058107704663821
longest_item : 0.4317797983282462
categories_nb : nan
hash_items : 0.0001317630332520854


In [None]:
# sdi = df.select("stddev ('item_id')")
print(sdi)
for feature in features_name:
    print(f"{feature} : {df.cov('item_id', feature)}")

## Forward feature selection

# Part 3 : Model

In [None]:
#from pyspark.ml.classification import NaiveBayes
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

In [None]:
test = BIG_RDD.map(lambda x: LabeledPoint(x[1][0], list(x[1][1:-4])+[str(x[1][4])]+list(x[1][-3:])))

In [None]:

model = NaiveBayes.train(test)
#nb = NaiveBayes()
#nb.setFeaturesCol("month")
#model = nb.fit(df)


In [None]:
# https://spark.apache.org/docs/latest/mllib-naive-bayes.html

predictionAndLabel = BIG_RDD.map(lambda x: (model.predict(list(x[1][1:-4])+[str(x[1][4])]+list(x[1][-3:])), x[1][0]))
accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
print('model accuracy {}'.format(accuracy))