# INTRODUCTION
In this notebook we will be using the following packages as bullet list:
  * [pyspark](https://spark.apache.org/docs/latest/api/python/pyspark.html)
  * [pandas](https://pandas.pydata.org/pandas-docs/stable/index.html)
  * [pyarrow](https://arrow.apache.org/docs/python/index.html)

We will be using the following data:
  * dressipi_recsys2022

The dataset is split in 4 csv files :
   * candidate_items : contains the candidate items for each user
   * item_features : contains the features for each item (items can have multiple features)
   * train_purchases : contains the purchases for each user
   * train_sessions : contains the sessions for each user



In [1]:
# Uncomment to install the required packages
# %pip install pyspark
# %pip install pyarrow


## Loading the dataset

In [52]:
# from dataset import *
# from pipeline import *
# from our_model import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np

# start spark session
spark = SparkSession.builder.appName("Python Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()

candidate_items, item_features, train_purchases, train_sessions = [None] * 4
datasets = [candidate_items, item_features, train_purchases, train_sessions]

#load all datasets
def load_candidate_items():
    global candidate_items
    candidate_items = spark.read.csv("dressipi_recsys2022/candidate_items.csv", header=True)
    candidate_items = candidate_items.withColumn("item_id", candidate_items["item_id"].cast("int"))

def load_item_features():
    global item_features
    item_features = spark.read.csv("dressipi_recsys2022/item_features.csv", header=True)
    item_features = item_features.withColumn("item_id", item_features["item_id"].cast("int"))
    item_features = item_features.withColumn("feature_category_id", item_features["feature_category_id"].cast("int"))
    item_features = item_features.withColumn("feature_value_id", item_features["feature_value_id"].cast("int"))

def load_train_purchases():
    global train_purchases
    train_purchases = spark.read.csv("dressipi_recsys2022/train_purchases.csv", header=True)
    train_purchases = train_purchases.withColumn("session_id", train_purchases["session_id"].cast("int"))
    train_purchases = train_purchases.withColumn("item_id", train_purchases["item_id"].cast("int"))
    train_purchases = train_purchases.withColumn("date", train_purchases["date"].cast("timestamp"))

def load_train_sessions():
    global train_sessions
    train_sessions = spark.read.csv("dressipi_recsys2022/train_sessions.csv", header=True)
    train_sessions = train_sessions.withColumn("session_id", train_sessions["session_id"].cast("int"))
    train_sessions = train_sessions.withColumn("user_id", train_sessions["item_id"].cast("int"))
    train_sessions = train_sessions.withColumn("date", train_sessions["date"].cast("timestamp"))

def load_datasets():
    global datasets
    load_candidate_items()
    load_item_features()
    load_train_purchases()
    load_train_sessions()
    datasets = [candidate_items, item_features, train_purchases, train_sessions]

load_datasets()


## Quick look at the data

### train_sessions

In [54]:
train_sessions.printSchema()
train_sessions.show(5)

root
 |-- session_id: integer (nullable = true)
 |-- item_id: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- user_id: integer (nullable = true)

+----------+-------+--------------------+-------+
|session_id|item_id|                date|user_id|
+----------+-------+--------------------+-------+
|         3|   9655|2020-12-18 21:25:...|   9655|
|         3|   9655|2020-12-18 21:19:...|   9655|
|        13|  15654|2020-03-13 19:35:...|  15654|
|        18|  18316|2020-08-26 19:18:...|  18316|
|        18|   2507|2020-08-26 19:16:...|   2507|
+----------+-------+--------------------+-------+
only showing top 5 rows



### train_purchases

In [4]:
train_purchases.show(5)

+----------+-------+--------------------+
|session_id|item_id|                date|
+----------+-------+--------------------+
|         3|  15085|2020-12-18 21:26:...|
|        13|  18626|2020-03-13 19:36:...|
|        18|  24911|2020-08-26 19:20:...|
|        19|  12534|2020-11-02 17:16:...|
|        24|  13226|2020-02-26 18:27:...|
+----------+-------+--------------------+
only showing top 5 rows



### item_features

In [5]:
item_features.show(5)

+-------+-------------------+----------------+
|item_id|feature_category_id|feature_value_id|
+-------+-------------------+----------------+
|      2|                 56|             365|
|      2|                 62|             801|
|      2|                 68|             351|
|      2|                 33|             802|
|      2|                 72|              75|
+-------+-------------------+----------------+
only showing top 5 rows



### candidate_items

In [6]:
candidate_items.show(5)

+-------+
|item_id|
+-------+
|      4|
|      8|
|      9|
|     19|
|     20|
+-------+
only showing top 5 rows



# Part 1 : Pipeline

## Checking for null values

In [7]:
# Are there missing values (na or null) in any of those dataframes ? print the number of missing values
print("candidate_items:", candidate_items.filter(candidate_items["item_id"].isNull()).count())
print("item_features:", item_features.filter(item_features["item_id"].isNull()).count())
print("train_purchases:", train_purchases.filter(train_purchases["item_id"].isNull()).count())
print("train_sessions:", train_sessions.filter(train_sessions["item_id"].isNull()).count())

candidate_items: 0
item_features: 0
train_purchases: 0
train_sessions: 0


## Normalization

## Feature engineering

Session duration

In [44]:
train_sessions = train_sessions.drop("user_id")




DataFrame[session_id: int, item_id: string, date: timestamp]

In [25]:
# a = train_sessions.rdd.map(lambda x: x.session_id)
# a.take(5)

def get_session_duration(times):
    times.sort()
    return times[-1] - times[0]

# groupedBySessionID = train_sessions.rdd.union(train_purchases.rdd).groupBy(lambda x: x.session_id)



# sessions_duration = groupedBySessionID.map(lambda y: (y[0], get_session_duration([z.date for z in y[1]])))

# sessions_length = groupedBySessionID.map(lambda y: (y[0], len(list(y[1]))))

# print(sessions_duration.take(5))
# print(sessions_length.take(5))

[(48, datetime.timedelta(seconds=692, microseconds=820000)), (208, datetime.timedelta(seconds=20, microseconds=598000)), (352, datetime.timedelta(seconds=549, microseconds=149000)), (384, datetime.timedelta(seconds=436, microseconds=156000)), (464, datetime.timedelta(seconds=90, microseconds=695000))]
[(48, 3), (208, 2), (352, 3), (384, 11), (464, 2)]


In [55]:
fullData = train_sessions.groupBy("session_id").agg(F.collect_list("item_id"), F.collect_list("date")).join(train_purchases, "session_id", "fullouter")


Purchase rate of every item

In [100]:
purchase_rate = train_purchases.groupBy("item_id").agg(F.count("session_id"))
purchase_sum = purchase_rate.agg(F.sum("count(session_id)")).collect()[0][0]
print(purchase_sum)
purchase_rate = purchase_rate.withColumn("rate", purchase_rate["count(session_id)"] / purchase_sum)
purchase_rate = purchase_rate.drop("count(session_id)")

# train_purchases.select(F.countDistinct("item_id")).show(5)
# candidate_items.select(F.countDistinct("item_id")).show(5)
# train_purchases.selectExpr("count(distinct(item_id))").show(5)
# candidate_items.selectExpr("count(distinct(item_id))").show(5)

# item_features.select(F.countDistinct("item_id")).show(5)
# purchase_rate = purchase_rate.join(candidate_items, "item_id", "right")

# purchase_rate = purchase_rate.fillna(0, "rate")
# purchase_rate.show(5)
# purchase_rate.sort(F.asc("rate")).show(5)
#purchase_rate.where("rate==null").show(5)
# purchase_rate.agg(F.sum("rate")).show(5)

1000000


Now, we can transform our data to an RDD.

In [9]:
rdd = spark.sparkContext.parallelize(data)

NameError: name 'data' is not defined

# Part 2 : Algorithms

## Scalable feature selection algorithm 1

## Scalable feature selection algorithm 2

## Ranking algorithm

## Forward feature selection

# Part 3 : Model

In [None]:
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])
rdd.filter(lambda x: x % 2 == 0).collect()

In [None]:
# show rows with item_id == 2 in all dataframes
for d in datasets:
    print(d.filter(d["item_id"] == 2).collect())