# Project Update
#### Will Jarrard (wej5ar), Abhi Dommalapati (ad4bu), Sebastian Ranasinghe (sar2jf)

## Setup Environment and clean data

In [1]:
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running in local mode on all available
# cores, with a 150 GB driver-memory setting.
spark = (
    SparkSession.builder
    .master('local[*]')
    .config("spark.driver.memory", "150g")
    .appName('my-cool-app')
    .getOrCreate()
)

# Start from a clean slate: drop anything cached in this session.
spark.catalog.clearCache()

### Read in Data

In [2]:
# Load the raw transaction log. The first row holds the column names and the
# column types are inferred from the data.
transactions_train = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('/project/ds5559/h_and_m/transactions_train.csv')
)
transactions_train.printSchema()

root
 |-- t_dat: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- article_id: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- sales_channel_id: integer (nullable = true)



## Clean Data

#### Separate out only 2020 because we get a memory error if we use more data

In [3]:
from pyspark.sql.functions import from_unixtime, unix_timestamp, year, month, col, date_format, to_date

# Only this calendar year is kept; using more data caused a memory error.
TARGET_YEAR = 2020

# Parse the 'yyyy-MM-dd' text straight into a date. This replaces the earlier
# string -> unix timestamp -> formatted string round trip, which produced the
# same year but needed two extra conversions.
# The cast to string is kept so the parse still works if schema inference ever
# yields a non-string type for t_dat.
transactions_train = transactions_train.withColumn(
    'date', to_date(col('t_dat').cast('string'), 'yyyy-MM-dd')
)

# Derive the year and keep only the target year's transactions.
# Rows whose date could not be parsed have a null year and are dropped here.
transactions_train = transactions_train.withColumn('year', year(col('date')))
transactions_train = transactions_train.filter(col('year') == TARGET_YEAR)

# Get count by transaction so we can use it for ALS modeling:
# one row per (customer, article) pair with the number of purchases.
transactions_train = transactions_train.groupby('customer_id', 'article_id').count()
transactions_train.show(5)

+--------------------+----------+-----+
|         customer_id|article_id|count|
+--------------------+----------+-----+
|00f5ce6142a289516...| 796210010|    2|
|01da48c6794598377...| 621073001|    1|
|0871a5a2f27641068...| 784727001|    1|
|109defd99fce9bfa0...| 803986002|    1|
|1596f86f7ae4b8977...| 654590002|    1|
+--------------------+----------+-----+
only showing top 5 rows



#  Model

### Need to fit string indexer to all data first

In [4]:
from pyspark.ml.feature import StringIndexer

# ALS needs numeric user/item ids, so both id columns are mapped to numeric
# indices. The indexer is fitted on the full dataset *before* splitting so
# that train and test share a single id -> index mapping.
cols = ['customer_id', 'article_id']
indexer = StringIndexer(inputCols=cols, outputCols=[x + "_index" for x in cols])
transactions_train = indexer.fit(transactions_train).transform(transactions_train)

# 80/20 train/test split. A fixed seed keeps the partition (and therefore
# every metric reported below) reproducible across kernel restarts.
(training, test) = transactions_train.randomSplit([0.8, 0.2], seed=42)

### Train Explicit Model 
[see difference between implicit and explicit here](https://spark.apache.org/docs/latest/ml-collaborative-filtering.html#explicit-vs-implicit-feedback)

In [5]:
# Model
from pyspark.ml.recommendation import ALS

# See https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.mllib.recommendation.ALS.html
als=ALS(userCol="customer_id_index", 
        itemCol="article_id_index", 
        ratingCol="count",
        coldStartStrategy="drop", 
        nonnegative=True,
        implicitPrefs=False
       )

model = als.fit(training)

### Make predictions

In [6]:
# Score the held-out (customer, article) pairs with the fitted ALS model.
predictions = model.transform(test)
predictions.show(n=10)

+--------------------+----------+-----+-----------------+----------------+----------+
|         customer_id|article_id|count|customer_id_index|article_id_index|prediction|
+--------------------+----------+-----+-----------------+----------------+----------+
|384dc99df15cab60d...| 817353008|    1|          78120.0|           148.0|  1.029453|
|56c982bed14f66475...| 817353008|    1|         391318.0|           148.0|0.97749376|
|1896d3091da3fe94c...| 817353008|    2|          56203.0|           148.0| 1.0770671|
|1cae596bdbdc76382...| 817353008|    1|           1903.0|           148.0| 1.0562613|
|94bb44a2887f2d8ab...| 817353008|    1|         497152.0|           148.0|0.98222816|
|5fd4cd2cddce2a1b7...| 817353008|    1|          18838.0|           148.0|0.99268097|
|5887a8256c7f0fa71...| 817353008|    1|          31759.0|           148.0|0.98300815|
|5c2eb4acf551af4ec...| 817353008|    1|         190874.0|           148.0| 0.9536376|
|a08f8751c9cb556f8...| 817353008|    1|         330731

### Evaluate using RMSE

In [7]:
from pyspark.ml.evaluation import RegressionEvaluator

# Root-mean-squared error between the observed purchase count and the
# model's predicted value on the test predictions.
evaluator = RegressionEvaluator(
    labelCol="count",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(rmse)

0.4862462184706069


### Make recommendations based off of items

In [8]:
# For every article, the 10 customers with the highest predicted rating.
item_recs = model.recommendForAllItems(numUsers=10)
item_recs.show(n=5)

+----------------+--------------------+
|article_id_index|     recommendations|
+----------------+--------------------+
|             148|[{771241, 38.5180...|
|             463|[{840952, 42.6872...|
|             471|[{771241, 34.7359...|
|             496|[{771241, 36.0331...|
|             833|[{771241, 36.0542...|
+----------------+--------------------+
only showing top 5 rows



### Make recommendations based off of users

In [9]:
# For every customer, the 12 articles with the highest predicted rating.
user_recs = model.recommendForAllUsers(numItems=12)
user_recs.show(n=5)

+-----------------+--------------------+
|customer_id_index|     recommendations|
+-----------------+--------------------+
|              148|[{60727, 2.894691...|
|              463|[{34299, 3.846258...|
|              471|[{60727, 3.883132...|
|              496|[{56050, 3.435591...|
|              833|[{60727, 3.338421...|
+-----------------+--------------------+
only showing top 5 rows



### Reformat to Calculate Mean Average Precision 

In [10]:
# Preview the indexed purchase counts that the ground-truth labels are built from.
transactions_train.show(5)

+--------------------+----------+-----+-----------------+----------------+
|         customer_id|article_id|count|customer_id_index|article_id_index|
+--------------------+----------+-----+-----------------+----------------+
|00f5ce6142a289516...| 796210010|    2|          67509.0|           175.0|
|01da48c6794598377...| 621073001|    1|         284500.0|          4769.0|
|0871a5a2f27641068...| 784727001|    1|         343407.0|         11369.0|
|109defd99fce9bfa0...| 803986002|    1|         671375.0|          1945.0|
|1596f86f7ae4b8977...| 654590002|    1|          26536.0|         12448.0|
+--------------------+----------+-----+-----------------+----------------+
only showing top 5 rows



In [11]:
import pyspark.sql.functions as f

# Build one row per customer holding:
#   - customer_id_index: the customer's numeric index (the first value seen
#     in the group is taken);
#   - labels: the customer's purchased article indices joined into one
#     comma-separated string.
# NOTE(review): labels are drawn from the full indexed dataset, not only the
# `test` split, so they include pairs the model was trained on. Confirm this
# is the intended ground truth for the ranking metric.
customer_index = f.first("customer_id_index").alias("customer_id_index")
purchased_labels = f.concat_ws(",", f.collect_list("article_id_index")).alias("labels")

grouped_trans = transactions_train.groupBy("customer_id").agg(customer_index, purchased_labels)

In [12]:
# Attach each customer's recommendations to their purchased-article labels.
# Both frames carry a customer_id_index column, so the copy coming from
# grouped_trans is dropped after the join to avoid a duplicate.
same_customer = grouped_trans.customer_id_index == user_recs.customer_id_index
joined = (
    grouped_trans
    .join(user_recs, same_customer)
    .drop(grouped_trans.customer_id_index)
)

In [13]:
# Inspect the joined schema before extracting (recommendations, labels) pairs.
joined.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- labels: string (nullable = false)
 |-- customer_id_index: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- article_id_index: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [14]:
# Show a single joined row as a sanity check.
joined.show(1)

+--------------------+--------------------+-----------------+--------------------+
|         customer_id|              labels|customer_id_index|     recommendations|
+--------------------+--------------------+-----------------+--------------------+
|27a246bb259207e7b...|5103.0,153.0,2730...|              299|[{42201, 3.358554...|
+--------------------+--------------------+-----------------+--------------------+
only showing top 1 row



In [15]:
# Build an RDD of (recommended article indices, purchased article indices).
# Labels arrive as a comma-separated string of float-formatted indices
# (e.g. "5103.0,153.0"), so each token is parsed as a float and then
# truncated to an int to match the integer indices in the recommendations.
recs_and_labels = joined.select("recommendations.article_id_index", "labels").rdd
pred_labels = recs_and_labels.map(
    lambda row: (row[0], [int(float(token)) for token in row[1].split(",")])
)

In [16]:
from pyspark.mllib.evaluation import RankingMetrics

# Ranking evaluator over the (recommended items, relevant items) pairs.
metrics = RankingMetrics(predictionAndLabels=pred_labels)

In [17]:
# Mean average precision of the recommendations against the purchase labels.
metrics.meanAveragePrecision

0.00023766120356183244