# Alternating Least Squares Recommender

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [2]:
# Spark settings applied to the session builder, in this order.
spark_conf = {
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.driver.maxResultSize": "2g",
}

# Create the "KuaiRecALS" session, or reuse one that is already running.
session_builder = SparkSession.builder.appName("KuaiRecALS")
for conf_key, conf_value in spark_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

25/05/17 22:56:43 WARN Utils: Your hostname, cal.local resolves to a loopback address: 127.0.0.1; using 192.168.1.58 instead (on interface en0)
25/05/17 22:56:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/17 22:56:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the interaction matrix and immediately keep a seeded 10% sample of the
# rows so the data fits into memory.
interactions_raw = (
    pd.read_csv("data_final_project/KuaiRec 2.0/data/small_matrix.csv")
    .sample(frac=0.1, random_state=42)
)
interactions_raw.head(5)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
224263,385,6088,3010,14240,2020-08-08 09:49:55.461,20200808.0,1596851000.0,0.211376
2682796,4275,570,1543,25310,2020-07-29 20:38:30.483,20200729.0,1596026000.0,0.060964
32875,55,7281,1276,6667,2020-08-28 07:44:46.355,20200828.0,1598572000.0,0.19139
262283,477,7093,6709,28929,2020-07-14 19:24:31.309,20200714.0,1594726000.0,0.231913
3228863,5000,7125,2976,6016,2020-07-18 11:21:36.833,20200718.0,1595042000.0,0.494681


In [5]:
# Turn watch_ratio into a binary label: 1 when watch_ratio >= 2, otherwise 0.
# (In the sample rows above watch_ratio equals play_duration / video_duration,
# so 2 corresponds to the clip having been played through twice.)
# `is_like` is used later as the ground truth and `ratings` is the label fed
# to ALS; both carry the same 0/1 signal.
is_like = (interactions_raw["watch_ratio"] >= 2).astype("int64")
interactions_df = interactions_raw.assign(is_like=is_like, ratings=is_like.copy())
interactions_df.head(5)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,is_like,ratings
224263,385,6088,3010,14240,2020-08-08 09:49:55.461,20200808.0,1596851000.0,0.211376,0,0
2682796,4275,570,1543,25310,2020-07-29 20:38:30.483,20200729.0,1596026000.0,0.060964,0,0
32875,55,7281,1276,6667,2020-08-28 07:44:46.355,20200828.0,1598572000.0,0.19139,0,0
262283,477,7093,6709,28929,2020-07-14 19:24:31.309,20200714.0,1594726000.0,0.231913,0,0
3228863,5000,7125,2976,6016,2020-07-18 11:21:36.833,20200718.0,1595042000.0,0.494681,0,0


In [6]:
# Hand the labelled pandas frame over to Spark.
interactions_spark = spark.createDataFrame(data=interactions_df)

In [None]:
# Keep only the columns the recommender works with: user, video and label.
rating_columns = ["user_id", "video_id", "ratings"]
ratings_spark = interactions_spark.select(*rating_columns)
ratings_spark.show()

In [None]:
# One StringIndexer per id column (every column except the "ratings" label);
# each adds a "<column>_index" column next to the original.
# The columns are taken in DataFrame order rather than through a set
# difference, so the stage order -- and the column order of `transformed` --
# is the same on every run instead of depending on string hash randomisation.
id_columns = [column for column in ratings_spark.columns if column != "ratings"]
indexer = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in id_columns
]

# Fit all indexers on the ratings and append the index columns.
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(ratings_spark).transform(ratings_spark)
transformed.show()

In [None]:
# Seeded 80/20 random split of the indexed ratings.
training, test = transformed.randomSplit(weights=[0.8, 0.2], seed=42)

In [None]:
# ALS on the indexed user/video columns with the 0/1 `ratings` label.
als = ALS(
    maxIter=5,
    regParam=0.25,
    rank=25,
    userCol="user_id_index",
    itemCol="video_id_index",
    ratingCol="ratings",
    # Drop rows whose prediction is NaN (users/videos not seen during fit) so
    # that the evaluation metric stays defined.
    coldStartStrategy="drop",
    nonnegative=True,
    # Fix the seed: the sample and the split are already seeded with 42, and
    # without this the factor initialisation changes between runs.
    seed=42,
)

model = als.fit(training)

In [None]:
# Score the held-out split and report the root-mean-square error between the
# predicted value and the 0/1 label.
predictions = model.transform(test)

evaluator = RegressionEvaluator(
    labelCol="ratings",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

print(f"RMSE={rmse}")
predictions.show()

In [None]:
# Top-20 recommendations for every user. `show()` only prints and returns
# None, so keep the DataFrame in `user_recs` and display it separately.
user_recs = model.recommendForAllUsers(20)
user_recs.show(10)

In [None]:
# Top-10 recommendations per user, collected to the driver: one row per
# user_id_index whose "recommendations" cell holds a list of
# (video index, predicted rating) entries.
recs = model.recommendForAllUsers(10).toPandas()

# Spread each list over positional columns, then melt back to one row per
# (user_id_index, recommendation) pair.
df_recs = (
    recs.recommendations.apply(pd.Series)
    .merge(recs, right_index=True, left_index=True)
    .drop(["recommendations"], axis=1)
    .melt(id_vars=["user_id_index"], value_name="recommendation")
    .drop("variable", axis=1)
    .dropna()
    .sort_values("user_id_index")
)

# Split every recommendation entry into its two fields.
df_recs = pd.concat(
    [df_recs["recommendation"].apply(pd.Series), df_recs["user_id_index"]], axis=1
)
df_recs.columns = ["video_id_index", "ratings", "user_id_index"]

# Index -> original id lookups. Only the distinct (index, id) pairs are
# collected; the full interaction table is not needed on the driver for this.
user_lookup = transformed.select("user_id_index", "user_id").distinct().toPandas()
video_lookup = transformed.select("video_id_index", "video_id").distinct().toPandas()
user_index_to_id = dict(zip(user_lookup["user_id_index"], user_lookup["user_id"]))
video_index_to_id = dict(zip(video_lookup["video_id_index"], video_lookup["video_id"]))

# Translate the indices back to the original ids. `assign` returns a new
# frame, so the column added below does not write into a slice of another
# frame (which is what raised SettingWithCopyWarning before).
recs_by_user = (
    df_recs.assign(
        user_id=df_recs["user_id_index"].map(user_index_to_id),
        video_id=df_recs["video_id_index"].map(video_index_to_id),
    )
    .sort_values("user_id")
    .reset_index(drop=True)
)
recs_by_user["recommendations"] = list(
    zip(recs_by_user["video_id"], recs_by_user["ratings"])
)

# One row per user_id with the list of (video_id, predicted rating) tuples.
res_new = recs_by_user.groupby("user_id")["recommendations"].apply(list).reset_index()

print(res_new)

In [None]:
# Precision / recall of the recommendations for a single user, measured
# against the videos that user liked (is_like == 1) in `interactions_df`.
user_id = 14

# A user without any recommendation row gets an empty list instead of an
# IndexError, so the zero-guards on precision and recall below can apply.
user_rows = res_new.loc[res_new["user_id"] == user_id, "recommendations"]
user_recommendations = user_rows.iloc[0] if not user_rows.empty else []
video_ids = [video_id for video_id, _ in user_recommendations]
print(video_ids)

liked_mask = (interactions_df["user_id"] == user_id) & (interactions_df["is_like"] == 1)
truth = set(interactions_df.loc[liked_mask, "video_id"].values)

# Recommended videos that the user actually liked.
hits = set(video_ids) & truth
precision = len(hits) / len(video_ids) if video_ids else 0
recall = len(hits) / len(truth) if truth else 0
print(f"Precision: {precision}, Recall: {recall}")