# Alternating Least Squares (ALS) Collaborative Filtering Recommender

In [50]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

## Spark Session Initialization
Initializing a Spark session with increased memory allocation to handle large datasets.

In [51]:
spark = SparkSession.builder \
    .appName("KuaiRecALS") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()

# Data Loading and Sampling
Loading the user-item interaction data and sampling a fraction to fit into memory for demonstration purposes.

In [52]:
interactions_raw = pd.read_csv("data_final_project/KuaiRec 2.0/data/small_matrix.csv")

# Reduce the size of the DataFrame to fit into memory
interactions_raw = interactions_raw.sample(frac=0.1, random_state=42)
interactions_raw.head(5)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
224263,385,6088,3010,14240,2020-08-08 09:49:55.461,20200808.0,1596851000.0,0.211376
2682796,4275,570,1543,25310,2020-07-29 20:38:30.483,20200729.0,1596026000.0,0.060964
32875,55,7281,1276,6667,2020-08-28 07:44:46.355,20200828.0,1598572000.0,0.19139
262283,477,7093,6709,28929,2020-07-14 19:24:31.309,20200714.0,1594726000.0,0.231913
3228863,5000,7125,2976,6016,2020-07-18 11:21:36.833,20200718.0,1595042000.0,0.494681


## Data Preparation

Creating a binary 'is_like' column to represent positive feedback and normalizing ratings for ALS.

In [53]:
interactions_df = interactions_raw.copy()
interactions_df["is_like"] = interactions_df['watch_ratio'].apply(lambda x: 1 if x >= 2 else 0)
interactions_df['ratings'] = interactions_df['watch_ratio'].apply(lambda x: 1 if x >= 2 else 0)
interactions_df.head(5)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,is_like,ratings
224263,385,6088,3010,14240,2020-08-08 09:49:55.461,20200808.0,1596851000.0,0.211376,0,0
2682796,4275,570,1543,25310,2020-07-29 20:38:30.483,20200729.0,1596026000.0,0.060964,0,0
32875,55,7281,1276,6667,2020-08-28 07:44:46.355,20200828.0,1598572000.0,0.19139,0,0
262283,477,7093,6709,28929,2020-07-14 19:24:31.309,20200714.0,1594726000.0,0.231913,0,0
3228863,5000,7125,2976,6016,2020-07-18 11:21:36.833,20200718.0,1595042000.0,0.494681,0,0


### Conversion to Spark DataFrame

In [54]:
interactions_spark = spark.createDataFrame(interactions_df)

### Selecting Relevant Columns

In [55]:
ratings_spark = interactions_spark.select('user_id', 'video_id', 'ratings')
ratings_spark.show(5)

25/05/17 23:12:17 WARN TaskSetManager: Stage 619 contains a task of very large size (2287 KiB). The maximum recommended task size is 1000 KiB.
[Stage 619:>                                                        (0 + 1) / 1]

+-------+--------+-------+
|user_id|video_id|ratings|
+-------+--------+-------+
|    385|    6088|      0|
|   4275|     570|      0|
|     55|    7281|      0|
|    477|    7093|      0|
|   5000|    7125|      0|
+-------+--------+-------+
only showing top 5 rows



25/05/17 23:12:21 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 619 (TID 1676): Attempting to kill Python Worker
                                                                                

### Indexing User and Item IDs
Encoding user and video IDs as numerical indices, which is required for Spark's ALS implementation.

In [56]:
indexer = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in list(set(ratings_spark.columns) - set(["ratings"]))
]

pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(ratings_spark).transform(ratings_spark)
transformed.show(5)

25/05/17 23:12:21 WARN TaskSetManager: Stage 620 contains a task of very large size (2287 KiB). The maximum recommended task size is 1000 KiB.
25/05/17 23:12:21 WARN TaskSetManager: Stage 623 contains a task of very large size (2287 KiB). The maximum recommended task size is 1000 KiB.


+-------+--------+-------+--------------+-------------+
|user_id|video_id|ratings|video_id_index|user_id_index|
+-------+--------+-------+--------------+-------------+
|    385|    6088|      0|        1511.0|        549.0|
|   4275|     570|      0|        2641.0|        354.0|
|     55|    7281|      0|        2204.0|       1298.0|
|    477|    7093|      0|        2087.0|        307.0|
|   5000|    7125|      0|          53.0|        655.0|
+-------+--------+-------+--------------+-------------+
only showing top 5 rows



25/05/17 23:12:22 WARN TaskSetManager: Stage 626 contains a task of very large size (2287 KiB). The maximum recommended task size is 1000 KiB.


### Train-Test Split
Splitting the data into training and test sets to evaluate model performance.

In [57]:
(training, test) = transformed.randomSplit([0.8, 0.2], seed=42)

# ALS Model Training
Configuring and training the ALS model to learn latent factors for users and items.

In [58]:
als = ALS(
    maxIter=5,
    regParam=0.25,
    rank=25,
    userCol="user_id_index",
    itemCol="video_id_index",
    ratingCol="ratings",
    coldStartStrategy="drop",
    nonnegative=True,
)

model = als.fit(training)

25/05/17 23:12:22 WARN TaskSetManager: Stage 627 contains a task of very large size (2287 KiB). The maximum recommended task size is 1000 KiB.
25/05/17 23:12:22 WARN TaskSetManager: Stage 628 contains a task of very large size (2287 KiB). The maximum recommended task size is 1000 KiB.


## Evaluation
Evaluating the ALS model using RMSE on the test set to assess prediction accuracy.

In [59]:
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="ratings", predictionCol="prediction"
)

predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

print("RMSE=" + str(rmse))
predictions.show(5)

25/05/17 23:12:22 WARN TaskSetManager: Stage 661 contains a task of very large size (2287 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

RMSE=0.21494733109519942


25/05/17 23:12:23 WARN TaskSetManager: Stage 700 contains a task of very large size (2287 KiB). The maximum recommended task size is 1000 KiB.


+-------+--------+-------+--------------+-------------+-------------+
|user_id|video_id|ratings|video_id_index|user_id_index|   prediction|
+-------+--------+-------+--------------+-------------+-------------+
|   1015|    8718|      0|        1645.0|        392.0|1.3681786E-10|
|   2139|    8718|      0|        1645.0|       1064.0| 8.910314E-11|
|   1884|   10364|      0|        1238.0|       1269.0|1.2824435E-11|
|   1354|    8718|      0|        1645.0|        860.0|8.5908274E-11|
|    896|   10416|      0|        2122.0|        222.0|3.9887837E-12|
+-------+--------+-------+--------------+-------------+-------------+
only showing top 5 rows



                                                                                

In [60]:
user_recs = model.recommendForAllUsers(20).show(10)



+-------------+--------------------+
|user_id_index|     recommendations|
+-------------+--------------------+
|           12|[{2241, 3.3055317...|
|           22|[{2241, 9.2186325...|
|           26|[{2241, 1.180758E...|
|           27|[{2241, 1.9269052...|
|           28|[{2241, 2.177901E...|
|           31|[{2241, 5.652655E...|
|           34|[{2241, 7.935188E...|
|           44|[{2241, 4.964044E...|
|           47|[{2241, 4.3555692...|
|           53|[{2241, 1.9381602...|
+-------------+--------------------+
only showing top 10 rows



                                                                                

## Post-processing Recommendations
Converting Spark recommendations to Pandas, mapping indices back to original IDs, and organizing recommendations for evaluation.

In [61]:
recs = model.recommendForAllUsers(10).toPandas()
df_recs = (
    recs.recommendations.apply(pd.Series)
    .merge(recs, right_index=True, left_index=True)
    .drop(["recommendations"], axis=1)
    .melt(id_vars=["user_id_index"], value_name="recommendation")
    .drop("variable", axis=1)
    .dropna()
)

df_recs = df_recs.sort_values("user_id_index")
df_recs = pd.concat(
    [df_recs["recommendation"].apply(pd.Series), df_recs["user_id_index"]], axis=1
)

df_recs.columns = ["product_id_index", "ratings", "reviewer_id"]
tmp = transformed.select(
    transformed["user_id"],
    transformed["user_id_index"],
    transformed["video_id"],
    transformed["video_id_index"],
)
tmp = tmp.toPandas()

dict1 = dict(zip(tmp["user_id_index"], tmp["user_id"]))
dict2 = dict(zip(tmp["video_id_index"], tmp["video_id"]))

df_recs_copy = df_recs.copy()
df_recs_copy.loc[:, "user_id"] = df_recs["reviewer_id"].map(dict1)
df_recs_copy.loc[:, "video_id"] = df_recs["product_id_index"].map(dict2)
df_recs_copy = df_recs_copy.sort_values("user_id")
df_recs_copy.reset_index(drop=True, inplace=True)

new = df_recs_copy[["user_id", "video_id", "ratings"]]
new["recommendations"] = list(zip(new.video_id, new.ratings))

res = new[["user_id", "recommendations"]]
res_new = res["recommendations"].groupby([res.user_id]).apply(list).reset_index()

print(res_new)

25/05/17 23:12:28 WARN TaskSetManager: Stage 796 contains a task of very large size (2287 KiB). The maximum recommended task size is 1000 KiB.


      user_id                                    recommendations
0          14  [(9178, 5.043813144922638e-10), (5464, 4.58507...
1          19  [(600, 2.7589139306449795e-10), (5525, 2.53410...
2          21  [(1305, 5.332920216538639e-10), (2130, 4.43843...
3          23  [(2130, 7.548934810586161e-10), (7383, 7.70229...
4          24  [(154, 7.266417467732822e-10), (600, 8.0075612...
...       ...                                                ...
1406     7142  [(314, 1.8574650151315097e-10), (7383, 1.95947...
1407     7147  [(7383, 3.6516792367713435e-10), (1305, 4.3004...
1408     7153  [(4040, 1.7924446649164594e-10), (5525, 1.6438...
1409     7159  [(2130, 5.6445453866516e-10), (5525, 5.5618692...
1410     7162  [(9178, 1.4693225525164166e-09), (7383, 1.3762...

[1411 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new["recommendations"] = list(zip(new.video_id, new.ratings))


### Evaluation for a Specific User
Evaluating the recommendations for a specific user by comparing recommended videos to those the user actually liked, and calculating precision and recall.

In [62]:
user_id = 14
user_recommendations = res_new[res_new["user_id"] == user_id]["recommendations"].values[0]
video_ids = [video_id for video_id, _ in user_recommendations]
print(video_ids)
truth = set(interactions_df[(interactions_df["user_id"] == user_id) & (interactions_df["is_like"] == 1)]["video_id"].values)

precision = len(set(video_ids) & set(truth)) / len(video_ids) if video_ids else 0
recall = len(set(video_ids) & set(truth)) / len(truth) if truth else 0
print(f"Precision: {precision}, Recall: {recall}")

[9178, 5464, 1305, 7383, 314, 5525, 600, 154, 4040, 2130]
Precision: 0.0, Recall: 0.0
