# Alternating Least Squares (ALS) Collaborative Filtering Recommender

In [198]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

## Spark Session Initialization
Initializing a Spark session with increased memory allocation to handle large datasets.

In [199]:
spark = SparkSession.builder \
    .appName("KuaiRecALS") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.maxResultSize", "2g") \
    .getOrCreate()

# Data Loading and Sampling
Loading the user-item interaction data and sampling a fraction to fit into memory for demonstration purposes.

In [200]:
interactions_raw = pd.read_csv("data_final_project/KuaiRec 2.0/data/small_matrix.csv")
#interactions_raw = pd.read_csv("data_final_project/KuaiRec 2.0/data/big_matrix.csv")

# Reduce the size of the DataFrame to fit into memory
#interactions_raw = interactions_raw.sample(frac=0.1, random_state=42)
interactions_raw.head(5)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1593898000.0,0.722103
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1593898000.0,1.907377
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1593898000.0,2.063311
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1593898000.0,0.566388
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1593899000.0,0.418364


## Data Preparation

Creating a binary 'is_like' column to represent positive feedback and normalizing ratings for ALS.

In [201]:
interactions_df = interactions_raw.copy()
interactions_df["is_like"] = interactions_df['watch_ratio'].apply(lambda x: 1 if x >= 2 else 0)
#interactions_df['ratings'] = interactions_df['watch_ratio'].apply(lambda x: 1 if x >= 2 else 0)

# Normalise the watch_ratio to a scale of 0-6
# Create a finer-grained scale for watch_ratio
def scale_watch_ratio(x):
    if x >= 6:
        return 12
    elif x >= 5.5:
        return 11
    elif x >= 5:
        return 10
    elif x >= 4.5:
        return 9
    elif x >= 4:
        return 8
    elif x >= 3.5:
        return 7
    elif x >= 3:
        return 6
    elif x >= 2.5:
        return 5
    elif x >= 2:
        return 4
    elif x >= 1.5:
        return 3
    elif x >= 1:
        return 2
    elif x >= 0.5:
        return 1
    else:
        return 0

interactions_df['watch_ratio'] = interactions_df['watch_ratio'].apply(scale_watch_ratio)
interactions_df['ratings'] = interactions_df['watch_ratio']

interactions_df.head(5)

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio,is_like,ratings
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1593898000.0,1,0,1
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1593898000.0,3,0,3
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1593898000.0,4,1,4
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1593898000.0,1,0,1
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1593899000.0,0,0,0


### Conversion to Spark DataFrame

In [202]:
interactions_spark = spark.createDataFrame(interactions_df)

### Selecting Relevant Columns

In [203]:
ratings_spark = interactions_spark.select('user_id', 'video_id', 'ratings')
ratings_spark.show(5)

25/05/18 00:20:12 WARN TaskSetManager: Stage 4071 contains a task of very large size (20834 KiB). The maximum recommended task size is 1000 KiB.
[Stage 4071:>                                                       (0 + 1) / 1]

+-------+--------+-------+
|user_id|video_id|ratings|
+-------+--------+-------+
|     14|     148|      1|
|     14|     183|      3|
|     14|    3649|      4|
|     14|    5262|      1|
|     14|    8234|      0|
+-------+--------+-------+
only showing top 5 rows



25/05/18 00:20:16 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 4071 (TID 8188): Attempting to kill Python Worker
                                                                                

### Indexing User and Item IDs
Encoding user and video IDs as numerical indices, which is required for Spark's ALS implementation.

In [204]:
indexer = [
    StringIndexer(inputCol=column, outputCol=column + "_index")
    for column in list(set(ratings_spark.columns) - set(["ratings"]))
]

pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(ratings_spark).transform(ratings_spark)
transformed.show(5)

25/05/18 00:20:16 WARN TaskSetManager: Stage 4072 contains a task of very large size (20834 KiB). The maximum recommended task size is 1000 KiB.
25/05/18 00:20:17 WARN TaskSetManager: Stage 4075 contains a task of very large size (20834 KiB). The maximum recommended task size is 1000 KiB.
25/05/18 00:20:18 WARN TaskSetManager: Stage 4078 contains a task of very large size (20834 KiB). The maximum recommended task size is 1000 KiB.
[Stage 4078:>                                                       (0 + 1) / 1]

+-------+--------+-------+--------------+-------------+
|user_id|video_id|ratings|video_id_index|user_id_index|
+-------+--------+-------+--------------+-------------+
|     14|     148|      1|        3070.0|        238.0|
|     14|     183|      3|         203.0|        238.0|
|     14|    3649|      4|        3087.0|        238.0|
|     14|    5262|      1|        2329.0|        238.0|
|     14|    8234|      0|        3318.0|        238.0|
+-------+--------+-------+--------------+-------------+
only showing top 5 rows



25/05/18 00:20:22 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 4078 (TID 8219): Attempting to kill Python Worker
                                                                                

### Train-Test Split
Splitting the data into training and test sets to evaluate model performance.

In [205]:
(training, test) = transformed.randomSplit([0.8, 0.2], seed=42)

# ALS Model Training
Configuring and training the ALS model to learn latent factors for users and items.

In [206]:
als = ALS(
    maxIter=17,
    regParam=0.01,
    rank=25,
    userCol="user_id_index",
    itemCol="video_id_index",
    ratingCol="ratings",
    coldStartStrategy="drop",
    nonnegative=True,
    # implicitPrefs=True,
)

model = als.fit(training)

25/05/18 00:20:22 WARN TaskSetManager: Stage 4079 contains a task of very large size (20834 KiB). The maximum recommended task size is 1000 KiB.
25/05/18 00:20:23 WARN TaskSetManager: Stage 4080 contains a task of very large size (20834 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

## Evaluation
Evaluating the ALS model using RMSE on the test set to assess prediction accuracy.

In [207]:
evaluator = RegressionEvaluator(
    metricName="rmse", labelCol="ratings", predictionCol="prediction"
)

predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

print("RMSE=" + str(rmse))
predictions.show(5)

25/05/18 00:20:27 WARN TaskSetManager: Stage 4161 contains a task of very large size (20834 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

RMSE=1.230313879321708


25/05/18 00:20:29 WARN TaskSetManager: Stage 4248 contains a task of very large size (20834 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

+-------+--------+-------+--------------+-------------+----------+
|user_id|video_id|ratings|video_id_index|user_id_index|prediction|
+-------+--------+-------+--------------+-------------+----------+
|    120|     206|      1|        2142.0|         31.0| 0.9456809|
|    120|    2671|      1|         463.0|         31.0| 1.1577698|
|    120|    2820|      1|         496.0|         31.0| 0.6950925|
|    120|    5266|      1|        2659.0|         31.0| 1.3514937|
|     64|     762|      1|        1591.0|        451.0| 1.7157103|
+-------+--------+-------+--------------+-------------+----------+
only showing top 5 rows



## Post-processing Recommendations
Converting Spark recommendations to Pandas, mapping indices back to original IDs, and organizing recommendations for evaluation.

In [208]:
recs = model.recommendForAllUsers(10).toPandas()
df_recs = (
    recs.recommendations.apply(pd.Series)
    .merge(recs, right_index=True, left_index=True)
    .drop(["recommendations"], axis=1)
    .melt(id_vars=["user_id_index"], value_name="recommendation")
    .drop("variable", axis=1)
    .dropna()
)

df_recs = df_recs.sort_values("user_id_index")
df_recs = pd.concat(
    [df_recs["recommendation"].apply(pd.Series), df_recs["user_id_index"]], axis=1
)

df_recs.columns = ["product_id_index", "ratings", "reviewer_id"]
tmp = transformed.select(
    transformed["user_id"],
    transformed["user_id_index"],
    transformed["video_id"],
    transformed["video_id_index"],
)
tmp = tmp.toPandas()

dict1 = dict(zip(tmp["user_id_index"], tmp["user_id"]))
dict2 = dict(zip(tmp["video_id_index"], tmp["video_id"]))

df_recs_copy = df_recs.copy()
df_recs_copy.loc[:, "user_id"] = df_recs["reviewer_id"].map(dict1)
df_recs_copy.loc[:, "video_id"] = df_recs["product_id_index"].map(dict2)
df_recs_copy = df_recs_copy.sort_values("user_id")
df_recs_copy.reset_index(drop=True, inplace=True)

new = df_recs_copy[["user_id", "video_id", "ratings"]]
new["recommendations"] = list(zip(new.video_id, new.ratings))

res = new[["user_id", "recommendations"]]
res_new = res["recommendations"].groupby([res.user_id]).apply(list).reset_index()

print(res_new)

25/05/18 00:20:32 WARN TaskSetManager: Stage 4411 contains a task of very large size (20834 KiB). The maximum recommended task size is 1000 KiB.


      user_id                                    recommendations
0          14  [(1305, 4.730415344238281), (7383, 5.369937419...
1          19  [(314, 4.339854717254639), (5525, 4.0973510742...
2          21  [(9178, 4.689492225646973), (2130, 5.547003746...
3          23  [(314, 4.204347610473633), (8298, 5.2634778022...
4          24  [(5811, 3.389115333557129), (8298, 4.427169799...
...       ...                                                ...
1406     7142  [(4040, 4.0484418869018555), (4123, 4.08339023...
1407     7147  [(8298, 4.950584411621094), (154, 5.0683169364...
1408     7153  [(5464, 3.399148464202881), (1305, 4.011781215...
1409     7159  [(314, 3.6427805423736572), (9178, 4.795408248...
1410     7162  [(600, 5.654923915863037), (7383, 5.6269860267...

[1411 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new["recommendations"] = list(zip(new.video_id, new.ratings))


## Evaluation

In [209]:
def evaluate_topk_metrics(y_true, top_k_preds, k=10):
    top_k = top_k_preds[:k]
    relevant = set(y_true)
    hits = [1 if item in relevant else 0 for item in top_k]

    precision = sum(hits) / k
    recall = sum(hits) / len(relevant) if relevant else 0.0
    dcg = sum(hit / np.log2(i + 2) for i, hit in enumerate(hits))
    ideal_hits = [1] * min(len(relevant), k)
    idcg = sum(1 / np.log2(i + 2) for i in range(len(ideal_hits)))
    ndcg = dcg / idcg if idcg != 0 else 0.0

    # MAP@k: mean average precision
    ap_sum = 0.0
    hit_count = 0
    for i, hit in enumerate(hits):
        if hit:
            hit_count += 1
            ap_sum += hit_count / (i + 1)
    map_k = ap_sum / min(len(relevant), k) if relevant else 0.0

    return precision, recall, ndcg, map_k


In [210]:
#test_interactions_df = pd.read_csv("data_final_project/KuaiRec 2.0/data/small_matrix.csv")
#test_interactions_df["is_like"] = interactions_df['watch_ratio'].apply(lambda x: 1 if x >= 2 else 0)

In [211]:
user_recs_dict = dict(zip(res_new["user_id"], res_new["recommendations"]))

k_values = [1, 5, 10, 20]
results = []

for k in k_values:
    all_precisions, all_recalls, all_ndcgs, all_maps = [], [], [], []
    for user_id, recs in user_recs_dict.items():
        video_ids = [video_id for video_id, _ in recs][:k]
        y_true = interactions_df[(interactions_df["user_id"] == user_id) & (interactions_df["is_like"] == 1)]["video_id"].tolist()
        if not y_true or not video_ids:
            continue
        precision, recall, ndcg, map_k = evaluate_topk_metrics(y_true, video_ids, k)
        all_precisions.append(precision)
        all_recalls.append(recall)
        all_ndcgs.append(ndcg)
        all_maps.append(map_k)
    results.append({
        "k": k,
        "precision": np.mean(all_precisions),
        "recall": np.mean(all_recalls),
        "ndcg": np.mean(all_ndcgs),
        "map": np.mean(all_maps)
    })

for res in results:
    print(f"Results for k={res['k']}:")
    print(f"  Mean Precision@{res['k']}: {res['precision']:.4f}")
    print(f"  Mean Recall@{res['k']}: {res['recall']:.4f}")
    print(f"  Mean NDCG@{res['k']}: {res['ndcg']:.4f}")
    print(f"  Mean MAP@{res['k']}: {res['map']:.4f}\n")

Results for k=1:
  Mean Precision@1: 0.7385
  Mean Recall@1: 0.0125
  Mean NDCG@1: 0.7385
  Mean MAP@1: 0.7385

Results for k=5:
  Mean Precision@5: 0.7184
  Mean Recall@5: 0.0606
  Mean NDCG@5: 0.7224
  Mean MAP@5: 0.6216

Results for k=10:
  Mean Precision@10: 0.7174
  Mean Recall@10: 0.1214
  Mean NDCG@10: 0.7205
  Mean MAP@10: 0.5892

Results for k=20:
  Mean Precision@20: 0.3587
  Mean Recall@20: 0.1214
  Mean NDCG@20: 0.4699
  Mean MAP@20: 0.2995



## Conclusion

The ALS collaborative filtering recommender system was evaluated using Precision@K, Recall@K, NDCG@K, and MAP@K metrics for various values of K. The results show that the model is able to recommend relevant items to users with reasonable effectiveness, as indicated by the improvement in evaluation metrics compared to earlier iterations. 

While the current approach demonstrates a significant increase in performance (e.g., Precision@10 improved from 0.06 to 0.24), there is still room for further optimization. Potential improvements include additional hyperparameter tuning, experimenting with implicit feedback, filtering out users/items with very few interactions, and refining the evaluation methodology (such as using a temporal split or negative sampling).

Overall, the ALS-based collaborative filtering model provides a solid foundation for personalized recommendations and can be further enhanced with more advanced techniques and data preprocessing strategies.