## Exercise 9: Compute Pearson Similarity Between Users

We first self-join the centered ratings on isbn to obtain all pairs of users who rated at least one book in common. Then, for each user pair (u, v), we aggregate the necessary sums to compute the Pearson correlation based on their centered ratings. Finally, we symmetrize the matrix and, for each user u, we rank all other users by similarity and keep the top-k (k=10) neighbors.

In [2]:
# Import Spark modules and load datasets

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("kcore-stage").getOrCreate()

ratings_clean = spark.read.parquet("export_core/pdf_core_ratings.parquet")
books_clean = spark.read.parquet("export_core/pdf_core_books.parquet")
users_active = spark.read.parquet("export_core/pdf_users_active.parquet")

In [3]:
# Prepare user average rating & user-centered rating

ratings_core = ratings_clean.select("user_id", "isbn", "rating") \
    .filter(F.col("rating") > 0)

# average rating of each user
user_means = ratings_core.groupBy("user_id") \
    .agg(F.avg("rating").alias("mean_rating"))

# user-centered rating 
ratings_centered = ratings_core.join(user_means, on="user_id", how="inner") \
    .withColumn("rating_centered", F.col("rating") - F.col("mean_rating"))

ratings_centered.show(5, truncate=False)



+-------+---------+------+-----------------+------------------+
|user_id|isbn     |rating|mean_rating      |rating_centered   |
+-------+---------+------+-----------------+------------------+
|100009 |385504209|8.0   |7.333333333333333|0.666666666666667 |
|100009 |60502258 |6.0   |7.333333333333333|-1.333333333333333|
|100115 |786868716|10.0  |9.333333333333334|0.6666666666666661|
|100223 |316789089|9.0   |7.666666666666667|1.333333333333333 |
|100459 |156006529|9.0   |8.291666666666666|0.7083333333333339|
+-------+---------+------+-----------------+------------------+
only showing top 5 rows



In [4]:
# Construct pairs (u, v) of all users who share the same book.

# Give ratings_centered two aliases, a and b.
a = ratings_centered.select(
    F.col("user_id").alias("user_a"),
    F.col("isbn"),
    F.col("rating_centered").alias("ra")
)

b = ratings_centered.select(
    F.col("user_id").alias("user_b"),
    F.col("isbn"),
    F.col("rating_centered").alias("rb")
)

# Self-join on the same book so that we get all combinations of (user_a, user_b, isbn).
user_pairs_on_same_book = a.join(
    b,
    on="isbn",
    how="inner"
).filter(F.col("user_a") < F.col("user_b"))  # Only keep user_a < user_b to avoid duplication and self-pairing.

user_pairs_on_same_book.show(10, truncate=False)


+---------+------+-----------------+------+-------------------+
|isbn     |user_a|ra               |user_b|rb                 |
+---------+------+-----------------+------+-------------------+
|385504209|100009|0.666666666666667|96744 |0.0                |
|385504209|100009|0.666666666666667|94263 |-0.8000000000000007|
|385504209|100009|0.666666666666667|89551 |0.0                |
|385504209|100009|0.666666666666667|85502 |0.6666666666666661 |
|385504209|100009|0.666666666666667|82893 |0.25               |
|385504209|100009|0.666666666666667|81318 |2.7777777777777777 |
|385504209|100009|0.666666666666667|75819 |2.352941176470588  |
|385504209|100009|0.666666666666667|73681 |-1.1999999999999993|
|385504209|100009|0.666666666666667|70594 |0.75               |
|385504209|100009|0.666666666666667|68383 |-2.25              |
+---------+------+-----------------+------+-------------------+
only showing top 10 rows



In [5]:
# Calculate Pearson

user_pair_stats = user_pairs_on_same_book.groupBy("user_a", "user_b").agg(
    F.count("*").alias("n_common"),
    F.sum(F.col("ra") * F.col("rb")).alias("sum_ra_rb"),
    F.sum(F.col("ra") ** 2).alias("sum_ra2"),
    F.sum(F.col("rb") ** 2).alias("sum_rb2")
)

user_pair_stats.show(10, truncate=False)

user_pair_sims = user_pair_stats.withColumn(
    "pearson",
    F.when(
        (F.col("sum_ra2") > 0) & (F.col("sum_rb2") > 0) & (F.col("n_common") >= 2),
        F.col("sum_ra_rb") / (F.sqrt(F.col("sum_ra2")) * F.sqrt(F.col("sum_rb2")))
    ).otherwise(F.lit(0.0))
)

user_pair_sims.orderBy(F.col("pearson").desc()).show(10, truncate=False)


+------+------+--------+--------------------+-------------------+-------------------+
|user_a|user_b|n_common|sum_ra_rb           |sum_ra2            |sum_rb2            |
+------+------+--------+--------------------+-------------------+-------------------+
|100009|124946|1       |0.44444444444444486 |0.44444444444444486|0.44444444444444486|
|100115|6563  |1       |1.7499999999999984  |0.44444444444444364|6.890625           |
|100115|128325|1       |0.6666666666666661  |0.44444444444444364|1.0                |
|100115|25996 |1       |-0.7777777777777772 |0.44444444444444364|1.3611111111111118 |
|100459|232945|1       |-0.25925925925925886|0.0850694444444441 |0.7901234567901242 |
|100906|183061|3       |-0.6956521739130423 |2.362948960302457  |1.3333333333333308 |
|101304|72190 |1       |0.33333333333333304 |0.44444444444444364|0.25               |
|101620|225763|1       |-0.45454545454545414|6.25               |0.03305785123966936|
|101620|110912|1       |-2.17741935483871   |6.25     

In [6]:
# Do a symmetrical expansion: each pair becomes two records.
#  user = a, neighbor = b
#  user = b, neighbor = a

user_neighbor_sims = user_pair_sims.select(
    F.col("user_a").alias("user"),
    F.col("user_b").alias("neighbor"),
    F.col("pearson").alias("similarity"),
    F.col("n_common")
).unionByName(
    user_pair_sims.select(
        F.col("user_b").alias("user"),
        F.col("user_a").alias("neighbor"),
        F.col("pearson").alias("similarity"),
        F.col("n_common")
    )
)

user_neighbor_sims.show(10, truncate=False)


+------+--------+-------------------+--------+
|user  |neighbor|similarity         |n_common|
+------+--------+-------------------+--------+
|100009|124946  |0.0                |1       |
|100115|6563    |0.0                |1       |
|100115|128325  |0.0                |1       |
|100115|25996   |0.0                |1       |
|100459|232945  |0.0                |1       |
|100906|183061  |-0.3919183588453083|3       |
|101304|72190   |0.0                |1       |
|101620|225763  |0.0                |1       |
|101620|110912  |0.0                |1       |
|101851|35148   |1.0                |3       |
+------+--------+-------------------+--------+
only showing top 10 rows



In [None]:
# For each user, find the top-k neighbors.
k_users = 20

w = Window.partitionBy("user").orderBy(F.col("similarity").desc())

user_neighbors_topk = user_neighbor_sims.withColumn(
    "rank",
    F.row_number().over(w)
).filter(
    (F.col("rank") <= k_users) & (F.col("similarity") > 0)  
)

user_neighbors_topk.orderBy("user", "rank").show(50, truncate=False)


+------+--------+--------------------+--------+----+
|user  |neighbor|similarity          |n_common|rank|
+------+--------+--------------------+--------+----+
|100009|86202   |1.0                 |2       |1   |
|100009|114868  |1.0                 |2       |2   |
|100009|157273  |0.9999999999999999  |2       |3   |
|100009|35704   |0.9284766908852593  |2       |4   |
|100009|240370  |0.7893522173763265  |2       |5   |
|100009|68984   |0.6139406135149202  |2       |6   |
|100009|2179    |0.5144957554275273  |2       |7   |
|100009|142524  |0.42808634473904406 |2       |8   |
|100009|60244   |0.31622776601683766 |2       |9   |
|100009|11676   |0.26045657081325496 |2       |10  |
|100053|78834   |0.9899494936611665  |2       |1   |
|100053|638     |0.9742291741487187  |2       |2   |
|100053|104113  |0.9037378388935385  |2       |3   |
|100053|163488  |0.8875844239103626  |2       |4   |
|100053|244286  |0.8320502943378437  |2       |5   |
|100053|273979  |0.5837264794774722  |2       

## Exercise 10: Predict ratings for books the target user has not read

In [None]:
# Select a user with a "medium rating" as target_user

w = Window.orderBy("num_ratings")

users_ranked = users_active.withColumn(
    "row_num",
    F.row_number().over(w)
)

users_ranked.orderBy("row_num").show(10, truncate=False)

total_users = users_ranked.agg(F.max("row_num")).first()[0]
mid_index = total_users // 2 + 1   

print("total_users:", total_users, "mid_index:", mid_index)

target_row = users_ranked.filter(F.col("row_num") == mid_index).first()
target_user = target_row["user_id"]
target_num_ratings = target_row["num_ratings"]

print("Selected target_user:", target_user, "with num_ratings:", target_num_ratings)

+-------+-----------+-------+
|user_id|num_ratings|row_num|
+-------+-----------+-------+
|246216 |5          |1      |
|267354 |5          |2      |
|131182 |5          |3      |
|217173 |5          |4      |
|853    |5          |5      |
|11687  |5          |6      |
|136916 |5          |7      |
|205554 |5          |8      |
|30324  |5          |9      |
|212853 |5          |10     |
+-------+-----------+-------+
only showing top 10 rows

total_users: 3776 mid_index: 1889
Selected target_user: 25533 with num_ratings: 9


In [18]:
print(target_user)

25533


In [13]:
# Extract the target user's neighbors from the global similarity matrix.
neighbors_for_u = user_neighbors_topk.filter(F.col("user") == target_user) \
    .select(
        F.col("neighbor"),
        F.col("similarity")
    )

neighbors_for_u.show(10, truncate=False)

rated_by_target = ratings_core.filter(F.col("user_id") == target_user) \
    .select("isbn").distinct()

rated_by_target.show(10, truncate=False)

# neighbors' centered ratings 
neighbor_ratings = ratings_centered.alias("r").join(
    neighbors_for_u.alias("n"),
    F.col("r.user_id") == F.col("n.neighbor"),
    how="inner"
)

neighbor_ratings.select(
    "r.user_id", "r.isbn", "r.rating", "r.rating_centered", "n.similarity"
).show(10, truncate=False)

# books already read by the target users
rated_by_target = ratings_core.filter(F.col("user_id") == target_user) \
    .select("isbn").distinct()

rated_by_target.show(10, truncate=False)

# Only keep those books that neighbors have rated, but the target_user hasn't read.
candidate_ratings = neighbor_ratings.join(
    rated_by_target.withColumnRenamed("isbn", "rated_isbn"),
    F.col("r.isbn") == F.col("rated_isbn"),
    how="left_anti"
)

candidate_ratings.select(
    "r.user_id", "r.isbn", "r.rating_centered", "n.similarity"
).show(10, truncate=False)


+--------+------------------+
|neighbor|similarity        |
+--------+------------------+
|30711   |0.9778024140774094|
|92853   |0.9383431168171101|
|197364  |0.7071067811865476|
|126296  |0.4270063054078286|
+--------+------------------+

+---------+
|isbn     |
+---------+
|014028009|
|99771519 |
|441569595|
|671617028|
+---------+

+-------+----------+------+-------------------+------------------+
|user_id|isbn      |rating|rating_centered    |similarity        |
+-------+----------+------+-------------------+------------------+
|30711  |60502258  |8.0   |0.20000000000000018|0.9778024140774094|
|30711  |380977788 |8.0   |0.20000000000000018|0.9778024140774094|
|30711  |1558743669|8.0   |0.20000000000000018|0.9778024140774094|
|30711  |014028009 |6.0   |-1.7999999999999998|0.9778024140774094|
|30711  |446605239 |6.0   |-1.7999999999999998|0.9778024140774094|
|30711  |043935806 |8.0   |0.20000000000000018|0.9778024140774094|
|30711  |743225082 |9.0   |1.2000000000000002 |0.9778024140

In [15]:
# Predict the score for each candidate book using the weighted Pearson formula.

# average rating of target_user
tu_means = user_means.filter(F.col("user_id") == target_user) \
    .select("mean_rating").first()[0]

print("Mean rating of target user:", tu_means)

# for each candidate book：
# numerator = Σ sim(u,v) * rating_centered(v,j)
# denominator = Σ |sim(u,v)|
pred_stats = candidate_ratings.groupBy("r.isbn").agg(
    F.sum(F.col("n.similarity") * F.col("r.rating_centered")).alias("num"),
    F.sum(F.abs(F.col("n.similarity"))).alias("den")
).withColumnRenamed("r.isbn", "isbn")

predictions = pred_stats.withColumn(
    "pred_centered",
    F.when(F.col("den") > 0, F.col("num") / F.col("den")).otherwise(F.lit(0.0))
).withColumn(
    "pred_rating_raw",
    F.col("pred_centered") + F.lit(tu_means)
).withColumn(
    "pred_rating",
    F.when(F.col("pred_rating_raw") < 0, F.lit(0.0))
     .when(F.col("pred_rating_raw") > 10, F.lit(10.0))
     .otherwise(F.col("pred_rating_raw"))
) # clip the final predicted scores to the interval [0,10] to remain consistent with the cleaned rating range

predictions.orderBy(F.col("pred_rating").desc()).show(10, truncate=False)

# Generate a list of the 10 books with the highest predicted scores.
topN = 10  

user_recs = predictions.orderBy(F.col("pred_rating").desc()).limit(topN)

user_recs_with_meta = user_recs.join(
    books_clean,
    on="isbn",
    how="left"
).select(
    "isbn",
    "book_title",
    "book_author",
    "pred_rating"
)

user_recs_with_meta.show(truncate=False)


Mean rating of target user: 8.5
+---------+------------------+------------------+-------------------+-----------------+-----------------+
|isbn     |num               |den               |pred_centered      |pred_rating_raw  |pred_rating      |
+---------+------------------+------------------+-------------------+-----------------+-----------------+
|449212602|1.4142135623730951|0.7071067811865476|2.0                |10.5             |10.0             |
|015600710|1.1733628968928915|0.9778024140774094|1.2000000000000002 |9.7              |9.7              |
|743225082|1.1733628968928915|0.9778024140774094|1.2000000000000002 |9.7              |9.7              |
|618002227|1.1733628968928915|0.9778024140774094|1.2000000000000002 |9.7              |9.7              |
|345313860|0.7071067811865476|0.7071067811865476|1.0                |9.5              |9.5              |
|316666343|0.3355049542490085|0.4270063054078286|0.7857142857142865 |9.285714285714286|9.285714285714286|
|804111359|0.3

## Final Deliverables

- user–item rating matrix

We represent the user–item rating matrix in sparse form as the interaction dataframe ratings_clean (`user_id`, `isbn`, `rating`) and do not explicitly materialize the full dense matrix for computation. For illustration, we also show a small pivoted user–item matrix. User–user and item–item matrices are stored in sparse form as user_neighbors_topk and item_neighbors_topk.

### user-based CF

- user matrix

We represent the user–user similarity matrix in sparse form as `user`, `neighbor`, `similarity`, `rank`, keeping only the top-k neighbors per user based on Pearson correlation.

- recommendation lists generated for user-user CF

Using user–user collaborative filtering, we generate the following Top-N recommendations for the target user `target_user`

In [17]:
print("============================== user-item rating matrix ==============================")
small_ratings = ratings_clean.select("user_id", "isbn", "rating") \
    .filter(F.col("rating") > 0) \
    .limit(1000)  

user_item_matrix_demo = small_ratings.groupBy("user_id") \
    .pivot("isbn") \
    .agg(F.first("rating"))

user_item_matrix_demo.show(20, truncate=False)

print()
print("============================== user matrix ==============================")
# user similarity matrix (top-k neighbors per user)
user_matrix = user_neighbors_topk

user_matrix.show(20, truncate=False)
print("user_matrix rows:", user_matrix.count())

print()
print("============================== recommendation lists generated for user-user CF ==============================")
# User–user CF recommendation list (for target_user)
user_based_recs = user_recs_with_meta

user_based_recs.show(truncate=False)
print("Number of user-based recommendations:", user_based_recs.count())

+-------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------+----------+----------+---------+----------+----------+----------+---------+---------+----------+----------+----------+---------+----------+----------+----------+----------+----------+---------+----------+---------+--------+----------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----