## Exercise 11: Compute Pearson Similarity Between Books (Item–Item Collaborative Filtering）

In [1]:
# Import Spark modules and load datasets

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("kcore-stage").getOrCreate()

ratings_clean = spark.read.parquet("export_core/pdf_core_ratings.parquet")
books_clean = spark.read.parquet("export_core/pdf_core_books.parquet")
users_active = spark.read.parquet("export_core/pdf_users_active.parquet")

In [5]:
# Prepare book average rating & item-centered rating

ratings_core = ratings_clean.select("user_id", "isbn", "rating") \
    .filter(F.col("rating") > 0)

# book average rating
item_means = ratings_core.groupBy("isbn") \
    .agg(F.avg("rating").alias("mean_rating_item"))

# item-centered rating
ratings_centered = ratings_core.join(item_means, on="isbn", how="inner") \
    .withColumn("rating_centered", F.col("rating") - F.col("mean_rating_item"))

ratings_centered.show(5, truncate=False)

+---------+-------+------+------------------+-------------------+
|isbn     |user_id|rating|mean_rating_item  |rating_centered    |
+---------+-------+------+------------------+-------------------+
|385504209|100009 |8.0   |8.621513944223107 |-0.621513944223107 |
|60502258 |100009 |6.0   |7.7444444444444445|-1.7444444444444445|
|786868716|100115 |10.0  |8.125             |1.875              |
|316789089|100223 |9.0   |7.888888888888889 |1.1111111111111107 |
|156006529|100459 |9.0   |7.714285714285714 |1.2857142857142856 |
+---------+-------+------+------------------+-------------------+
only showing top 5 rows



In [2]:
# Select a book with a "medium rating" as target_isbn

ratings_core = ratings_clean.select("user_id", "isbn", "rating") \
    .filter(F.col("rating") > 0)

book_counts = ratings_core.groupBy("isbn") \
    .agg(F.count("*").alias("num_ratings"))

w_book = Window.orderBy("num_ratings")

books_ranked = book_counts.withColumn(
    "row_num",
    F.row_number().over(w_book)
)

books_ranked.orderBy("row_num").show(10, truncate=False)

total_books = books_ranked.agg(F.max("row_num")).first()[0]
mid_index_book = total_books // 2 + 1

print("total_books:", total_books, "mid_index_book:", mid_index_book)

target_book_row = books_ranked.filter(F.col("row_num") == mid_index_book).first()
target_isbn = target_book_row["isbn"]
target_book_num_ratings = target_book_row["num_ratings"]

print("Selected target_isbn:", target_isbn, "with num_ratings:", target_book_num_ratings)

books_clean.filter(F.col("isbn") == target_isbn) \
    .select("isbn", "book_title", "book_author") \
    .show(truncate=False)


+---------+-----------+-------+
|isbn     |num_ratings|row_num|
+---------+-----------+-------+
|307132668|1          |1      |
|590431250|1          |2      |
|61052515 |1          |3      |
|373711115|1          |4      |
|440117291|1          |5      |
|373196725|1          |6      |
|446602086|1          |7      |
|034531011|1          |8      |
|451628276|1          |9      |
|140253580|1          |10     |
+---------+-----------+-------+
only showing top 10 rows

total_books: 1133 mid_index_book: 567
Selected target_isbn: 380754843 with num_ratings: 9
+---------+------------------------------+------------+
|isbn     |book_title                    |book_author |
+---------+------------------------------+------------+
|380754843|Wayside School is Falling Down|Louis Sachar|
+---------+------------------------------+------------+



In [3]:
print(target_isbn)

380754843


In [None]:
# Construct pairs (u, v) of books of each user.

# Give ratings_centered two aliases, i and j. 
i = ratings_centered.select(
    F.col("isbn").alias("isbn_i"),
    F.col("user_id"),
    F.col("rating_centered").alias("ri")
)

j = ratings_centered.select(
    F.col("isbn").alias("isbn_j"),
    F.col("user_id"),
    F.col("rating_centered").alias("rj")
)

# Self-join on the same user so that we get all combinations of (isbn_i, isbn_j, user_id).
item_pairs_on_same_user = i.join(
    j,
    on="user_id",
    how="inner"
).filter(F.col("isbn_i") < F.col("isbn_j"))  

item_pairs_on_same_user.show(10, truncate=False)


+-------+---------+------------------+---------+-------------------+
|user_id|isbn_i   |ri                |isbn_j   |rj                 |
+-------+---------+------------------+---------+-------------------+
|100009 |385504209|-0.621513944223107|60392452 |-0.2151898734177209|
|100009 |385504209|-0.621513944223107|60502258 |-1.7444444444444445|
|100223 |316789089|1.1111111111111107|451181379|0.22727272727272751|
|100223 |316789089|1.1111111111111107|385497466|-1.3125            |
|100459 |156006529|1.2857142857142856|671021001|0.20869565217391273|
|100459 |156006529|1.2857142857142856|316899984|1.1304347826086953 |
|100459 |156006529|1.2857142857142856|671038184|0.35416666666666696|
|100459 |156006529|1.2857142857142856|385720106|0.8644067796610173 |
|100459 |156006529|1.2857142857142856|385497466|0.6875             |
|100459 |156006529|1.2857142857142856|804114986|1.7594936708860764 |
+-------+---------+------------------+---------+-------------------+
only showing top 10 rows



In [7]:
# Calculate Pearson

item_pair_stats = item_pairs_on_same_user.groupBy("isbn_i", "isbn_j").agg(
    F.count("*").alias("n_common_users"),
    F.sum(F.col("ri") * F.col("rj")).alias("sum_ri_rj"),
    F.sum(F.col("ri") ** 2).alias("sum_ri2"),
    F.sum(F.col("rj") ** 2).alias("sum_rj2")
)

item_pair_stats.show(10, truncate=False)

item_pair_sims = item_pair_stats.withColumn(
    "pearson",
    F.when(
        (F.col("sum_ri2") > 0) &
        (F.col("sum_rj2") > 0) &
        (F.col("n_common_users") >= 2),  
        F.col("sum_ri_rj") / (F.sqrt(F.col("sum_ri2")) * F.sqrt(F.col("sum_rj2")))
    ).otherwise(F.lit(0.0))
)

item_pair_sims.orderBy(F.col("pearson").desc()).show(10, truncate=False)


+---------+---------+--------------+-------------------+-------------------+-------------------+
|isbn_i   |isbn_j   |n_common_users|sum_ri_rj          |sum_ri2            |sum_rj2            |
+---------+---------+--------------+-------------------+-------------------+-------------------+
|440403278|671888587|1             |2.59090909090909   |2.983471074380163  |2.25               |
|014028009|440226430|11            |6.040543321671126  |55.10501997024043  |33.798810167658196 |
|446612790|671708635|1             |1.2254802831142573 |1.379259058950785  |1.0888468809073728 |
|452282152|743474325|3             |-1.0687830687830688|6.0278189300411515 |0.8367346938775506 |
|067976402|394557433|2             |1.324803149606299  |5.942401884803768  |0.625              |
|038076654|316693006|1             |8.455882352941178  |6.25               |11.440311418685123 |
|373484224|440225701|2             |6.679230769230769  |12.857988165680474 |6.381799999999998  |
|000649840|671789422|1        

In [8]:
# Do a symmetrical expansion: each pair becomes two records.
#  isbn = i, neighbor = j
#  isbn = j, neighbor = i
item_neighbor_sims = item_pair_sims.select(
    F.col("isbn_i").alias("isbn"),
    F.col("isbn_j").alias("neighbor_isbn"),
    F.col("pearson").alias("similarity"),
    F.col("n_common_users")
).unionByName(
    item_pair_sims.select(
        F.col("isbn_j").alias("isbn"),
        F.col("isbn_i").alias("neighbor_isbn"),
        F.col("pearson").alias("similarity"),
        F.col("n_common_users")
    )
)

item_neighbor_sims.show(10, truncate=False)



+---------+-------------+-------------------+--------------+
|isbn     |neighbor_isbn|similarity         |n_common_users|
+---------+-------------+-------------------+--------------+
|440403278|671888587    |0.0                |1             |
|014028009|440226430    |0.13996830885513503|11            |
|446612790|671708635    |0.0                |1             |
|452282152|743474325    |-0.4758998609503549|3             |
|067976402|394557433    |0.6874329317705201 |2             |
|038076654|316693006    |0.0                |1             |
|373484224|440225701    |0.737340931105187  |2             |
|000649840|671789422    |0.0                |1             |
|000649840|055356451    |0.0                |1             |
|006092988|051512768    |0.0                |1             |
+---------+-------------+-------------------+--------------+
only showing top 10 rows



In [None]:
# For each book, find the top-k neighbors.
k_items = 20  

w_items = Window.partitionBy("isbn").orderBy(F.col("similarity").desc())

item_neighbors_topk = item_neighbor_sims.withColumn(
    "rank",
    F.row_number().over(w_items)
).filter(
    (F.col("rank") <= k_items) & (F.col("similarity") > 0)
)

item_neighbors_topk.orderBy("isbn", "rank").show(50, truncate=False)

# similarity vector for the target book
print()
print("=============== similarity vector for the target book =============")
book_similarity_vector = item_neighbors_topk.filter(
    F.col("isbn") == target_isbn
).orderBy(F.col("similarity").desc())

book_similarity_vector.show(20, truncate=False)
print("Number of similar books for target:", book_similarity_vector.count())


+---------+-------------+-------------------+--------------+----+
|isbn     |neighbor_isbn|similarity         |n_common_users|rank|
+---------+-------------+-------------------+--------------+----+
|000649840|345404769    |1.0                |2             |1   |
|000649840|330267388    |1.0                |2             |2   |
|000649840|038542017    |0.9986270896324835 |2             |3   |
|000649840|394820371    |0.9203309184584746 |2             |4   |
|000649840|804114986    |0.9083384250620382 |2             |5   |
|000649840|142000205    |0.8560740127031655 |2             |6   |
|000649840|552996009    |0.7803587331579733 |2             |7   |
|000649840|671027360    |0.7801293965902516 |5             |8   |
|000649840|452264464    |0.7645223227757788 |2             |9   |
|000649840|743418174    |0.6556823785662133 |3             |10  |
|000649840|971880107    |0.6474763348480055 |4             |11  |
|000649840|014028009    |0.6337705270626014 |3             |12  |
|000649840

## Exercise 12: Compute Pearson Similarity Between Books 

In [13]:
for k in [2, 4]:
    print(f"\nTop {k} most similar books to {target_isbn}:\n")

    topk_similar = book_similarity_vector.limit(k)

    item_recs_with_meta = topk_similar.join(
        books_clean,
        topk_similar.neighbor_isbn == books_clean.isbn,
        how="left"
    ).select(
        F.col("neighbor_isbn").alias("similar_isbn"),
        "book_title",
        "book_author",
        "similarity",
        "n_common_users"
    )

    item_recs_with_meta.show(truncate=False)



Top 2 most similar books to 380754843:

+------------+--------------------------------------------+---------------+------------------+--------------+
|similar_isbn|book_title                                  |book_author    |similarity        |n_common_users|
+------------+--------------------------------------------+---------------+------------------+--------------+
|439139597   |Harry Potter and the Goblet of Fire (Book 4)|J. K. Rowling  |0.9981674001450533|2             |
|312278586   |The Nanny Diaries: A Novel                  |Emma McLaughlin|0.7547262236781133|2             |
+------------+--------------------------------------------+---------------+------------------+--------------+


Top 4 most similar books to 380754843:

+------------+-------------------------------------------------------+---------------+-------------------+--------------+
|similar_isbn|book_title                                             |book_author    |similarity         |n_common_users|
+------------

For our chosen book, we only find 4 neighbors that have enough common raters and positive Pearson correlation. This illustrates the sparsity of the dataset: many books do not share enough users to compute a reliable similarity.

## Final Deliverables
### user-based CF

- item matrix

Similarly, the item–item similarity matrix is stored in sparse form as `isbn`, `neighbor_isbn`, `similarity`, `rank`, keeping only the top-k most similar books per item

- recommendation lists generated for item-item CF

Using item–item collaborative filtering, we generate Top-N similar books for the target book `target_isbn`

In [14]:
print("============================== item matrix ==============================")
# Item similarity matrix (top-k similar items per book)
item_matrix = item_neighbors_topk

item_matrix.show(20, truncate=False)
print("item_matrix rows:", item_matrix.count())

print()
print("============================== recommendation lists generated for item-item CF ==============================")
# Item–item CF recommendation list (books similar to target_isbn)
item_based_recs = item_recs_with_meta

item_based_recs.show(truncate=False)
print("Number of item-based recommendations:", item_based_recs.count())


+---------+-------------+-------------------+--------------+----+
|isbn     |neighbor_isbn|similarity         |n_common_users|rank|
+---------+-------------+-------------------+--------------+----+
|000649840|345404769    |1.0                |2             |1   |
|000649840|330267388    |1.0                |2             |2   |
|000649840|038542017    |0.9986270896324835 |2             |3   |
|000649840|394820371    |0.9203309184584746 |2             |4   |
|000649840|804114986    |0.9083384250620382 |2             |5   |
|000649840|142000205    |0.8560740127031655 |2             |6   |
|000649840|552996009    |0.7803587331579733 |2             |7   |
|000649840|671027360    |0.7801293965902516 |5             |8   |
|000649840|452264464    |0.7645223227757788 |2             |9   |
|000649840|743418174    |0.6556823785662133 |3             |10  |
|000649840|971880107    |0.6474763348480055 |4             |11  |
|000649840|014028009    |0.6337705270626014 |3             |12  |
|000649840