In [16]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

In [17]:
user_review_df = pd.read_csv(r"data_for_db\user_reviews.csv")

In [19]:
fav_plus = 3
def add_fav_score(row):
    if row["is_favourite"] == True:
        if pd.isna(row["book_rating"]):
            return 10
        else:
            return row["book_rating"] + fav_plus
    else:
        return row["book_rating"]
user_review_df["book_rating_plus"] = user_review_df.apply(add_fav_score, axis=1)

In [22]:
user_review_df.dropna(subset="book_rating_plus", inplace=True)
user_review_df.dropna(subset="book_rating", inplace=True)

In [44]:
popular_books = user_review_df.groupby('book_id')['book_rating'].count().sort_values(ascending=False).head(20)
print("Top 10 Popular Books")
display(popular_books)

Top 10 Popular Books


book_id
38729     707
216737    581
71056     487
32497     383
7378      320
125415    313
21423     307
94175     295
87704     281
104344    278
38642     272
133573    269
145506    256
38892     243
170437    242
158581    236
43377     230
62376     229
33527     226
23216     226
Name: book_rating, dtype: int64

In [43]:
# 1. Calculate count and mean for each book
stats = user_review_df.groupby('book_id')['book_rating'].agg(['count', 'mean'])

# 2. Filter out books with very few ratings (e.g., must have > 50 ratings)
# This ensures statistical significance.
best_rated = stats[stats['count'] > 50].sort_values(by='mean', ascending=False).head(20)

print("Top 10 Highest Rated (with at least 50 reviews):")
display(best_rated)

Top 10 Highest Rated (with at least 50 reviews):


Unnamed: 0_level_0,count,mean
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
43428,77,9.402597
84163,137,9.262774
84770,53,9.207547
43427,83,9.120482
84143,133,9.082707
11269,68,9.073529
84142,141,9.035461
84767,206,9.033981
125414,119,8.983193
23448,51,8.980392


In [45]:
# Standard deviation measures the 'spread' of the ratings
controversial = user_review_df.groupby('book_id')['book_rating'].std().sort_values(ascending=False).head(10)

print("Top 10 Most Controversial Books (Highest Rating Variance):")
print(controversial)

Top 10 Most Controversial Books (Highest Rating Variance):
book_id
72676     6.363961
113237    6.363961
217673    6.363961
129795    6.363961
187975    6.363961
229174    6.363961
119894    6.363961
137133    6.363961
124559    6.363961
252516    6.363961
Name: book_rating, dtype: float64


In [46]:
# 1. Calculate basic stats per book
stats = user_review_df.groupby('book_id')['book_rating'].agg(['count', 'mean'])

# 2. Calculate the global parameters
C = stats['mean'].mean()  # The average rating across all books
m = stats['count'].quantile(0.75)  # Threshold: Book must have more reviews than 75% of others

# 3. Define the Bayesian function
def bayesian_rating(row, m, C):
    v = row['count']
    R = row['mean']
    return (v / (v + m) * R) + (m / (v + m) * C)

# 4. Apply the formula
stats['weighted_score'] = stats.apply(bayesian_rating, axis=1, args=(m, C))

# 5. Sort by the new fair score
fair_top_10 = stats.sort_values('weighted_score', ascending=False).head(10)

In [47]:
fair_top_10

Unnamed: 0_level_0,count,mean,weighted_score
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
84909,23,9.869565,9.682203
243809,11,10.0,9.619622
189908,13,9.923077,9.603672
130260,25,9.72,9.557596
5108,20,9.75,9.547958
74985,8,10.0,9.505509
74968,14,9.785714,9.503443
205802,7,10.0,9.450565
230173,7,10.0,9.450565
190022,24,9.583333,9.425196
