# Product RATING Analytics

**What Makes a Customer Buy a Product?** <br>
 ++ Social Proof (The wisdom of crowds => Rating);
- Product scores calculation
- sorting of products
- sorting user comments on product detail pages
- Design of page, process and interaction areas
- feature trials
- testing possible actions and reactions

In [1]:
###################################################
# Rating Products
###################################################
# - Average
# - Time-Based Weighted Average
# - User-Based Weighted Average
# - Weighted Rating

### MiniApp: User and Time Weighted Course Score Calculation

In [None]:
# COURSE-XXX
# Score: 4.8 (4.764925)
# Total Score: 4611
# Score Percentage: 75, 20, 4, 1, <1
# Approximate Numerical Equivalents: 3458, 922, 184, 46, 6

In [2]:
import pandas as pd
import math
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [None]:
df = pd.read_csv("./datasets/course_reviews.csv")
df.head()
df.shape

In [None]:
# rating distribution
df["Rating"].value_counts()

In [None]:
df["Questions Asked"].value_counts()

In [None]:
df.groupby("Questions Asked").agg({"Questions Asked": "count",
                                   "Rating": "mean"})

In [None]:
df.head()

In [None]:
# Average
####################
# Mean Rating Score
df["Rating"].mean()

In [None]:
# Time-Based Weighted Average
####################
df.info() 

In [None]:
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
current_date = pd.to_datetime('2021-02-10 0:0:0')

# How many days have passed since the rating was given?
df["days"] = (current_date - df["Timestamp"]).dt.days

In [None]:
df.loc[df["days"] <= 30, "Rating"].mean()

In [None]:
df.loc[(df["days"] > 30) & (df["days"] <= 90), "Rating"].mean()

In [None]:
df.loc[(df["days"] > 90) & (df["days"] <= 180), "Rating"].mean()

In [None]:
df.loc[(df["days"] > 180), "Rating"].mean()

In [None]:
# Each time interval gets its own weight (note: the weights must sum to 100).
def time_based_weighted_average(dataframe, w1=28, w2=26, w3=24, w4=22):
    """Return the mean rating weighted by how recently each rating was given.

    Rows are split into four buckets by the ``days`` column:
    ``<= 30``, ``(30, 90]``, ``(90, 180]`` and ``> 180``. The mean ``Rating``
    of each bucket is multiplied by its percentage weight and the four
    products are added together.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``days`` and ``Rating``.
    w1, w2, w3, w4 : int or float
        Percentage weights of the four buckets, most recent bucket first.
        They are not validated here; they are expected to sum to 100.

    Returns
    -------
    float
        The weighted average. The result is NaN when any bucket has no rows,
        because the mean of an empty selection is NaN.
    """
    # All masks are derived from the argument itself (never from a
    # notebook-level global), so the function also works on filtered or
    # differently indexed frames.
    days = dataframe["days"]

    newest = dataframe.loc[days <= 30, "Rating"].mean()
    recent = dataframe.loc[(days > 30) & (days <= 90), "Rating"].mean()
    older = dataframe.loc[(days > 90) & (days <= 180), "Rating"].mean()
    oldest = dataframe.loc[days > 180, "Rating"].mean()

    return (newest * w1 / 100
            + recent * w2 / 100
            + older * w3 / 100
            + oldest * w4 / 100)

In [None]:
time_based_weighted_average(df)

In [None]:
time_based_weighted_average(df, 30, 26, 22, 22)

In [None]:
# User-Based / User-Quality (by User Progress) Weighted Average
####################
df.head()

In [None]:
df.groupby("Progress").agg({"Rating": "mean"})

In [None]:
def user_based_weighted_average(dataframe, w1=22, w2=24, w3=26, w4=28):
    """Return the mean rating weighted by how far each user progressed.

    Rows are split into four buckets by the ``Progress`` column:
    ``<= 10``, ``(10, 45]``, ``(45, 75]`` and ``> 75``. The mean ``Rating``
    of each bucket is multiplied by its percentage weight (``w1`` .. ``w4``,
    lowest-progress bucket first) and the products are added together.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``Progress`` and ``Rating``.
    w1, w2, w3, w4 : int or float
        Percentage weights of the four buckets; not validated here.

    Returns
    -------
    float
        The weighted average (NaN if any bucket is empty).
    """
    progress = dataframe["Progress"]
    buckets = (
        (progress <= 10, w1),
        ((progress > 10) & (progress <= 45), w2),
        ((progress > 45) & (progress <= 75), w3),
        (progress > 75, w4),
    )

    total = None
    for mask, weight in buckets:
        contribution = dataframe.loc[mask, "Rating"].mean() * weight / 100
        # Accumulate left to right, starting from the first contribution.
        total = contribution if total is None else total + contribution
    return total

In [None]:
user_based_weighted_average(df, 20, 24, 26, 30)

In [None]:
# Weighted Rating
####################

def course_weighted_rating(dataframe, time_w=50, user_w=50):
    """Blend the time-based and user-based weighted averages into one score.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Passed unchanged to ``time_based_weighted_average`` and
        ``user_based_weighted_average`` (both called with default weights).
    time_w, user_w : int or float
        Percentage weights of the time-based and user-based components.

    Returns
    -------
    float
        ``time_based * time_w / 100 + user_based * user_w / 100``.
    """
    time_component = time_based_weighted_average(dataframe) * time_w / 100
    user_component = user_based_weighted_average(dataframe) * user_w / 100
    return time_component + user_component

In [None]:
course_weighted_rating(df)

In [None]:
course_weighted_rating(df, time_w=40, user_w=60)

# Product SORTING Analytics

#### A keyword search made from the Search section of the website returns its results in a ranked order.

### MiniApp: Course Sorting

In [None]:
import pandas as pd
import math
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [None]:
# Load the product-sorting dataset from the relative "datasets" folder
# (the path needs the "./" prefix; ".datasets" would point at a hidden
# directory with a different name).
df = pd.read_csv("./datasets/product_sorting.csv")
print(df.shape)
df.head(10)

In [None]:
# Sorting by Rating
####################
df.sort_values("rating", ascending=False).head(20)

In [None]:
# Sorting by Comment Count or Purchase Count
####################
df.sort_values("purchase_count", ascending=False).head(20)
df.sort_values("commment_count", ascending=False).head(20)

In [None]:
# Sorting by Rating, Comment and Purchase (Sorting by RCP) => This is better suited to social proof!
####################
df["purchase_count_scaled"] = MinMaxScaler(feature_range=(1, 5)).fit(df[["purchase_count"]]).transform(df[["purchase_count"]])
df.describe().T

In [None]:
df["comment_count_scaled"] = MinMaxScaler(feature_range=(1, 5)).fit(df[["commment_count"]]).transform(df[["commment_count"]])

In [None]:
# General scoring
def weighted_sorting_score(dataframe, w1=32, w2=26, w3=42):
    """Combine scaled comment count, scaled purchase count and rating.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``comment_count_scaled``, ``purchase_count_scaled``
        and ``rating``.
    w1, w2, w3 : int or float
        Percentage weights of the comment, purchase and rating columns,
        in that order; not validated here.

    Returns
    -------
    pandas.Series
        One weighted score per row.
    """
    comment_part = dataframe["comment_count_scaled"] * w1 / 100
    purchase_part = dataframe["purchase_count_scaled"] * w2 / 100
    rating_part = dataframe["rating"] * w3 / 100
    return comment_part + purchase_part + rating_part

In [None]:
df["weighted_sorting_score"] = weighted_sorting_score(df)

In [None]:
df.sort_values("weighted_sorting_score", ascending=False).head(20)

In [None]:
df[df["course_name"].str.contains("Data Science")].sort_values("weighted_sorting_score", ascending=False).head(20)

#### Bayesian Average Rating Score - Sorting by potential using a statistical model

In [None]:
# Other nomenclatures mentioned in the literature:
# Sorting Products with 5 Star Rated
# Sorting Products According to Distribution of 5 Star Rating

In [None]:
# The function below gives me an average rating value using the distribution information of the points (1_point, ....). 
# I'll either sort accordingly or take a better approach to develop a hybrid method.

# n =>  x[["1_point", "2_point","3_point", "4_point", "5_point"]]
def bayesian_average_rating(n, confidence=0.95):
    """Return a conservative (lower-bound) average score from vote counts.

    One pseudo-vote is added to every score level, the mean and the second
    moment of the resulting distribution are computed, and ``z`` standard
    errors are subtracted from the mean.

    Parameters
    ----------
    n : sequence of numbers
        Vote counts per score level, lowest score first: ``n[0]`` is the
        number of 1-point votes, ``n[1]`` the number of 2-point votes, and
        so on. May be a list or a pandas Series with any index labels.
    confidence : float
        Two-sided confidence level used to derive ``z`` from the standard
        normal distribution.

    Returns
    -------
    float
        The score, or ``0`` when there are no votes at all.
    """
    N = sum(n)
    if N == 0:
        return 0
    K = len(n)
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    first_part = 0.0
    second_part = 0.0
    # Use the iterated value instead of ``n[k]``: integer keys on a
    # label-indexed Series (e.g. "1_point" .. "5_point") are not a reliable
    # way to get the k-th element.
    for score, n_k in enumerate(n, start=1):
        first_part += score * (n_k + 1) / (N + K)
        second_part += score * score * (n_k + 1) / (N + K)
    score = first_part - z * math.sqrt((second_part - first_part * first_part) / (N + K + 1))
    return score

In [None]:
df.head()

In [None]:
# Because this score uses only the rating-point distribution, it disregards the other factors.
# That is why the hybrid score matters: it also takes the weighted factors into account.
df["bar_score"] = df.apply(lambda x: bayesian_average_rating(x[["1_point",
                                                                "2_point",
                                                                "3_point",
                                                                "4_point",
                                                                "5_point"]]), axis=1)

In [None]:
df.sort_values("weighted_sorting_score", ascending=False).head(20)

In [None]:
df.sort_values("bar_score", ascending=False).head(20)

In [None]:
# Fetch the rows with index 5 and index 1
df[df["course_name"].index.isin([5, 1])].sort_values("bar_score", ascending=False)

In [None]:
# Hybrid Sorting: BAR Score + Other Factors
####################
def hybrid_sorting_score(dataframe, bar_w=60, wss_w=40):
    """Blend the Bayesian average rating with the weighted sorting score.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``1_point`` .. ``5_point`` plus whatever
        ``weighted_sorting_score`` reads.
    bar_w, wss_w : int or float
        Percentage weights of the Bayesian average rating and of the
        weighted sorting score.

    Returns
    -------
    pandas.Series
        ``bar_score * bar_w / 100 + wss_score * wss_w / 100`` per row.
    """
    point_columns = ["1_point", "2_point", "3_point", "4_point", "5_point"]

    def row_bar_score(row):
        # Score one row from its five star-count columns.
        return bayesian_average_rating(row[point_columns])

    bar_component = dataframe.apply(row_bar_score, axis=1) * bar_w / 100
    wss_component = weighted_sorting_score(dataframe) * wss_w / 100
    return bar_component + wss_component

In [None]:
df["hybrid_sorting_score"] = hybrid_sorting_score(df)

In [None]:
df.sort_values("hybrid_sorting_score", ascending=False).head(20)

In [None]:
df[df["course_name"].str.contains("Data Science")].sort_values("hybrid_sorting_score", ascending=False).head(20)

#### MiniApp: IMDB Movie Scoring & Sorting

In [3]:
import pandas as pd
import math
import scipy.stats as st
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [4]:
df = pd.read_csv("datasets/movies_metadata.csv",
                 low_memory=False)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,overview,original_language,vote_count,vote_average
0,0,Ad Astra,"The near future, a time when both hope and har...",en,2853,5.9
1,1,Bloodshot,"After he and his wife are murdered, marine Ray...",en,1349,7.2
2,2,Bad Boys for Life,Marcus and Mike are forced to confront new thr...,en,2530,7.1
3,3,Ant-Man,Armed with the astonishing ability to shrink i...,en,13611,7.1
4,4,Percy Jackson: Sea of Monsters,"In their quest to confront the ultimate evil, ...",en,3542,5.9


In [6]:
# Keep only the columns used for scoring. Take an explicit copy so the score
# columns assigned to ``df`` in later cells are written to an independent
# frame rather than to a slice of the loaded data.
df = df[["title", "vote_average", "vote_count"]].copy()

In [7]:
df.head(10)

Unnamed: 0,title,vote_average,vote_count
0,Ad Astra,5.9,2853
1,Bloodshot,7.2,1349
2,Bad Boys for Life,7.1,2530
3,Ant-Man,7.1,13611
4,Percy Jackson: Sea of Monsters,5.9,3542
5,Birds of Prey (and the Fantabulous Emancipatio...,7.1,2639
6,Live Free or Die Hard,6.5,3714
7,Cold Blood,5.1,119
8,Underwater,6.5,584
9,The Platform,7.2,1924


In [8]:
df.shape

(10000, 3)

In [9]:
df.sort_values("vote_average", ascending=False).head(20)

Unnamed: 0,title,vote_average,vote_count
1582,The Water Monster,10.0,1
401,Mamu (and a Mother Too),10.0,1
664,Four Kids and It,10.0,1
9674,Gully,10.0,1
3387,Prostitution,10.0,1
6278,You're Bacon Me Crazy,10.0,1
6604,Erotica: Moonlight,9.8,3
5125,Bulletproof 2,9.5,2
9102,Bad Education,9.5,4
8808,Kinky Boots: The Musical,9.2,22


In [10]:
df["vote_count"].describe([0.10, 0.25, 0.50, 0.70, 0.80, 0.90, 0.95, 0.99]).T

count   10000.00000
mean     1020.82510
std      1992.30501
min         0.00000
10%        67.00000
25%       143.00000
50%       332.00000
70%       744.00000
80%      1219.20000
90%      2555.00000
95%      4462.15000
99%     10914.41000
max     25148.00000
Name: vote_count, dtype: float64

In [11]:
df[df["vote_count"] > 110].sort_values("vote_average", ascending=False).head(20)

Unnamed: 0,title,vote_average,vote_count
357,Dilwale Dulhania Le Jayenge,8.8,2230
559,Steven Universe: The Movie,8.8,144
96,The Shawshank Redemption,8.7,15522
110,The Godfather,8.7,11768
252,Schindler's List,8.6,9366
772,The Green Mile,8.5,9740
8544,We All Loved Each Other So Much,8.5,207
205,Your Name.,8.5,5225
5524,One Direction: Where We Are – The Concert Film,8.5,123
25,Parasite,8.5,6046


In [12]:
from sklearn.preprocessing import MinMaxScaler

df["vote_count_score"] = MinMaxScaler(feature_range=(1, 10)).fit(df[["vote_count"]]).transform(df[["vote_count"]])

In [13]:
# vote_average * vote_count
########################
df["average_count_score"] = df["vote_average"] * df["vote_count_score"]

In [14]:
df.sort_values("average_count_score", ascending=False).head(20)

Unnamed: 0,title,vote_average,vote_count,vote_count_score,average_count_score
80,Inception,8.3,25148,10.0,83.0
88,The Dark Knight,8.4,21561,8.71628,73.21675
47,Interstellar,8.3,21589,8.7263,72.42829
128,Deadpool,7.6,22225,8.95391,68.04974
86,The Avengers,7.7,21893,8.8351,68.03024
402,Guardians of the Galaxy,7.9,20132,8.20487,64.81845
243,Fight Club,8.4,18664,7.6795,64.50778
216,Pulp Fiction,8.5,18003,7.44294,63.26497
132,Avatar,7.4,20778,8.43606,62.42683
361,Forrest Gump,8.4,17344,7.20709,60.53959


In [15]:
########################
# IMDB Weighted Rating : Scoring method used by IMDB
########################
# weighted_rating = (v/(v+M) * r) + (M/(v+M) * C)
# r = vote average
# v = vote count
# M = minimum votes required to be listed in the Top 250
# C = the mean vote across the whole report (currently 7.0)

In [16]:
M = 2500
C = df['vote_average'].mean()

In [17]:
def weighted_rating(r, v, M, C):
    """Return the vote-count-weighted rating ``v/(v+M)*r + M/(v+M)*C``.

    Parameters
    ----------
    r : number or pandas.Series
        Vote average of the item.
    v : number or pandas.Series
        Vote count of the item.
    M : number
        Minimum-votes threshold that controls how strongly ``C`` is mixed in.
    C : number
        Mean vote used as the prior.

    Returns
    -------
    Same kind as ``r``/``v``
        The rating pulled toward ``C``; the fewer the votes, the stronger
        the pull.
    """
    total_votes = v + M
    own_share = v / total_votes * r
    prior_share = M / total_votes * C
    return own_share + prior_share

In [18]:
df.sort_values("average_count_score", ascending=False).head(10)

Unnamed: 0,title,vote_average,vote_count,vote_count_score,average_count_score
80,Inception,8.3,25148,10.0,83.0
88,The Dark Knight,8.4,21561,8.71628,73.21675
47,Interstellar,8.3,21589,8.7263,72.42829
128,Deadpool,7.6,22225,8.95391,68.04974
86,The Avengers,7.7,21893,8.8351,68.03024
402,Guardians of the Galaxy,7.9,20132,8.20487,64.81845
243,Fight Club,8.4,18664,7.6795,64.50778
216,Pulp Fiction,8.5,18003,7.44294,63.26497
132,Avatar,7.4,20778,8.43606,62.42683
361,Forrest Gump,8.4,17344,7.20709,60.53959


In [19]:
weighted_rating(8.30000, 25148.00000, M, C)

8.119718424479167

In [20]:
weighted_rating(8.40000, 21561.00000, M, C)

8.182451893105025

In [21]:
df["weighted_rating"] = weighted_rating(df["vote_average"],
                                        df["vote_count"], M, C)

In [22]:
df.sort_values("weighted_rating", ascending=False).head(10)

Unnamed: 0,title,vote_average,vote_count,vote_count_score,average_count_score,weighted_rating
96,The Shawshank Redemption,8.7,15522,6.55503,57.0288,8.36794
110,The Godfather,8.7,11768,5.21155,45.34046,8.28057
216,Pulp Fiction,8.5,18003,7.44294,63.26497,8.23251
88,The Dark Knight,8.4,21561,8.71628,73.21675,8.18245
243,Fight Club,8.4,18664,7.6795,64.50778,8.15267
361,Forrest Gump,8.4,17344,7.20709,60.53959,8.13622
80,Inception,8.3,25148,10.0,83.0,8.11972
252,Schindler's List,8.6,9366,4.35192,37.42648,8.11673
197,The Lord of the Rings: The Return of the King,8.4,14987,6.36357,53.45397,8.10067
47,Interstellar,8.3,21589,8.7263,72.42829,8.09308


In [23]:
# Bayesian Average Rating Score
####################
def bayesian_average_rating(n, confidence=0.95):
    """Return a conservative (lower-bound) average score from vote counts.

    One pseudo-vote is added to every score level, the mean and the second
    moment of the resulting distribution are computed, and ``z`` standard
    errors are subtracted from the mean.

    Parameters
    ----------
    n : sequence of numbers
        Vote counts per score level, lowest score first: ``n[0]`` is the
        number of votes for score 1, ``n[1]`` for score 2, and so on.
        May be a list or a pandas Series with any index labels.
    confidence : float
        Two-sided confidence level used to derive ``z`` from the standard
        normal distribution.

    Returns
    -------
    float
        The score, or ``0`` when there are no votes at all.
    """
    N = sum(n)
    if N == 0:
        return 0
    K = len(n)
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    first_part = 0.0
    second_part = 0.0
    # Use the iterated value instead of ``n[k]``: integer keys on a
    # label-indexed Series are not a reliable way to get the k-th element.
    for score, n_k in enumerate(n, start=1):
        first_part += score * (n_k + 1) / (N + K)
        second_part += score * score * (n_k + 1) / (N + K)
    score = first_part - z * math.sqrt((second_part - first_part * first_part) / (N + K + 1))
    return score

In [24]:
# n holds the number of votes for each score 1, 2, ..., 10 for the movie The Shawshank Redemption.
bayesian_average_rating([34733, 4355, 4704, 6561, 13515, 26183, 87368, 273082, 600260, 1295351])

9.14538444560111

In [25]:
# n holds the number of votes for each score 1, 2, ..., 10 for the movie The Godfather.
bayesian_average_rating([37128, 5879, 6268, 8419, 16603, 30016, 78538, 199430, 402518, 837905])

8.940007324860396

# Review SORTING Analytics 
## Sorting comments according to whether other users find them useful or not

In [27]:
import pandas as pd
import math
import scipy.stats as st

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [28]:
###################################################
# Up-Down Diff Score = (up ratings) − (down ratings)
###################################################

# Review 1: 600 up 400 down total 1000
# Review 2: 5500 up 4500 down total 10000

def score_up_down_diff(up, down):
    """Return the net vote count: up-votes minus down-votes."""
    net_votes = up - down
    return net_votes

# Review 1 Score:
score_up_down_diff(600, 400)

200

In [29]:
# Review 2 Score
score_up_down_diff(5500, 4500)

1000

In [30]:
###################################################
# Score = Average rating = (up ratings) / (all ratings)
###################################################

def score_average_rating(up, down):
    """Return the share of up-votes among all votes, or 0 when there are none."""
    total_votes = up + down
    # Guard against division by zero for items nobody has voted on.
    return 0 if total_votes == 0 else up / total_votes

score_average_rating(600, 400)

0.6

In [31]:
score_average_rating(5500, 4500)

0.55

In [32]:
###################################################
# Wilson Lower Bound Score (WLB Score)
###################################################
def wilson_lower_bound(up, down, confidence=0.95):
    """
    Compute the Wilson Lower Bound (WLB) score.

    - The lower limit of the confidence interval for the Bernoulli
      parameter p (the up-vote rate) is taken as the WLB score.
    - The resulting score is used for ranking.
    - Note:
    For 1-5 star scores, 1-3 stars can be treated as negative (down) and
    4-5 stars as positive (up) to fit the Bernoulli setting. That mapping
    has its own problems, which is why a Bayesian average rating is needed
    for star ratings.

    Parameters
    ----------
    up: int
        number of up votes
    down: int
        number of down votes
    confidence: float
        two-sided confidence level

    Returns
    -------
    wilson score: float
        0 when there are no votes at all

    """
    n = up + down
    if n == 0:
        return 0

    z = st.norm.ppf(1 - (1 - confidence) / 2)
    z_squared = z * z
    phat = 1.0 * up / n

    centre = phat + z_squared / (2 * n)
    margin = z * math.sqrt((phat * (1 - phat) + z_squared / (4 * n)) / n)
    return (centre - margin) / (1 + z_squared / n)

In [33]:
wilson_lower_bound(600, 400)

0.5693094295142663

In [34]:
wilson_lower_bound(5500, 4500)

0.5402319557715324

In [35]:
# Case Study
###################################################
up = [15, 70, 14, 4, 2, 5, 8, 37, 21, 52, 28, 147, 61, 30, 23, 40, 37, 61, 54, 18, 12, 68]
down = [0, 2, 2, 2, 15, 2, 6, 5, 23, 8, 12, 2, 1, 1, 5, 1, 2, 6, 2, 0, 2, 2]
comments = pd.DataFrame({"up": up, "down": down})

In [36]:
# score_pos_neg_diff
comments["score_pos_neg_diff"] = comments.apply(lambda x: score_up_down_diff(x["up"],
                                                                             x["down"]), axis=1)

In [37]:
# score_average_rating
comments["score_average_rating"] = comments.apply(lambda x: score_average_rating(x["up"], x["down"]), axis=1)

In [38]:
# wilson_lower_bound
comments["wilson_lower_bound"] = comments.apply(lambda x: wilson_lower_bound(x["up"], x["down"]), axis=1)

In [39]:
comments.sort_values("wilson_lower_bound", ascending=False)

Unnamed: 0,up,down,score_pos_neg_diff,score_average_rating,wilson_lower_bound
11,147,2,145,0.98658,0.95238
12,61,1,60,0.98387,0.91413
1,70,2,68,0.97222,0.90426
21,68,2,66,0.97143,0.90168
18,54,2,52,0.96429,0.87881
15,40,1,39,0.97561,0.87405
13,30,1,29,0.96774,0.83806
16,37,2,35,0.94872,0.83114
19,18,0,18,1.0,0.82412
17,61,6,55,0.91045,0.81807
