# Sorting Reviews Using Python

In this section, we sort reviews using Python.

We tried three different methods:

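  * Up-Down Diff Score: (Up Ratings) - (Down Ratings)
  * Average Rating: (Up Ratings) / (All Ratings)
  * Wilson Lower Bound Score: the lower bound of the Wilson confidence interval for the up-rating proportion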

In [8]:
import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler
import math
# Notebook-wide pandas display settings, applied once from a single table so
# every tunable option is visible in one place.
def _format_float(x):
    """Render a float with exactly five digits after the decimal point."""
    return '%.5f' % x


DISPLAY_OPTIONS = {
    "display.max_columns": None,        # no limit on displayed columns
    "display.max_rows": None,           # no limit on displayed rows
    "display.width": 500,
    "display.expand_frame_repr": False,
    "display.float_format": _format_float,
}

for _option_name, _option_value in DISPLAY_OPTIONS.items():
    pd.set_option(_option_name, _option_value)

# Up-Down Diff Score

In [2]:
# Two example reviews used to compare the scoring methods:
#   Review 1:   600 up,   400 down  (1,000 votes in total)
#   Review 2: 5,500 up, 4,500 down (10,000 votes in total)

def up_down_diff_score(up, down):
    """Return the net vote count: up-votes minus down-votes."""
    net_votes = up - down
    return net_votes

In [3]:
# Net vote count for Review 1 (600 up, 400 down)
up_down_diff_score(up=600, down=400)

200

In [4]:
# Net vote count for Review 2 (5500 up, 4500 down)
up_down_diff_score(up=5500, down=4500)

1000

# Average Rating

In [5]:
def average_rating_score(up, down):
    """Return the share of up-votes among all votes.

    Returns 0 when there are no votes at all, so the division is never
    attempted with a zero denominator.
    """
    total_votes = up + down
    return 0 if total_votes == 0 else up / total_votes
  

In [6]:
# Up-vote share for Review 1 (600 up, 400 down)
average_rating_score(up=600, down=400)

0.6

In [7]:
# Up-vote share for Review 2 (5500 up, 4500 down)
average_rating_score(up=5500, down=4500)

0.55

# Wilson Lower Bound Score

In [9]:
import scipy.stats as st
import math
def wilson_lower_bound(up, down, confidence=0.95):
    """Lower bound of the Wilson score interval for the up-vote proportion.

    Parameters
    ----------
    up : number of up-votes.
    down : number of down-votes.
    confidence : two-sided confidence level used to pick the normal
        quantile (default 0.95).

    Returns
    -------
    0 when there are no votes at all; otherwise the lower end of the
    interval around the observed up-vote share.
    """
    total = up + down
    if total == 0:
        return 0

    # Two-sided standard-normal quantile for the requested confidence level.
    z_score = st.norm.ppf(1 - (1 - confidence) / 2)
    z_squared = z_score * z_score
    p_hat = 1.0 * up / total  # observed share of up-votes

    # The three pieces of the Wilson formula, kept in the same arithmetic
    # order as the single-expression form.
    centre = p_hat + z_squared / (2 * total)
    spread = z_score * math.sqrt((p_hat * (1 - p_hat) + z_squared / (4 * total)) / total)
    scale = 1 + z_squared / total
    return (centre - spread) / scale

In [10]:
# Wilson lower bound for Review 1 (600 up, 400 down)
wilson_lower_bound(up=600, down=400)

0.5693094295142663

In [11]:
# Wilson lower bound for Review 2 (5500 up, 4500 down)
wilson_lower_bound(up=5500, down=4500)

0.5402319557715324

# Case Study

In [14]:
# Vote counts for the case study: entries at the same position in `up` and
# `down` belong to the same comment.
up = [
    15, 70, 14, 4, 2, 5, 8, 37, 21, 52, 28,
    147, 61, 30, 23, 40, 37, 61, 54, 18, 12, 68,
]
down = [
    0, 2, 2, 2, 15, 2, 6, 5, 23, 8, 12,
    2, 1, 1, 5, 1, 2, 6, 2, 0, 2, 2,
]

# One row per comment, one column per vote direction.
comments = pd.DataFrame.from_dict({"up": up, "down": down})
comments

Unnamed: 0,up,down
0,15,0
1,70,2
2,14,2
3,4,2
4,2,15
5,5,2
6,8,6
7,37,5
8,21,23
9,52,8


In [15]:
# Score every comment with each of the three ranking methods; each result is
# stored in a column named after the method. The columns are added in this
# fixed order.
scoring_methods = {
    "up_down_diff_score": up_down_diff_score,
    "average_rating_score": average_rating_score,
    "wilson_lower_bound": wilson_lower_bound,
}
for column_name, score_fn in scoring_methods.items():
    comments[column_name] = comments.apply(
        lambda row, fn=score_fn: fn(row["up"], row["down"]),
        axis=1,
    )

In [16]:
# Rank comments from highest to lowest Wilson lower bound.
comments.sort_values(by="wilson_lower_bound", ascending=False)

Unnamed: 0,up,down,up_down_diff_score,average_rating_score,wilson_lower_bound
11,147,2,145,0.98658,0.95238
12,61,1,60,0.98387,0.91413
1,70,2,68,0.97222,0.90426
21,68,2,66,0.97143,0.90168
18,54,2,52,0.96429,0.87881
15,40,1,39,0.97561,0.87405
13,30,1,29,0.96774,0.83806
16,37,2,35,0.94872,0.83114
19,18,0,18,1.0,0.82412
17,61,6,55,0.91045,0.81807
