In [1]:
import pandas as pd
import numpy as np
import datetime
import re

In [None]:
!pip install plotly 
import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Initialise plotly's offline notebook mode before any figure is drawn.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
plotly.offline.init_notebook_mode(connected=True)

<h3>Rating Distribution of All Reviews and Fake Reviews</h3>

In [3]:
# Load the relabeled moisturizer reviews; the first CSV column becomes the row index.
df = pd.read_csv('Relabeled_Moisturizer.csv', index_col = [0])

# Number of reviews (non-null item_id entries) per star rating, as {rating: count}.
# Counting the one column directly replaces describe(), which computed every summary
# statistic for every numeric column only to read a single 'count' back out.
# The counts are now integers rather than floats.
dist = df.groupby('review_rating')['item_id'].count().to_dict()

In [4]:
# Rows whose predicted_fake_reviews flag equals 1.
fakedf = df[df['predicted_fake_reviews'] == 1]

# Number of flagged reviews (non-null item_id entries) per star rating, as
# {rating: count}. Counting the one column directly replaces describe(), which
# computed every summary statistic for every numeric column only to read 'count'.
# The counts are now integers rather than floats.
fake_dist = fakedf.groupby('review_rating')['item_id'].count().to_dict()

In [5]:
# Side-by-side pie charts: share of each star rating among all reviews (left)
# and among the flagged reviews (right).
specs = [[{'type':'domain'}, {'type':'domain'}]]
fig1 = make_subplots(rows=1, cols=2, specs=specs, subplot_titles =
                     ('Rating Distribution of All Reviews','Rating Distribution of Fake Reviews'))

# Each pie takes its labels from the keys of its own distribution rather than a
# fixed [1, 2, 3, 4, 5]. With a fixed list the counts are matched to labels purely
# by position, so a rating missing from one distribution (for example no flagged
# 2-star reviews) would silently attach every later count to the wrong rating.
labels = list(dist)
fake_labels = list(fake_dist)

fig1.add_trace(go.Pie(labels=labels, values=list(dist.values()), name = 'All Reviews'), 1, 1)
fig1.add_trace(go.Pie(labels=fake_labels, values=list(fake_dist.values()), name = 'Fake Reviews'), 1, 2)
fig1.show()

<h3>Change in Ratings After Removing Fake Reviews</h3>

In [6]:
# Average star rating of every product before and after dropping the reviews
# flagged as fake, together with the direction of the change.

def _mean_rating_by_item(reviews):
    """Return {item_id: average review_rating} for the given reviews frame.

    The count-weighted average of the star values of a product is simply the
    mean of its review_rating column, so one grouped mean replaces the
    per-product Python loop. Products without any non-null rating are left out.
    """
    return reviews.groupby('item_id')['review_rating'].mean().dropna().to_dict()


#calculate current rating for every product
current_ratings = _mean_rating_by_item(df)

#calculate rating for every product after removing fake reviews
truedf = df[df['predicted_fake_reviews'] == 0]
true_ratings = _mean_rating_by_item(truedf)

ratings = pd.DataFrame()
ratings['current'] = pd.Series(current_ratings)
ratings['true'] = pd.Series(true_ratings)
# Products with no remaining (non-flagged) reviews have no entry in true_ratings.
# NOTE(review): they are given a 'true' rating of 0 here, which always shows up as
# a 'Decrease' — confirm that this is the intended treatment.
ratings['true'] = ratings['true'].fillna(0)
ratings['difference'] = ratings['true'] - ratings['current']

def change(row):
    """Classify a row by the sign of its 'difference' value.

    Returns 'Decrease' for a negative difference, 'Increase' for a positive
    one and 'No change' otherwise.
    """
    if row['difference'] < 0:
        return 'Decrease'
    elif row['difference'] > 0:
        return 'Increase'
    else:
        return 'No change'

ratings['change'] = ratings.apply(change, axis=1)

In [7]:
# Preview the first rows of the per-product rating comparison table.
ratings.head()

Unnamed: 0,current,true,difference,change
74476,4.329457,4.3,-0.029457,Decrease
112680,4.319355,4.306859,-0.012496,Decrease
112706,4.708861,4.681159,-0.027701,Decrease
153726,4.486891,4.454918,-0.031973,Decrease
241539,4.471616,4.404255,-0.06736,Decrease


In [8]:
# Pie chart of how many products fall into each 'change' category.
change_counts = ratings.groupby('change').size()
labels = list(change_counts.index)
values = list(change_counts)

fig3 = go.Figure(data=[go.Pie(labels = labels, values = values)])
fig3.update_layout(title='Change in Ratings After Removing Fake Reviews')
fig3.show()

<h3>Relationship between Number of Reviews and Fake Reviews</h3>

In [9]:
# Per-product table with the total number of reviews and the number of flagged ones.
product_reviews = df.groupby('item_id').size()
fake_reviews = fakedf.groupby('item_id').size()

rel = product_reviews.to_frame(name='reviews')
rel['fake_reviews'] = fake_reviews
# Products absent from fake_reviews come through as NaN after alignment;
# they count as zero flagged reviews.
rel['fake_reviews'] = rel['fake_reviews'].fillna(0).astype(int)

In [10]:
# Pairwise correlation between the 'reviews' and 'fake_reviews' columns of rel.
rel.corr()

Unnamed: 0,reviews,fake_reviews
reviews,1.0,0.510346
fake_reviews,0.510346,1.0


In [11]:
# Scatter plot with one marker per product: flagged-review count against total
# review count. The total goes on x and the flagged count on y so that the data
# agree with the axis titles (the two series used to be swapped relative to
# their labels). The misspelled figure title is corrected as well.
fig2 = go.Figure()
fig2.add_trace(go.Scatter(x = rel['reviews'], y = rel['fake_reviews']))
fig2.update_traces(mode='markers', marker=dict(line_width=1, symbol='circle', size=8))
fig2.update_layout(title='Relationship between Number of Reviews and Fake Reviews',
                  xaxis_title='Number of Reviews',
                  yaxis_title='Number of Fake Reviews')
fig2.show()

<h3>Relationship between Number of 5-Star Fake Reviews and Product Rating</h3>

In [12]:
# Example subset: products with more than 1000 reviews, where a product's review
# count is the number of its rows with a non-null 'total_reviews' value.
morethan1000 = df.groupby('item_id')['total_reviews'].count()
popular_ids = morethan1000[morethan1000 > 1000].index

# Keep only the columns needed for the time-series plots and renumber the rows.
wanted_columns = ['item_id','review_rating','review_time','predicted_fake_reviews']
interval = df.loc[df['item_id'].isin(popular_ids), wanted_columns].reset_index(drop = True)

In [13]:
# Some review_time values are relative ("X days ago") because the scrape ran over a
# long period. They are a small share of the rows, so they are mapped to None here
# and discarded afterwards.

# Accepted review_time layouts: (regex gate, matching strptime format).
_REVIEW_TIME_FORMATS = (
    (re.compile(r'^[\d]{1,2} [A-Za-z]{3} [\d]{4}$'), '%d %b %Y'),  # e.g. "5 Jan 2020"
    (re.compile(r'^[\d]{1,2}-[A-Za-z]{3}-[\d]{2}$'), '%d-%b-%y'),  # e.g. "05-Jan-20"
)


def converttime(string):
    """Parse a review_time value into a datetime, or return None.

    Parameters
    ----------
    string : object
        Usually a str such as "5 Jan 2020" or "05-Jan-20". A value that is
        already a datetime is returned unchanged, so re-applying the function
        to parsed data is harmless.

    Returns
    -------
    datetime.datetime or None
        The parsed date, or None when the value is not a string (for example
        NaN or None), does not match either accepted layout, or matches a
        layout but is not a real calendar date (for example "31 Feb 2020").
    """
    if isinstance(string, datetime.datetime):
        return string
    if not isinstance(string, str):
        # re.match would raise TypeError on NaN/None; treat them as unparseable.
        return None

    for pattern, fmt in _REVIEW_TIME_FORMATS:
        if pattern.match(string):
            try:
                return datetime.datetime.strptime(string, fmt)
            except ValueError:
                # Right shape, but an unknown month name or impossible day.
                return None
    return None

In [14]:
# Parse review_time and keep only the rows whose date could be parsed.
# The frame is rebuilt with assign()/dropna() instead of writing through
# `interval.loc[:, 'review_time'] = ...` because:
#   * assigning through .loc on the filtered frame raises SettingWithCopyWarning;
#   * on pandas >= 2.0 a `.loc[:, col] = values` assignment writes into the existing
#     column in place, so the column could keep its object dtype instead of
#     becoming datetime64.
parsed_time = pd.to_datetime(interval['review_time'].apply(converttime))
interval = interval.assign(review_time=parsed_time).dropna(subset=['review_time'])

In [17]:
# For every product in `interval`, plot its running average rating together with
# the normalized number of 5-star reviews flagged as fake over time.
# Loop-local names are chosen so the notebook-level `ratings` and `fake_reviews`
# tables built in earlier cells are no longer overwritten by this cell.
for item_id, item_reviews in interval.groupby('item_id'):

    #calculate the average rating of the product when new ratings received
    # Cumulative sum / cumulative count per review date gives the average over all
    # reviews up to and including that date, without re-filtering the frame per date.
    per_day = item_reviews.groupby('review_time')['review_rating'].agg(['sum', 'size'])
    running_avg = per_day['sum'].cumsum() / per_day['size'].cumsum()
    avg_dates = [str(day)[:10] for day in running_avg.index]

    #calculate number of 5-star fake reviews
    is_fake_five_star = (item_reviews['predicted_fake_reviews'] == 1) & (item_reviews['review_rating'] == 5)
    fakecount = item_reviews[is_fake_five_star].groupby('review_time').count()['predicted_fake_reviews']

    #plot data
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=avg_dates, y=running_avg.tolist(), name = 'Average Rating'), secondary_y=True)
    # histfunc='sum' makes each bin add up the per-date counts passed as y. With the
    # default histfunc ('count') y is ignored and each bin only counts how many
    # distinct dates fall into it.
    fig.add_trace(go.Histogram(x=fakecount.index, y=fakecount, histfunc='sum', nbinsx = 100, histnorm='probability', opacity=0.5, name = 'Number of 5-Star Fake Reviews'), secondary_y=False)
    # Identify the product so the figures can be told apart.
    fig.update_layout(title_text=f'Item {item_id}: Average Rating and 5-Star Fake Reviews over Time')
    fig.update_yaxes(title_text="Normalized Number of 5-Star Fake Reviews", secondary_y=False)
    fig.update_yaxes(title_text="Average Rating of the Product", secondary_y=True)
    fig.show()