# **AMAZON FINE FOOD REVIEWS ANALYSIS**

This Jupyter Notebook analyses the Amazon Fine Food Reviews dataset, which comprises more than 500,000 food reviews on Amazon.


Attribute Information:

1)Id

2)ProductId - unique identifier for the product

3)UserId - unique identifier for the user

4)ProfileName

5)HelpfulnessNumerator - number of users who found the review helpful

6)HelpfulnessDenominator - number of users who indicated whether they found the review helpful or not

7)Score - rating between 1 and 5

8)Time - timestamp for the review

9)Summary - brief summary of the review

10)Text - text of the review



In [1]:
#importing Libraries
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report, roc_auc_score

In [2]:
# Load the reviews CSV into a DataFrame (the path is relative to the notebook's working directory)
data = pd.read_csv('../input/amazon-fine-food-reviews/Reviews.csv')

In [3]:
# Preview the first five rows of the raw data (DataFrame.head() default)
data.head()

**DATA EXPLORATION**

In [4]:
# Column names, dtypes and non-null counts
data.info()

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [6]:
# Total number of distinct reviewers.
# dropna=False counts a missing UserId (if there is one) as its own value,
# which is what len(Series.unique()) does as well.
data['UserId'].nunique(dropna=False)

In [7]:
# Distinct product ids ordered from most to least reviewed:
# value_counts() sorts by descending frequency and .index keeps only the ids, not the counts.
count_product_id = data["ProductId"].value_counts().index
count_product_id

In [8]:
# Fraction of helpfulness votes that were positive. Reviews that received no votes
# at all (denominator == 0) get the sentinel -1 so they stay distinguishable from 0 %.
data['Helpful %'] = np.where(data['HelpfulnessDenominator'] > 0, data['HelpfulnessNumerator'] / data['HelpfulnessDenominator'], -1)

# Bucket the fraction. The first bin is [-1, -0.5], so it holds ONLY the -1 sentinel;
# a real ratio of exactly 0 lands in '0-20%' instead of being mislabelled as 'Empty'
# (which is what a bin edge at 0 would do, because pd.cut intervals are right-closed).
# Ratios above 1.0 (numerator > denominator) fall outside every bin and become NaN.
data['% Upvote'] = pd.cut(
    data['Helpful %'],
    bins = [-1, -0.5, 0.2, 0.4, 0.6, 0.8, 1.0],
    labels = ['Empty', '0-20%', '20-40%', '40-60%', '60-80%', '80-100%'],
    include_lowest = True,  # make the first interval left-closed so -1 itself is included
)

In [9]:
# Preview the frame again, now including the 'Helpful %' and '% Upvote' columns
data.head()

In [10]:
# Pivot table: one row per Score, one column per helpfulness bucket,
# each cell holding the number of reviews in that combination.
df = data.groupby(['Score', '% Upvote'])['Id'].count().unstack()
fig = plt.figure(figsize=(15,10))

# Reverse the bucket order and transpose so buckets run down the y-axis
# and scores run along the x-axis.
sns.heatmap(df.iloc[:, ::-1].T, annot = True, fmt = 'd', cbar_kws={'label': '# reviews'})
plt.yticks(rotation=0)
plt.title('How helpful users find among user scores')

**User Rating Distribution**

In [11]:
# Number of reviews per star rating.
# The column is named via `x=` instead of being passed positionally: recent seaborn
# releases treat the first positional argument as `data`, so a bare Series there is
# no longer interpreted as the variable to count.
plt.figure(figsize = (10, 8))
sns.countplot(x = 'Score', data = data)

In [12]:
# Normalised distribution of the star ratings.
# sns.distplot is deprecated in favour of histplot/displot. Score only takes a handful
# of integer values, so discrete=True centres one bar on each rating, and no KDE curve
# is drawn because a smoothed density over discrete ratings would be misleading.
plt.figure(figsize = (10, 8))
sns.histplot(data = data, x = 'Score', discrete = True, stat = 'density')

In [13]:
# Raw text of the first ten reviews
data['Text'].head(10)

In [14]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Sanity check of the sentiment analyzer on the first 20 reviews:
# for each review print every polarity score (keys in alphabetical order),
# followed by the review text itself.
analyser = SentimentIntensityAnalyzer()
first_20_text = data.Text[:20]
for review_text in first_20_text:
    scores = analyser.polarity_scores(review_text)
    for name, value in sorted(scores.items()):
        print('{0}: {1}, '.format(name, value))
    print(review_text)

In [15]:
# Preview the first rows of the data once more
data.head()

In [16]:
# `df` becomes another name for the full review table (not a copy).
df = data
X = df.Text

# Label map: 1 for 3-star reviews, 0 for every other score.
y_dict = {score: int(score == 3) for score in range(1, 6)}
y = df.Score.map(y_dict)

In [17]:
# Per-user activity: number of reviews written and mean score given,
# listed from the most to the least active reviewer.
df_user = (
    data.groupby('UserId')['Score']
        .agg(['count', 'mean'])
        .rename(columns = {'count': 'Rating count', 'mean': 'Rating mean'})
        .sort_values(by = 'Rating count', ascending = False)
)
print(df_user.head(10))

In [18]:
# data1 is a second name for `data` (no copy is made), so the column added below
# shows up on `data` as well.
data1 = data

# 1 when the review scored below 4 stars, otherwise 0.
data1["is_bad_review"] = (data1["Score"] < 4).astype("int64")
data1.head()

In [19]:
# Share of each is_bad_review value; normalize=True returns proportions instead of raw counts
data1["is_bad_review"].value_counts(normalize = True)

In [20]:
# Number of words in each review.
# str.split() without a separator splits on any run of whitespace, so repeated spaces
# do not produce empty "words" and newlines/tabs also separate words
# (splitting on a single " " gets both of those cases wrong).
data1["words"] = data1["Text"].str.split().str.len()

In [21]:
# Preview the frame with the derived columns added so far
data1.head()

**Analyzing Sentiment of reviews**

In [22]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Sentiment analyzer instance used below to score every review's text
Intensity = SentimentIntensityAnalyzer()

In [23]:
# Polarity scores for every review, keyed by review Id.
# - Only the two needed columns are iterated; iterrows() would build a full Series
#   object for every row, which is very slow on a table of this size.
# - The progress-bar total comes from data1 itself rather than from the unrelated
#   `df` variable, so the bar stays correct whatever `df` currently points to.
res = {
    review_id: Intensity.polarity_scores(text)
    for review_id, text in tqdm(zip(data1['Id'], data1['Text']), total=len(data1))
}

In [24]:
# One row per review Id, one column per polarity score.
# from_dict(orient='index') builds the table in that orientation directly;
# pd.DataFrame(res).T would first create one column per review and then transpose.
sentiment = (
    pd.DataFrame.from_dict(res, orient='index')
      .rename_axis('Id')
      .reset_index()
)

# Attach the scores to the reviews on the explicit key 'Id'.
# Any score columns left over from a previous run of this cell are dropped first so
# that re-running does not produce suffixed duplicates (neg_x / neg_y, ...).
score_columns = [col for col in sentiment.columns if col != 'Id']
data1 = (
    data1.drop(columns=score_columns, errors='ignore')
         .merge(sentiment, on='Id', how='left')
)

In [25]:
# Preview the frame after merging in the sentiment scores
data1.head()

**SCORE PREDICTION**

In [26]:
X = data1[['HelpfulnessNumerator', 'HelpfulnessDenominator', 'is_bad_review', 'Helpful %', 'words', 'pos', 'neg', 'compound', 'neu']]
y = data1['Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [27]:
rf = RandomForestClassifier(n_estimators = 100, random_state = 42)
rf.fit(X_train, y_train)

# show feature importance
features = ['HelpfulnessNumerator', 'HelpfulnessDenominator', 'is_bad_review', 'Helpful %', 'words', 'pos', 'neg', 'compound', 'neu']

feature_importances = pd.DataFrame({"feature": features, "importance": rf.feature_importances_}).sort_values("importance", ascending = False)
feature_importances.head(20)

In [28]:
# Predicted Score for every row of the held-out test set
prediction = rf.predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted Score equals the true Score
test_accuracy = accuracy_score(y_test, prediction)
print("Accuracy Score:", test_accuracy)

In [30]:

from sklearn.metrics import roc_curve, auc, roc_auc_score

# One-vs-rest ROC curve for the class Score == 1.
# predict_proba's columns are ordered like rf.classes_, so the column is looked up
# by label. Hard-coding column index 1 picks the SECOND class, which does not match
# pos_label = 1 when the model is trained on the five Score values.
pos_label = 1
pos_column = list(rf.classes_).index(pos_label)
y_pred = rf.predict_proba(X_test)[:, pos_column]
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label = pos_label)

roc_auc = auc(fpr, tpr)

plt.figure(1, figsize = (15, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier, for reference
plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

In [31]:
# Per-class precision, recall and F1 on the test set, printed under a heading
CR = classification_report(y_test, prediction)
print('Classification Report:', CR, sep='\n')