In [None]:
!pip install contractions

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
from bs4 import BeautifulSoup
import string
import contractions
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as scores
from sklearn.metrics import classification_report

In [None]:
nltk.download('wordnet')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# ! pip install bs4 # in case you don't have it installed

# # Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz

## Read Data

In [None]:
pd.set_option("display.max_colwidth", 200)

In [None]:
 data= pd.read_csv("drive/MyDrive/amazon_reviews_us_Beauty_v1_00.tsv", sep = '\t', on_bad_lines= 'skip')

In [None]:
data.head()

## Keep Reviews and Ratings

In [None]:
 # Keep only the review text and its rating. .copy() makes the result an
 # independent frame, so later row drops and column assignments do not operate
 # on a slice of the raw data (which pandas flags with SettingWithCopyWarning).
 data = data[["review_body", "star_rating"]].copy()

### Missing Values

In [None]:
data.head()

#### Stars

In [None]:
 # Drop rows that have no rating. Rebinding (instead of inplace=True) avoids
 # mutating a frame that was derived from another one and keeps the cell chainable.
 data = data.dropna(subset=["star_rating"])
 # Sanity check: number of missing ratings that remain (expected 0).
 data["star_rating"].isnull().sum()

In [None]:
data[data["review_body"].isna()].shape

In [None]:
def splitting(text):
  """Return the first character of ``text`` after converting it to ``str``.

  Applied to ``star_rating`` so that values such as ``5``, ``"5"`` and ``5.0``
  all reduce to their leading character.
  """
  return str(text)[0]

# Reduce every rating to its leading character.
data["star_rating"] = data["star_rating"].apply(splitting)

In [None]:
def change_type(text):
  """Convert ``text`` to an ``int`` and return the result."""
  return int(text)

# Turn the single-character ratings into integers.
data['star_rating'] = data["star_rating"].apply(change_type)
  

In [None]:
def typ(text):
  """Print the type and value of ``text`` when its type is not exactly ``int``.

  Returns ``None`` in every case; values that are plain ints produce no output.
  """
  kind = type(text)
  if kind is int:
    return
  print(kind, text)
# Report any rating that is still not a plain int after conversion.
data["star_rating"].apply(typ)


#### Reviews

In [None]:
# Drop rows that have no review text. Rebinding (instead of inplace=True) avoids
# mutating a frame that was derived from another one.
data = data.dropna(subset=["review_body"])
# Sanity check: number of missing review bodies that remain (expected 0).
data["review_body"].isna().sum()

## We form three classes (ratings 1–2, ratings 3–4, and rating 5) and randomly select 20000 reviews from each class.



In [None]:
# Class 1: reviews rated 1 or 2 stars.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; it also
# returns a new frame, so adding the label column does not touch `data`.
data_1 = pd.concat([data[data['star_rating']==1], data[data['star_rating']==2]])
data_1["class"] = 1
# Fixed seed so the same 20000 reviews are drawn on every run.
data_1 = data_1.sample(n=20000, random_state=42)

In [None]:
# Class 2: reviews rated 3 or 4 stars.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data_2 = pd.concat([data[data['star_rating']==3], data[data['star_rating']==4]])
data_2["class"] = 2
# Fixed seed so the same 20000 reviews are drawn on every run.
data_2 = data_2.sample(n=20000, random_state=42)
# Accumulate the sampled classes in data_1.
data_1 = pd.concat([data_1, data_2])

In [None]:
# Class 3: reviews rated 5 stars. .copy() detaches the filtered rows from `data`
# so that adding the label column does not raise SettingWithCopyWarning.
data_3 = data[data['star_rating']==5].copy()
data_3["class"] = 3
# Fixed seed so the same 20000 reviews are drawn on every run.
data_3 = data_3.sample(n=20000, random_state=42)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data_1 = pd.concat([data_1, data_3])

In [None]:
df = data_1

In [None]:
df.head()
df.shape

In [None]:
'''
df = pd.read_csv("cleaned.csv")
df.head()'''

# Data Cleaning



In [None]:
df.head()

In [None]:
cnt_b_clean = (df["review_body"].str.len()).mean()

### Removing Contractions


In [None]:
#df = pd.read_csv("drive/MyDrive/cleaned.csv")
df.shape

In [None]:
df["review_body"]

In [None]:
# Run contractions.fix over every review and store the result in a new column,
# then keep only that column and the class label.
df["contracted_reviews"] = df["review_body"].apply(contractions.fix)
df = df.drop(columns=["review_body", "star_rating"])
df.head()

In [None]:
cnt_a_clean = (df["contracted_reviews"].str.len()).mean()

In [None]:
print('Average length of reviews before and after data cleaning:', cnt_b_clean, ",", cnt_a_clean)

# Pre-processing

In [None]:
cnt_b_pp = (df["contracted_reviews"].str.len()).mean()

## Remove the stop words

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
# `stopword` is the flat list of words that clean_data filters out.
stopword = stopwords.words("english")
# extend() adds each French word individually. append() would insert the whole
# French list as ONE nested element, which a token can never be equal to, so no
# French stop word would ever be removed. A separate name is used for the French
# list so the `stopwords` corpus reader is not overwritten.
french_stopwords = stopwords.words('french')
stopword.extend(french_stopwords)

### Stemming

In [None]:
ps = nltk.PorterStemmer()

## Perform lemmatization

In [None]:
nltk.download('omw-1.4')

In [None]:
from nltk.stem import WordNetLemmatizer
wn = WordNetLemmatizer()

In [None]:
df["contracted_reviews"]

In [None]:
def clean_data(text):
  """Tokenize one review into lemmatized tokens with stop words removed.

  Steps: strip ASCII punctuation, lowercase, split on runs of non-word
  characters, drop empty strings and stop words, lemmatize what is left.

  Args:
    text: the review text (a ``str``).

  Returns:
    A list of lemmatized tokens; empty when nothing survives the filtering.

  Uses the notebook-level ``stopword`` collection and the ``wn`` lemmatizer.
  """
  text = "".join(char for char in text if char not in string.punctuation)
  # Lowercase before filtering: NLTK's stop-word lists are lowercase, so
  # capitalized forms such as "The" would otherwise slip through, and "Great" /
  # "great" would become two different features.
  # Raw string: "\W" in a normal string literal is an invalid escape sequence.
  tokens = re.split(r"\W+", text.lower())
  # re.split yields '' at the edges when the text starts or ends with a
  # separator; `if word` drops those empty tokens.
  return [wn.lemmatize(word) for word in tokens if word and word not in stopword]

In [None]:
# Tokenize every review with clean_data and keep the token lists in their own
# one-column frame (same index as df).
df_temp = df["contracted_reviews"].apply(clean_data).to_frame(name="contracted_reviews")

In [None]:
df_temp["contracted_reviews"]

In [None]:
# Each cell of df_temp["contracted_reviews"] is a list of tokens, so .str.len()
# would count tokens. cnt_b_pp is measured in characters, so measure the
# pre-processed review in characters as well (tokens re-joined with single
# spaces) to make the before/after comparison meaningful.
cnt_a_pp = df_temp["contracted_reviews"].apply(lambda tokens: len(" ".join(tokens))).mean()

In [None]:
print('Average length of reviews before and after data preprocessing:', cnt_b_pp, ",", cnt_a_pp)

In [None]:
# Carry the class label over to the tokenized frame (aligned on the index).
df_temp["class"] = df["class"]
# Each cell is a list of tokens, so this is the number of tokens minus the number
# of tokens that are exactly a single space.
df_temp["body_len"] = df_temp["contracted_reviews"].map(
    lambda tokens: len(tokens) - tokens.count(" ")
)

In [None]:
df_temp.head()

In [None]:
import matplotlib.pyplot as py

In [None]:
# 50 bin edges between 0 and 200, reused by the histograms of the other classes.
bins = np.linspace(0,200,50)
py.hist(df_temp[df_temp["class"]==1]["body_len"], bins, alpha = 0.5, density = True, label= "class 1" )
py.xlabel("body_len")
py.ylabel("density")
py.legend(loc = "upper right")
# py.plot() with no arguments draws nothing and only echoes `[]`; show the figure instead.
py.show()

In [None]:
# density=True matches the class 1 histogram so the plots share a y-axis scale.
py.hist(df_temp[df_temp["class"]==2]["body_len"], bins, alpha = 0.5, density = True, label= "class 2" )
py.xlabel("body_len")
py.ylabel("density")
py.legend(loc = "upper left")
# py.plot() with no arguments draws nothing and only echoes `[]`; show the figure instead.
py.show()

In [None]:
# density=True matches the class 1 histogram so the plots share a y-axis scale.
py.hist(df_temp[df_temp["class"]==3]["body_len"], bins, alpha = 0.5, density = True, label= "class 3" )
py.xlabel("body_len")
py.ylabel("density")
py.legend(loc = "upper left")
# py.plot() with no arguments draws nothing and only echoes `[]`; show the figure instead.
py.show()

# TF-IDF Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# clean_data is passed as the analyzer, so the vectorizer calls it on each
# review to obtain that review's tokens.
tfidf = TfidfVectorizer(analyzer=clean_data)
# NOTE(review): the vectorizer is fitted on all reviews before the train/test
# split, so the IDF weights also reflect the test documents — confirm this is acceptable.
X_tf = tfidf.fit_transform(df["contracted_reviews"])
X_tf

In [None]:
# 80/20 split. A fixed seed makes the split (and every metric reported below)
# repeatable across runs; stratifying on the label keeps the class proportions
# identical in the train and test parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_tf, df["class"], test_size=0.2, random_state=42, stratify=df["class"]
)

# Perceptron

### Grid Search

In [None]:
from sklearn.linear_model import Perceptron

In [None]:
"""
def train_perceptron(pen, tol, s):
  clf = Perceptron(penalty = pen,tol= tol, shuffle = s, n_jobs=-1)
  clf.fit(X_train, Y_train)
  y_pred = clf.predict(X_test)
  precision, recall, fscore, support = scores(Y_test, y_pred)
  print('Penalty : {} / Tol : {} / precision:{} / Recall:{} / Accuracy:{} / Avg_precision:{}'.format(pen, tol, precision, recall, (y_pred==Y_test).sum() / len(y_pred), np.mean(precision)))
"""

In [None]:
"""
for pen in ['l1', 'l2','elasticnet', None]:
  for tol in [0.1,0.01,0.5,0.05,None]:
    for s in [True, False]:
      train_perceptron(pen,tol,s)
      """

In [None]:
# Fit a Perceptron on the training split and predict the held-out reviews.
clf= Perceptron(penalty = None,tol= 0.01, shuffle = True, n_jobs=-1)
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
# Per-class precision / recall / F-score / support arrays.
precision, recall, fscore, support = scores(Y_test, y_pred)
# Same metrics as a nested dict, which the printing cell reads by label string.
report = classification_report(Y_test, y_pred,output_dict=True)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score( Y_test,y_pred)

In [None]:
# One comma-separated line (precision,recall,f1-score) per class, followed by
# the weighted average.
for label in ('1', '2', '3', 'weighted avg'):
  row = report[label]
  print(",".join(str(row[metric]) for metric in ('precision', 'recall', 'f1-score')))

# SVM

In [None]:
from sklearn.svm import LinearSVC

In [None]:
'''
def train_svm(k, tol):
  clf = SVC(tol= tol)
  clf.fit(X_train, Y_train)
  y_pred = clf.predict(X_test)
  precision, recall, fscore, support = scores(Y_test, y_pred)
  print('Kernel : {} / Tol : {} / precision:{} / Recall:{} / Accuracy:{}'.format(k, tol, precision, recall, (y_pred==Y_test).sum() / len(y_pred)))
  '''


In [None]:
'''
for k in ['linear','poly', 'sigmoid','rbf','precomputed']:
  for tol in [0.1,0.01]:
    train_svm(k,tol)'''

In [None]:
# Fit a linear SVM on the training split and predict the held-out reviews.
clf = LinearSVC(tol= 0.1)
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
# Metrics as a nested dict, which the printing cell reads by label string.
report = classification_report(Y_test, y_pred,output_dict=True)

In [None]:
accuracy_score( Y_test,y_pred)

In [None]:
# One comma-separated line (precision,recall,f1-score) per class, followed by
# the weighted average.
for label in ('1', '2', '3', 'weighted avg'):
  row = report[label]
  print(",".join(str(row[metric]) for metric in ('precision', 'recall', 'f1-score')))

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

In [None]:
"""
def train_lr(pen,s):
  clf = LogisticRegression(penalty= pen, solver = s)
  clf.fit(X_train, Y_train)
  y_pred = clf.predict(X_test)
  precision, recall, fscore, support = scores(Y_test, y_pred)
  print('Penalty:{} / Solver : {} / precision:{} / Recall:{} / Accuracy:{} / Avg_precision:{}'.format(pen,s,  precision, recall, (y_pred==Y_test).sum() / len(y_pred), np.mean(precision)))
"""

In [None]:
"""
for solver in ['lbfgs', 'liblinear','newton-cg','sag','saga']:
  for pen in [ "l2"]:
      train_lr(pen,solver)
      """

In [None]:
# Fit logistic regression on the training split and predict the held-out reviews.
# n_jobs is not passed: scikit-learn ignores it for solver='liblinear' and emits
# a warning when it is set to more than one job.
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, Y_train)
y_pred = clf.predict(X_test)
# Metrics as a nested dict, which the printing cell reads by label string.
report = classification_report(Y_test, y_pred,output_dict=True)

In [None]:
# One comma-separated line (precision,recall,f1-score) per class, followed by
# the weighted average.
for label in ('1', '2', '3', 'weighted avg'):
  row = report[label]
  print(",".join(str(row[metric]) for metric in ('precision', 'recall', 'f1-score')))

# Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
gnb = MultinomialNB()

In [None]:
# Fit the MultinomialNB model created above and predict the held-out reviews.
gnb.fit(X_train, Y_train)
y_pred = gnb.predict(X_test)
# Metrics as a nested dict, which the printing cell reads by label string.
report = classification_report(Y_test, y_pred,output_dict=True)

In [None]:
# One comma-separated line (precision,recall,f1-score) per class, followed by
# the weighted average.
for label in ('1', '2', '3', 'weighted avg'):
  row = report[label]
  print(",".join(str(row[metric]) for metric in ('precision', 'recall', 'f1-score')))