In [None]:
!pip install langdetect

In [2]:
import pandas as pd
import numpy as np
import langdetect
from sklearn.model_selection import train_test_split
import spacy
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

# Product Reviews Preprocessing

In [2]:
# Load the raw product reviews and discard every row that lacks either a
# rating or the review text, reporting the row count before and after.
required_columns = ['review_rating', 'review_content']
df = pd.read_csv('product_reviews.csv', header=0)
print("Original length of df: ", df.shape[0])
df = df.dropna(subset=required_columns, how='any')
print("After drop length: ", df.shape[0])

Original length of df:  39015
After drop length:  37889


In [None]:
# Keep only the English-language reviews and cache them in filtered_reviews.csv.
# langdetect is probabilistic: without a fixed seed, short or ambiguous texts
# can be labelled differently from one run to the next.
langdetect.DetectorFactory.seed = 0


def _detect_language(text):
    """Return langdetect's language code for `text`, or "unknown" if detection fails."""
    try:
        return langdetect.detect(str(text))
    except langdetect.LangDetectException:
        # Raised when the text offers no usable features (e.g. empty, or only
        # digits/symbols); such rows are labelled "unknown" and filtered out
        # below instead of aborting the whole apply().
        return "unknown"


df['Language'] = df['review_content'].apply(_detect_language)
filtered_en = df.loc[df['Language'] == 'en']
print("Length of english reviews: ", len(filtered_en))
filtered_en.to_csv("filtered_reviews.csv",index=False)

In [4]:
# Split the English reviews into a train set (90%) and a test set (10%).
filtered_en = pd.read_csv("filtered_reviews.csv",header=0)
# A fixed random_state makes the split reproducible, so re-running the cell
# selects the same rows for train and test every time.
# NOTE: this cell overwrites train_data.csv and test_data.csv; any columns
# added to those files after they were written are lost on re-run.
train, test = train_test_split(filtered_en, test_size=0.1, random_state=42)
rating = np.array(train['review_rating'])
# Use ratings as sentiment labels for the train set:
# rating < 3 -> negative, rating > 3 -> positive, anything else -> neutral.
sentiment = np.select(
    [rating < 3, rating > 3],
    ["negative", "positive"],
    default="neutral",
).tolist()

# assign() builds a new frame rather than writing a column into the frame
# returned by the split, which avoids pandas' SettingWithCopyWarning.
train = train[['product_name','review_content']].assign(sentiment=sentiment)
train.to_csv("train_data.csv", index=False)
test = test[['product_name','review_content', 'review_rating']]
test.to_csv("test_data.csv",index=False)

In [4]:
!pip install transformers requests pandas numpy

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp38-cp38-win_amd64.whl (3.3 MB)
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.4.0 sacremoses-0.0.47 tokenizers-0.11.5 transformers-4.16.2


In [134]:
# Inter-annotator agreement on the test set, measured with Cohen's kappa.
from sklearn.metrics import cohen_kappa_score
test_df = pd.read_csv("test_data.csv", header=0)
# NOTE(review): the Annotator_1 / Annotator_2 columns are not written by any
# cell visible in this notebook; presumably they were added to test_data.csv
# by hand after the split -- TODO confirm.
labeler1, labeler2 = (
    np.array(test_df[column]) for column in ("Annotator_1", "Annotator_2")
)
# Last expression of the cell: the kappa score is shown as the cell output.
cohen_kappa_score(labeler1, labeler2)

0.8012793721595832

In [None]:
from nltk.corpus import stopwords
from textblob import Word

# Normalise the review text of both splits: lowercase, strip punctuation,
# drop English stop words, then lemmatize every remaining token. Each result
# is written to "<name>_Processed.csv".
dataset = ['train_data','test_data']
# Loop-invariant, so loaded once; the set gives constant-time membership
# tests (the original list is kept under its existing name).
stop = stopwords.words('english')
stop_lookup = set(stop)


def _drop_stopwords_and_lemmatize(text):
    """Remove stop words from `text` and lemmatize the remaining whitespace-separated tokens."""
    return " ".join(
        Word(token).lemmatize() for token in text.split() if token not in stop_lookup
    )


for ds in dataset:
    df = pd.read_csv("{}.csv".format(ds))
    # Empty reviews are read back from CSV as NaN; treat them as empty strings
    # so the string operations below do not fail on a float.
    cleaned = df['review_content'].fillna("").astype(str).str.lower()
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave all punctuation in place.
    # The raw string avoids the invalid "\w" escape in a normal literal.
    cleaned = cleaned.str.replace(r'[^\w\s]', "", regex=True)
    # Stop words are matched after punctuation removal, so contractions such
    # as "don't" have already become "dont" at this point.
    df['review_content'] = cleaned.apply(_drop_stopwords_and_lemmatize)
    df.to_csv("{}_Processed.csv".format(ds),index=False)
    print(df)
    print(df)