In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from matplotlib import pyplot as plt
# from nltk.corpus import stopwords
# nltk.download("stopwords")
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import tree 
from sklearn import svm 
from sklearn.naive_bayes import GaussianNB

In [2]:
# Load the reviews dataset with pandas, straight from its remote CSV location
REVIEWS_CSV_URL = "https://raw.githubusercontent.com/yqian000/OnlineReviewDetection/main/reviews.csv"
df = pd.read_csv(REVIEWS_CSV_URL)

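### Data preprocessing  
1. Remove punctuation  
2. Remove stopwords

Note: the `clean_text` helper in the next cell is wrapped in a string literal and is never called, so it does not currently apply either step.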

In [3]:
# Disabled preprocessing helper. The whole cell is a bare triple-quoted string,
# so the `clean_text` definition inside it is never executed; the cell simply
# evaluates to the string and echoes it as its output.
# As written, the helper would drop every character found in
# `string.punctuation`, split the remaining text on single spaces, and return
# the list of words whose lowercase form is not an English stopword.
# It references `stopwords`, whose import is commented out in the imports cell,
# so that import must be restored before this code can be re-enabled.
'''
def clean_text(text):
    # remove punctuation
    text_not_punc = []
    i = 0
    for c in text:
        if c not in string.punctuation:
            text_not_punc.append(c)
    # Join characters into string.
    text_not_punc = ''.join(text_not_punc) 

    # remove stopwords
    text_not_punc = list(text_not_punc.split(" "))
    text_clean = []
    for word in text_not_punc:
        if word.lower() not in stopwords.words('english'):
            text_clean.append(word)
    return text_clean
'''

'\ndef clean_text(text):\n    # remove punctuation\n    text_not_punc = []\n    i = 0\n    for c in text:\n        if c not in string.punctuation:\n            text_not_punc.append(c)\n    # Join characters into string.\n    text_not_punc = \'\'.join(text_not_punc) \n\n    # remove stopwords\n    text_not_punc = list(text_not_punc.split(" "))\n    text_clean = []\n    for word in text_not_punc:\n        if word.lower() not in stopwords.words(\'english\'):\n            text_clean.append(word)\n    return text_clean\n'

In [4]:
# Encode the label column as binary integers: "CG" = 1, "OR" = 0.
# The mapping is applied in one pass that builds a new column, rather than
# writing integers in place into a column of strings; pandas' dedicated string
# dtype rejects that kind of mixed-type in-place assignment
# (NOTE(review): confirm against the pandas version in use).
# Values other than "CG"/"OR" pass through untouched, so re-running this cell
# on already-encoded labels is a no-op, and unexpected labels still fail loudly
# in the final integer cast.
LABEL_TO_INT = {'CG': 1, 'OR': 0}
df['label'] = df['label'].map(lambda raw: LABEL_TO_INT.get(raw, raw)).astype('int')

In [5]:
# Hold out 20% of the reviews as the test set; the fixed seed keeps the split repeatable
X = df.loc[:, ["category", "text_"]]
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Carve a smaller evaluation set out of the test split, limited to one product sector: books
test_set = pd.concat([X_test, y_test], axis="columns")  # features and label side by side
book_test_set = test_set.loc[test_set["category"].eq("Books_5")]
X_book_test = book_test_set.loc[:, ["category", "text_"]]
y_book_test = book_test_set.loc[:, "label"]

In [7]:
def _classification_scores(y_true, y_pred):
    """Return (accuracy, precision, recall) for one set of predictions."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
    )


def print_train_test_accuracy(model, trainSet, testSet, testSetSmall,
                              y_train_true=None, y_test_true=None, y_small_true=None):
    """Print evaluation metrics for an already-fitted classifier.

    Reports train accuracy, then accuracy/precision/recall on the full test
    set, then the same three metrics on the smaller books-only test set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    trainSet, testSet, testSetSmall : feature matrices for the training set,
        the full test set and the books-only test set, in that order.
    y_train_true, y_test_true, y_small_true : optional true labels matching the
        three feature matrices. When omitted they default to the notebook-level
        ``y_train``, ``y_test`` and ``y_book_test``, so existing four-argument
        calls behave exactly as before.

    Returns
    -------
    None. The metrics are printed only.
    """
    # Resolve the labels up front so the dependency on notebook globals is
    # explicit and can be overridden by the caller.
    if y_train_true is None:
        y_train_true = y_train
    if y_test_true is None:
        y_test_true = y_test
    if y_small_true is None:
        y_small_true = y_book_test

    print("Perfomace metrics on entire test set")
    y_pred_train = model.predict(trainSet)
    y_pred_test = model.predict(testSet)

    train_accuracy = accuracy_score(y_train_true, y_pred_train)
    # precision and recall are evaluated on the test set only
    test_accuracy, precision, recall = _classification_scores(y_test_true, y_pred_test)
    print(f"Train Accuracy: {train_accuracy*100:.2f}%")
    print(f"Test Accuracy: {test_accuracy*100:.2f}%, Precision: {precision*100:.2f}%, Recall: {recall*100:.2f}%")

    print("\nPerfomace metrics on one specific product sector: books")
    y_pred_test_small = model.predict(testSetSmall)
    test_accuracy_small, precision_small, recall_small = _classification_scores(
        y_small_true, y_pred_test_small
    )
    print(f"Test Accuracy: {test_accuracy_small*100:.2f}%, Precision: {precision_small*100:.2f}%, Recall: {recall_small*100:.2f}%")

### Feature Extraction
Note: call `fit_transform` on the training set only. On the test sets, call `transform` only, so that they reuse the vocabulary (and IDF weights) learned from the training data.

#### CountVectorizer + TfidfTransformer

In [8]:
# Token counts first, then TF-IDF re-weighting of those counts
cv = CountVectorizer()
tfidf = TfidfTransformer()

# Training reviews: the only place where vocabulary and IDF weights are learned
cv_train_features = cv.fit_transform(X_train["text_"])
tfidf_train_features = tfidf.fit_transform(cv_train_features)

# Full test set and books-only test set: reuse the fitted transformers (transform only)
cv_test_features, cv_test_features_small = (
    cv.transform(frame["text_"]) for frame in (X_test, X_book_test)
)
tfidf_test_features, tfidf_test_features_small = (
    tfidf.transform(counts) for counts in (cv_test_features, cv_test_features_small)
)

### DecisionTree model

In [9]:
# Seed the tree so the reported scores are reproducible: scikit-learn randomly
# permutes the candidate features at each split, so an unseeded tree can give
# different metrics on every run. 42 matches the seed used for the data split.
model = tree.DecisionTreeClassifier(random_state=42)
model.fit(tfidf_train_features, y_train)
print_train_test_accuracy(model, tfidf_train_features, tfidf_test_features, tfidf_test_features_small)

Perfomace metrics on entire test set
Train Accuracy: 100.00%
Test Accuracy: 78.04%, Precision: 76.72%, Recall: 80.08%

Perfomace metrics on one specific product sector: books
Test Accuracy: 81.09%, Precision: 81.25%, Recall: 81.43%


### SVM model

In [11]:
# Support-vector classifier with scikit-learn's default settings, trained on the TF-IDF features
model = svm.SVC().fit(tfidf_train_features, y_train)
print_train_test_accuracy(
    model,
    trainSet=tfidf_train_features,
    testSet=tfidf_test_features,
    testSetSmall=tfidf_test_features_small,
)

Perfomace metrics on entire test set
Train Accuracy: 98.75%
Test Accuracy: 91.96%, Precision: 93.29%, Recall: 90.31%

Perfomace metrics on one specific product sector: books
Test Accuracy: 91.80%, Precision: 93.30%, Recall: 90.28%


### Naive Bayes model

In [None]:
# GaussianNB needs dense input for predict as well as for fit, so every TF-IDF
# matrix is converted with .toarray() before use; passing the sparse matrices
# to predict raises an error. The dense training matrix is built once and
# reused for both fitting and scoring.
# NOTE(review): the dense arrays can be large; confirm they fit in memory.
dense_train_features = tfidf_train_features.toarray()
model = GaussianNB()
model.fit(dense_train_features, y_train)
print_train_test_accuracy(
    model,
    dense_train_features,
    tfidf_test_features.toarray(),
    tfidf_test_features_small.toarray(),
)