# Classification of the comments

## Installing necessary libraries

Pass this step this if the libries are already installed.

In [None]:
!npm install pandas
!npm install numpy
%pip install seaborn
!npm install matplotlib
%pip install scikit-learn

## Importing neccessary libraries

Importing all neccessary libraries that will be used in this Jupyter notebook.

In [None]:
import os
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
import string

## Preparing the collected comments from the vader script

Reading the collected comments from the .csv file and printing out the first and the last five entries of the data.

In [159]:
df_bad_comments = pd.read_csv('bad_comments.csv')
df_good_comments = pd.read_csv('good_comments.csv')

In [160]:
df_bad_comments.head(5)

Unnamed: 0,file,comment,sentiment
0,reviews_10003.json,**One of the worst espionage films of all time...,bad
1,reviews_100092.json,A railway line is being sabotaged and after on...,bad
2,reviews_100100.json,This might have worked better as a silent feat...,bad
3,reviews_10014.json,Possession is nine-tenths of the law.\r\n\r\nA...,bad
4,reviews_10014.json,Freddy's Revenge deserves redemption. If you d...,bad


In [161]:
df_good_comments.head(5)

Unnamed: 0,file,comment,sentiment
0,reviews_100.json,I just plain love this movie!,good
1,reviews_100.json,"Far from being a good movie, with tons of flaw...",good
2,reviews_100.json,Genuinely one of my favorite movies of all tim...,good
3,reviews_100.json,The very entertaining ending makes this film.\...,good
4,reviews_10002.json,Really good British neo-noir featuring great p...,good


Adding a new column by name 'class' to each DataFrame to indicate the sentiment.

In [166]:
df_bad_comments['class'] = 0
df_good_comments['class'] = 1

In [167]:
df_bad_comments.shape, df_good_comments.shape

((6252, 4), (22921, 4))

Merging of the goood and the bad comments.

In [168]:
df_merged_comments = pd.concat([df_bad_comments, df_good_comments], axis=0)
df_merged_comments.head()

Unnamed: 0,file,comment,sentiment,class
0,reviews_10003.json,**One of the worst espionage films of all time...,bad,0
1,reviews_100092.json,A railway line is being sabotaged and after on...,bad,0
2,reviews_100100.json,This might have worked better as a silent feat...,bad,0
3,reviews_10014.json,Possession is nine-tenths of the law.\r\n\r\nA...,bad,0
4,reviews_10014.json,Freddy's Revenge deserves redemption. If you d...,bad,0


In [169]:
df_merged_comments.columns

Index(['file', 'comment', 'sentiment', 'class'], dtype='object')

Removing the unnecessary classes for easier data classification.

In [170]:
df_comments = df_merged_comments.drop(['file','sentiment'], axis = 1)

In [171]:
df_comments.isnull().sum()

comment    0
class      0
dtype: int64

Fraction was used for the random shuffling of the data frame.

In [172]:
df_comments = df_comments.sample(frac=1)
df_comments.head()

Unnamed: 0,comment,class
15458,"Good wholesome move, very inspiring, especiall...",1
6577,"Right in the opening segment, director Siva la...",1
12481,"Yeah, pure nostalgia score for this one. But ...",1
209,"Linda Hunt is the museum curator ""Cuthbert"" wh...",0
4024,Every actor in this is **horrendous** with the...,0


Removing index column.

In [173]:
df_comments.reset_index(inplace=True)
df_comments.drop(['index'], axis=1, inplace=True)

In [174]:
df_comments.columns

Index(['comment', 'class'], dtype='object')

In [175]:
df_comments.head()

Unnamed: 0,comment,class
0,"Good wholesome move, very inspiring, especiall...",1
1,"Right in the opening segment, director Siva la...",1
2,"Yeah, pure nostalgia score for this one. But ...",1
3,"Linda Hunt is the museum curator ""Cuthbert"" wh...",0
4,Every actor in this is **horrendous** with the...,0


Pre-processing the comments. 

In [176]:
def preprocess_comments(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Remove text within square brackets
    text = re.sub(r'\W', ' ', text)  # Remove non-word characters 
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text).strip()  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)  # Remove punctuation
    text = re.sub(r'\n', ' ', text).strip()  # Remove newlines
    text = re.sub(r'\r', '', text).strip()  # Remove carriage returns
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing digits
    return text


In [177]:
df_comments['comment'] = df_comments ['comment'].apply(preprocess_comments)

In [178]:
X = df_comments['comment']
y = df_comments['class']

Sampling test and train data.

In [179]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

In [180]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

## Logistic Regression

In [181]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train_vectorized, y_train)

In [182]:
pred_lr = LR.predict(X_test_vectorized)
LR.score(X_test_vectorized, y_test)

0.8418372343551073

In [183]:
print("             Logistic Regression Classification Report:")
print(classification_report(y_test, pred_lr, zero_division=0))

             Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.31      0.45      2190
           1       0.84      0.99      0.91      8021

    accuracy                           0.84     10211
   macro avg       0.86      0.65      0.68     10211
weighted avg       0.85      0.84      0.81     10211



## Naive Bayes

In [184]:
from sklearn.naive_bayes import BernoulliNB
NB = BernoulliNB()
NB.fit(X_train_vectorized, y_train)

In [185]:
pred_nb = NB.predict(X_test_vectorized)
NB.score(X_test_vectorized, y_test)

0.7932621682499266

In [186]:
print("                 Naive Bayes Classification Report:")
print(classification_report(y_test, pred_nb, zero_division=0))

                 Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.17      0.26      2190
           1       0.81      0.96      0.88      8021

    accuracy                           0.79     10211
   macro avg       0.68      0.57      0.57     10211
weighted avg       0.76      0.79      0.75     10211



In [187]:
from sklearn.ensemble import RandomForestClassifier
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train_vectorized, y_train)

In [188]:
pred_rf = RF.predict(X_test_vectorized)

In [189]:
RF.score(X_test_vectorized, y_test)

0.7993340515130741

## Random Forest Classifier

In [190]:
print("                Random Forest Classification Report:")
print(classification_report(y_test, pred_rf, zero_division=0))

                Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.08      0.15      2190
           1       0.80      1.00      0.89      8021

    accuracy                           0.80     10211
   macro avg       0.81      0.54      0.52     10211
weighted avg       0.81      0.80      0.73     10211



## Decision Tree Classifier

In [195]:
from sklearn.tree import DecisionTreeClassifier
DT = DecisionTreeClassifier(random_state=42)
DT.fit(X_train_vectorized, y_train)

In [196]:
pred_dt = DT.predict(X_test_vectorized)

In [197]:
DT.score(X_test_vectorized, y_test)

0.7511507198119675

In [201]:
print("                Decision Tree Classification Report:")
print(classification_report(y_test, pred_dt, zero_division=0))

                Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.40      0.41      2190
           1       0.84      0.85      0.84      8021

    accuracy                           0.75     10211
   macro avg       0.63      0.62      0.63     10211
weighted avg       0.75      0.75      0.75     10211



## Testing 

Running the scraped data from TDBM database through Logistic Regression, Naive Bayes, Random Forest and Decision Tree classifiers.


In [206]:
def output_label(n):
    if n == 0:
        return 'Bad comment'
    elif n == 1:
        return 'Good comment'
    
def manual_testing(comment):
    testing_comments = {"comment": [comment]}
    new_def_test = pd.DataFrame(testing_comments)
    new_def_test['comment'] = new_def_test['comment'].apply(preprocess_comments)
    new_x_test = new_def_test['comment']
    new_x_test_vectorized = vectorizer.transform(new_x_test)
    pred_lr = LR.predict(new_x_test_vectorized)
    pred_nb = NB.predict(new_x_test_vectorized)
    pred_rf = RF.predict(new_x_test_vectorized)
    pred_dt = DT.predict(new_x_test_vectorized)

    return print(f" Logistic Regression Prediction: {output_label(pred_lr)}\n Naive Bayes Prediction: {output_label(pred_nb)}\n Random Forest Prediction: {output_label(pred_rf)} \n Decision Tree Prediction: {output_label(pred_dt)}") 
    

In [214]:
test_comment = str(input('Enter a comment to test: '))
manual_testing(test_comment)

 Logistic Regression Prediction: Bad comment
 Naive Bayes Prediction: Good comment
 Random Forest Prediction: Good comment 
 Decision Tree Prediction: Bad comment
