# Import Libraries

In [2]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import re,os
import nltk
import glob

In [4]:
# Root folder of the review corpus, given as a path relative to the working directory.
folder_path = 'movie_reviews'
folder_path

'movie_reviews'

In [5]:
# List the entries directly under the corpus root to see which sub-folders exist.
folders = os.listdir(folder_path)
folders

['neg', 'pos']

In [6]:
# Directory holding the positive reviews; os.path.join uses the platform's separator.
pos_review_path = os.path.join(folder_path,'pos')
pos_review_path

'movie_reviews\\pos'

In [7]:
# Directory holding the negative reviews; os.path.join uses the platform's separator.
neg_review_path = os.path.join(folder_path,'neg')
neg_review_path

'movie_reviews\\neg'

In [8]:
# Collect the .txt review files of each class.
# glob.glob() returns matches in an arbitrary, file-system-dependent order, so
# the lists are sorted: the row order of everything derived from them (features,
# labels, and therefore which rows land in the train/test split) is then the
# same on every machine. The pattern is built with os.path.join rather than a
# hard-coded '/' so it is well-formed on any platform.
pos_review_files = sorted(glob.glob(os.path.join(pos_review_path, '*.txt')))
neg_review_files = sorted(glob.glob(os.path.join(neg_review_path, '*.txt')))

In [12]:
# Sanity check: number of negative review files matched by the glob.
len(neg_review_files)

1000

In [13]:
# Sanity check: number of positive review files matched by the glob.
len(pos_review_files)

1000

In [21]:
def load_reviews(file_names):
    """Read each review file and return its text with non-letters collapsed.

    Parameters
    ----------
    file_names : iterable of str
        Paths of the review files to read, in the order the texts should appear.

    Returns
    -------
    list of str
        One string per file, where every run of characters other than
        A-Z / a-z has been replaced by a single space.
    """
    reviews = []
    for file_name in file_names:
        # The context manager closes the handle even if reading fails.
        # An explicit encoding keeps decoding independent of the platform
        # default; undecodable bytes are replaced instead of raising, which is
        # harmless here because every non-letter is turned into a space below.
        with open(file_name, 'r', encoding='utf-8', errors='replace') as f:
            text = f.read()
        reviews.append(re.sub('[^A-Za-z]+', ' ', text))
    return reviews


# Positive reviews first, then negative ones: the label vector built later
# relies on exactly this ordering.
review_list = load_reviews(pos_review_files) + load_reviews(neg_review_files)

In [22]:
# Sanity check: total number of review texts loaded (positive + negative).
len(review_list)

2000

# Target Variable

In [24]:
# pos_target = np.ones(1000, dtype = 'int')
# pos_target

In [25]:
# Label vectors sized from the file lists: 1 marks a positive review, 0 a negative one.
pos_target = np.full(len(pos_review_files), 1, dtype='int')
neg_target = np.full(len(neg_review_files), 0, dtype='int')

In [28]:
# neg_target

In [32]:
# class_2 = np.full(1000,0)
# class_2

In [33]:
# Full target vector: positive labels followed by negative labels, wrapped in a
# pandas Series so it carries a 0..n-1 index.
y = pd.Series(np.concatenate((pos_target, neg_target)))
y

0       1
1       1
2       1
3       1
4       1
       ..
1995    0
1996    0
1997    0
1998    0
1999    0
Length: 2000, dtype: int32

# 1. Bag of Words

In [46]:
# Bag-of-words features: one row per review, one column per vocabulary term,
# each cell holding the raw count of that term in that review.
# lowercase=True folds case, stop_words='english' drops scikit-learn's built-in
# English stop-word list, and min_df=10 keeps only terms that occur in at
# least 10 reviews.
count_vector = CountVectorizer(lowercase=True, stop_words='english', min_df=10)
x_count = count_vector.fit_transform(review_list)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases so the
# cell runs on either side of that change.
if hasattr(count_vector, 'get_feature_names_out'):
    count_vect_column = list(count_vector.get_feature_names_out())
else:
    count_vect_column = count_vector.get_feature_names()
print('Number of columns or Attributes in the given data:', len(count_vect_column))

# Convert the sparse matrix to a dense, labelled table for inspection.
x_count_vector_df = pd.DataFrame(x_count.toarray(), columns = count_vect_column)
x_count_vector_df

Number of columns or Attributes in the given data: 7818




Unnamed: 0,aaron,abandon,abandoned,abilities,ability,able,ably,aboard,abound,abruptly,...,youth,yup,zane,zany,zellweger,zero,zeta,zombie,zone,zooms
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 2. TF-IDF Vectorizer

In [49]:
# TF-IDF features: same tokenisation settings as the bag-of-words cell
# (lower-casing, built-in English stop words, terms must occur in at least
# 10 reviews), but each cell holds a TF-IDF weight instead of a raw count.
tfidf_vect = TfidfVectorizer(lowercase=True, stop_words='english', min_df=10)
x_tfidf_vect = tfidf_vect.fit_transform(review_list)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases so the
# cell runs on either side of that change.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    tf_idf_column = list(tfidf_vect.get_feature_names_out())
else:
    tf_idf_column = tfidf_vect.get_feature_names()

# Convert the sparse matrix to a dense, labelled table; this frame is what the
# train/test split consumes.
x_tfidf_df = pd.DataFrame(x_tfidf_vect.toarray(), columns = tf_idf_column)
x_tfidf_df



Unnamed: 0,aaron,abandon,abandoned,abilities,ability,able,ably,aboard,abound,abruptly,...,youth,yup,zane,zany,zellweger,zero,zeta,zombie,zone,zooms
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.078118,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.03594,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,0.0,0.0,0.0,0.0,0.041443,0.0,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,0.0,0.065398,0.0,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train Test Split

In [50]:
# Hold out 20% of the TF-IDF rows for testing. The split is stratified on the
# label so both classes keep their proportions, and the seed is fixed so the
# same rows are selected on every run.
# NOTE(review): the TF-IDF vocabulary and IDF weights were fitted on all
# reviews before this split, so the held-out rows influenced the features.
# Consider splitting the raw texts first and fitting the vectoriser on the
# training part only.
x_train, x_test, y_train, y_test = train_test_split(
    x_tfidf_df,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [53]:
# x_test

# Model Training

# 1. Gaussian NB

In [54]:
# Fit a Gaussian Naive Bayes classifier on the training split.
gnb_model = GaussianNB()
gnb_model.fit(X=x_train, y=y_train)

In [59]:
# Score the fitted Gaussian NB model on the held-out rows.
y_pred = gnb_model.predict(x_test)

# Compute every metric first, then report them together.
acc_score = accuracy_score(y_test, y_pred)
clf_matrix = confusion_matrix(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)

print('Accuracy Score is :', acc_score)
print('Confusion matrix is :\n', clf_matrix)
print('Classification report is:\n', clf_report)

Accuracy Score is : 0.685
Confusion matrix is :
 [[139  61]
 [ 65 135]]
Classification report is:
               precision    recall  f1-score   support

           0       0.68      0.69      0.69       200
           1       0.69      0.68      0.68       200

    accuracy                           0.69       400
   macro avg       0.69      0.69      0.68       400
weighted avg       0.69      0.69      0.68       400



# 2. Multinomial NB

In [60]:
# Fit a Multinomial Naive Bayes classifier on the training split.
mnb_model = MultinomialNB()
mnb_model.fit(X=x_train, y=y_train)

In [61]:
# Score the fitted Multinomial NB model on the held-out rows.
y_pred = mnb_model.predict(x_test)

# Compute every metric first, then report them together.
acc_score = accuracy_score(y_test, y_pred)
clf_matrix = confusion_matrix(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)

print('Accuracy Score is :', acc_score)
print('Confusion matrix is :\n', clf_matrix)
print('Classification report is:\n', clf_report)

Accuracy Score is : 0.8125
Confusion matrix is :
 [[172  28]
 [ 47 153]]
Classification report is:
               precision    recall  f1-score   support

           0       0.79      0.86      0.82       200
           1       0.85      0.77      0.80       200

    accuracy                           0.81       400
   macro avg       0.82      0.81      0.81       400
weighted avg       0.82      0.81      0.81       400



# 3. Bernoulli NB

In [62]:
# Fit a Bernoulli Naive Bayes classifier on the training split.
bnb_model = BernoulliNB()
bnb_model.fit(X=x_train, y=y_train)

In [63]:
# Score the fitted Bernoulli NB model on the held-out rows.
y_pred = bnb_model.predict(x_test)

# Compute every metric first, then report them together.
acc_score = accuracy_score(y_test, y_pred)
clf_matrix = confusion_matrix(y_test, y_pred)
clf_report = classification_report(y_test, y_pred)

print('Accuracy Score is :', acc_score)
print('Confusion matrix is :\n', clf_matrix)
print('Classification report is:\n', clf_report)

Accuracy Score is : 0.7925
Confusion matrix is :
 [[174  26]
 [ 57 143]]
Classification report is:
               precision    recall  f1-score   support

           0       0.75      0.87      0.81       200
           1       0.85      0.71      0.78       200

    accuracy                           0.79       400
   macro avg       0.80      0.79      0.79       400
weighted avg       0.80      0.79      0.79       400

