In [1]:
import pandas as pd
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score



nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nvish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import warnings
warnings.filterwarnings('ignore')

import dataset

In [3]:
#importing the training data
df = pd.read_csv(r"Datasets\aclImdb_data_50000.csv")
print(df.shape)
df

(50000, 2)


Unnamed: 0,text,label
0,In a college dorm a guy is killed by somebody ...,neg
1,The production year says it all. The movie is ...,neg
2,A pleasant surprise! I expected a further down...,pos
3,"The ""math"" aspect to this is merely a gimmick ...",neg
4,Some of the greatest and most loved horror mov...,neg
...,...,...
49995,I found this gem in a rack the local video ren...,neg
49996,If we consider three films with a similar subj...,pos
49997,King of Masks (Bian Lian in China) is a shocki...,pos
49998,It's hard to know what was going through Per K...,neg


**Exploratery data analysis**

In [4]:
# summary of data
df.describe()

Unnamed: 0,text,label
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,neg
freq,5,25000


In [5]:
# label count
df["label"].value_counts()

label
neg    25000
pos    25000
Name: count, dtype: int64

*data is balanced*

In [6]:
#Tokenization of text
tokenizer=ToktokTokenizer()
#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [7]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text
#Apply function on review column
df['text']=df['text'].apply(denoise_text)

In [8]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
#Apply function on review column
df['text']=df['text'].apply(remove_special_characters)

In [9]:
#set stopwords to english
stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
#Apply function on review column
df['text']=df['text'].apply(remove_stopwords)
df

{"hasn't", 'you', 'there', 'a', 'for', 'until', 'their', "shouldn't", 'won', 'more', 'myself', 'don', "you're", 'weren', 'our', 'we', 'had', 'other', 'that', 'when', 'shan', "that'll", 'have', 'what', 'is', 'should', "shan't", 'some', "couldn't", 'his', 'he', 'needn', 'by', 'll', 'themselves', 'shouldn', 'couldn', 'such', 'once', "don't", 'ourselves', 'i', 'which', 's', 'it', 'if', "you'll", 'them', 'she', 'under', 'wouldn', "you've", 'was', 'whom', 'and', 'down', 'again', 'from', "aren't", 'isn', 'now', "didn't", 'haven', 'after', 'above', 'with', 'being', 'the', 'didn', 'to', 'same', 'too', 't', 'not', "doesn't", 'having', 'during', "wasn't", "needn't", 'herself', 'into', 'while', 'its', 'both', "it's", "wouldn't", 'yourself', 'just', 'aren', 'yourselves', 'or', 've', 'her', "haven't", 'where', "hadn't", 'own', 'were', "isn't", 'me', 'out', 'because', 'yours', 'mustn', 'then', 'but', 'at', 'y', 'hasn', 'how', 'my', 'does', 'each', 'no', 'will', 'between', 'ours', 'over', 'in', 'any',

Unnamed: 0,text,label
0,college dorm guy killed somebody scythe girlfr...,neg
1,production year says movie marauding mess poli...,neg
2,pleasant surprise expected downgrade along lin...,pos
3,math aspect merely gimmick try set TV show apa...,neg
4,greatest loved horror movies wicked sense humo...,neg
...,...,...
49995,found gem rack local video rental store tapes ...,neg
49996,consider three films similar subject one made ...,pos
49997,King Masks Bian Lian China shockingly beautifu...,pos
49998,hard know going Per Kristensen Morten Lindberg...,neg


In [10]:
#normalized train reviews
norm_train_reviews=df.text[:40000]
norm_train_reviews[0]

'college dorm guy killed somebody scythe girlfriend Beth Dorie Barton discovers tries commit suicide Shes institutionalized year later shes new boyfriend named Hank Joseph Lawrence spend Spring Break Hank four mindless friends BIG beautiful condo Florida Naturally killer pops reason starts killing againLousy slasher thrillera textbook example lowbudget horror movie starters large portions film ENDLESS filler six idiots videotaping fun fun audience getting drunk acting stupid etc etc Also nudity Im saying horror film needs nudity ANYTHING liven would helped None deaths really shown hear little bloody gore Theres one REAL gruesome onebut thats till endWith exceptions acting sucks Dorie Barton dreadful main woman Tom Jay Jones lousy Oz Chad Allen pops Brad hes TERRIBLE Lawrence actually goodhandsome hunky giving crap Jeff Conaway pops small role pretty good jobLogic lapses aboundafter realize friend killed two girls casually talk sex Bastons non reaction seeing friend getting killed kind 

In [11]:
#Normalized test reviews
norm_test_reviews=df.text[40000:]
norm_test_reviews[40001]

'Despite 2001 movie direction kind 90s arthouse style considered old outofdate years ago cheesy cuts effects painful watch script decent enough scenes kind captivate like taxi driver brings bridge night story line detective whos sister killed obsessed suicide plain terrible performance actor plays Selma Blairs married boyfriend seriously bothered sit whole thing though rare kind random whatisthis movie find TV decide watch'

**Bag of words**

In [12]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0.0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

BOW_cv_train: (40000, 6670554)
BOW_cv_test: (10000, 6670554)


**Tf-idf**

In [13]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0.0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(norm_train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(norm_test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 6670554)
Tfidf_test: (10000, 6670554)


In [14]:
#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(df['label'])
print(sentiment_data.shape)

(50000, 1)


In [15]:
#Spliting the sentiment data
train_sentiments=sentiment_data[:40000]
test_sentiments=sentiment_data[40000:]
print(train_sentiments.shape,test_sentiments.shape)

(40000, 1) (10000, 1)


**Logistic Regression** \
*Let us build logistic regression model for both bag of words and tfidf features*

In [16]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,train_sentiments)
print(lr_bow)
#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_train_reviews,train_sentiments)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


In [17]:
#Predicting the model for bag of words
lr_bow_predict=lr_bow.predict(cv_test_reviews)
print(lr_bow_predict)
##Predicting the model for tfidf features
lr_tfidf_predict=lr_tfidf.predict(tv_test_reviews)
print(lr_tfidf_predict)

[1 1 0 ... 1 1 0]
[1 1 0 ... 1 1 0]


In [18]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_sentiments,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)
#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(test_sentiments,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_bow_score : 0.7396
lr_tfidf_score : 0.7398


In [19]:
#confusion matrix for bag of words
print('confusion matrix for bag of words')
cm_bow=confusion_matrix(test_sentiments,lr_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
print('confusion matrix for tfidf features')
cm_tfidf=confusion_matrix(test_sentiments,lr_tfidf_predict,labels=[1,0])
print(cm_tfidf)

confusion matrix for bag of words
[[3726 1244]
 [1360 3670]]
confusion matrix for tfidf features
[[3738 1232]
 [1370 3660]]


*Both approaches perform similarly with slight differences. TF-IDF has a marginally higher TP and lower FP but also a slightly higher FN and lower TN compared to the Bag of Words approach.*

In [20]:
#Classification report for bag of words
print('Classification report for bag of words')
lr_bow_report=classification_report(test_sentiments,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

#Classification report for tfidf features
print('Classification report for tfidf features')
lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)

Classification report for bag of words
              precision    recall  f1-score   support

    Positive       0.75      0.73      0.74      5030
    Negative       0.73      0.75      0.74      4970

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000

Classification report for tfidf features
              precision    recall  f1-score   support

    Positive       0.75      0.73      0.74      5030
    Negative       0.73      0.75      0.74      4970

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000



*Both methods yield identical classification performance, indicating no significant difference between using Bag of Words and TF-IDF features in this scenario.*

**support vector machines** \
*Stochastic gradient descent or Linear support vector machines for bag of words and tfidf features*

In [21]:
#training the linear svm
svm=SGDClassifier(loss='hinge',max_iter=500,random_state=42)
#fitting the svm for bag of words
svm_bow=svm.fit(cv_train_reviews,train_sentiments)
print(svm_bow)
#fitting the svm for tfidf features
svm_tfidf=svm.fit(tv_train_reviews,train_sentiments)
print(svm_tfidf)

SGDClassifier(max_iter=500, random_state=42)
SGDClassifier(max_iter=500, random_state=42)


In [22]:
#Predicting the model for bag of words
svm_bow_predict=svm_bow.predict(cv_test_reviews)
print(svm_bow_predict)
#Predicting the model for tfidf features
svm_tfidf_predict=svm_tfidf.predict(tv_test_reviews)
print(svm_tfidf_predict)

[1 1 1 ... 1 1 1]
[1 1 1 ... 1 1 1]


In [23]:
#Accuracy score for bag of words
svm_bow_score=accuracy_score(test_sentiments,svm_bow_predict)
print("svm_bow_score :",svm_bow_score)
#Accuracy score for tfidf features
svm_tfidf_score=accuracy_score(test_sentiments,svm_tfidf_predict)
print("svm_tfidf_score :",svm_tfidf_score)

svm_bow_score : 0.5075
svm_tfidf_score : 0.497


In [24]:
#confusion matrix for bag of words
print("confusion matrix for bag of words")
cm_bow=confusion_matrix(test_sentiments,svm_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
print("confusion matrix for tfidf features")
cm_tfidf=confusion_matrix(test_sentiments,svm_tfidf_predict,labels=[1,0])
print(cm_tfidf)

confusion matrix for bag of words
[[4970    0]
 [4925  105]]
confusion matrix for tfidf features
[[4970    0]
 [5030    0]]


*Both models are performing poorly with these new confusion matrices, with TF-IDF performing worse by predicting only one class. The models need to be re-evaluated and possibly retrained to improve their performance on predicting both classes correctly.*

In [25]:
#Classification report for bag of words
print("Classification report for bag of words") 
svm_bow_report=classification_report(test_sentiments,svm_bow_predict,target_names=['Positive','Negative'])
print(svm_bow_report)
#Classification report for tfidf features
print("Classification report for tfidf features")
svm_tfidf_report=classification_report(test_sentiments,svm_tfidf_predict,target_names=['Positive','Negative'])
print(svm_tfidf_report)

Classification report for bag of words
              precision    recall  f1-score   support

    Positive       1.00      0.02      0.04      5030
    Negative       0.50      1.00      0.67      4970

    accuracy                           0.51     10000
   macro avg       0.75      0.51      0.35     10000
weighted avg       0.75      0.51      0.35     10000

Classification report for tfidf features
              precision    recall  f1-score   support

    Positive       0.00      0.00      0.00      5030
    Negative       0.50      1.00      0.66      4970

    accuracy                           0.50     10000
   macro avg       0.25      0.50      0.33     10000
weighted avg       0.25      0.50      0.33     10000



*Both models are inadequate in their current state, especially the TF-IDF model, which completely fails to predict the positive class. Significant improvements and re-evaluation are necessary.*

**Naive Bayes** \
*Multinomial Naive Bayes for bag of words and tfidf features*

In [26]:
#training the model
mnb=MultinomialNB()
#fitting the svm for bag of words
mnb_bow=mnb.fit(cv_train_reviews,train_sentiments)
print(mnb_bow)
#fitting the svm for tfidf features
mnb_tfidf=mnb.fit(tv_train_reviews,train_sentiments)
print(mnb_tfidf)

MultinomialNB()
MultinomialNB()


In [27]:
#Predicting the model for bag of words
mnb_bow_predict=mnb_bow.predict(cv_test_reviews)
print(mnb_bow_predict)
#Predicting the model for tfidf features
mnb_tfidf_predict=mnb_tfidf.predict(tv_test_reviews)
print(mnb_tfidf_predict)

[0 1 0 ... 1 1 0]
[0 1 0 ... 1 1 0]


In [28]:
#Accuracy score for bag of words
mnb_bow_score=accuracy_score(test_sentiments,mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)
#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(test_sentiments,mnb_tfidf_predict)
print("mnb_tfidf_score :",mnb_tfidf_score)

mnb_bow_score : 0.7396
mnb_tfidf_score : 0.7403


In [29]:
#confusion matrix for bag of words
print('confusion matrix for bag of words')
cm_bow=confusion_matrix(test_sentiments,mnb_bow_predict,labels=[1,0])
print(cm_bow)
#confusion matrix for tfidf features
print('confusion matrix for tfidf features')
cm_tfidf=confusion_matrix(test_sentiments,mnb_tfidf_predict,labels=[1,0])
print(cm_tfidf)

confusion matrix for bag of words
[[3694 1276]
 [1328 3702]]
confusion matrix for tfidf features
[[3730 1240]
 [1357 3673]]


*Both models now show very similar and satisfactory performance, indicating that either approach could be used effectively*

In [30]:
#Classification report for bag of words 
print("Classification report for bag of words")
mnb_bow_report=classification_report(test_sentiments,mnb_bow_predict,target_names=['Positive','Negative'])
print(mnb_bow_report)
#Classification report for tfidf features
print("Classification report for tfidf features")
mnb_tfidf_report=classification_report(test_sentiments,mnb_tfidf_predict,target_names=['Positive','Negative'])
print(mnb_tfidf_report)

Classification report for bag of words
              precision    recall  f1-score   support

    Positive       0.74      0.74      0.74      5030
    Negative       0.74      0.74      0.74      4970

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000

Classification report for tfidf features
              precision    recall  f1-score   support

    Positive       0.75      0.73      0.74      5030
    Negative       0.73      0.75      0.74      4970

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000



*Both Bag of Words and TF-IDF models perform similarly with a slight edge in precision for the positive class in the TF-IDF model. Overall, both methods achieve a balanced and consistent performance with an accuracy of 0.75 and similar macro and weighted averages*

**Conclusion** \
observed that both logistic regression and multinomial naive bayes model performing well compared to linear support vector machines

In [31]:
import pickle

# save models

with open('models/lr_bow.pkl', 'wb') as f:
    pickle.dump(lr_bow, f)

with open('models/lr_tfidf.pkl', 'wb') as f:
    pickle.dump(lr_tfidf, f)

with open('models/svm_bow.pkl', 'wb') as f:
    pickle.dump(svm_bow, f)

with open('models/svm_tfidf.pkl', 'wb') as f:
    pickle.dump(svm_tfidf, f)

with open('models/mnb_bow.pkl', 'wb') as f:
    pickle.dump(mnb_bow, f)

with open('models/mnb_tfidf.pkl', 'wb') as f:
    pickle.dump(mnb_tfidf, f)