In [None]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

In [None]:
# Read the train and test review CSVs from their hard-coded Drive paths.
# Both files go through the same reader, so one map() call handles the pair.
df_train, df_test = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/NLP Projects/Basic ML Projects/Movie Review Classification/trainp.csv',
        '/content/drive/MyDrive/NLP Projects/Basic ML Projects/Movie Review Classification/testp.csv',
    ),
)
# Column / dtype / non-null summary of the training frame. The author's note
# describes the dataset as balanced; info() itself does not show label counts.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       25000 non-null  object
 1   sentiment  25000 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


In [None]:
# Stack the train and test frames into a single corpus.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each half
# keeps its own 0-based labels, so every label occurs twice and any label-based
# lookup or alignment against the 0..n-1 indexed feature frames is ambiguous.
DF = [df_train, df_test]
df = pd.concat(DF, ignore_index=True)
len(df)

50000

In [None]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,Text,sentiment
0,deni purchas ebay high expect incred print wor...,neg
1,saddest thing tribut almost singer includ othe...,neg
2,last night decid watch prequel shall say call ...,neg
3,admit like first half sleeper look good act ev...,neg
4,impress film especi fact went cinema famili go...,neg


# Using Bag of Words Model

In [None]:
# Bag-of-words features: raw term counts, vocabulary capped at 5000 terms
# (a rough cap picked by the author because of RAM limits).
# NOTE(review): the vectorizer is fitted on the whole combined frame, i.e.
# before the train/test split that happens in a later cell.
vectorizer = CountVectorizer(max_features=5000)
bow_features = vectorizer.fit_transform(df['Text'])

# Densify the sparse count matrix and wrap it in a DataFrame with default
# integer row and column labels.
bow_df = pd.DataFrame(bow_features.toarray())

# The preview is almost entirely zeros, which illustrates BOW sparsity.
bow_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4960,4961,4962,4963,4964,4965,4966,4967,4968,4969,4970,4971,4972,4973,4974,4975,4976,4977,4978,4979,4980,4981,4982,4983,4984,4985,4986,4987,4988,4989,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Encode the string labels as integers: 'pos' -> 1, 'neg' -> 0.
# The explicit cast pins the column to an integer dtype instead of relying on
# replace() to infer one from an object column (that implicit downcast is
# deprecated in recent pandas). A leftover non-numeric label now raises here
# rather than flowing silently into the classifiers.
# Re-running the cell is safe: replace() finds nothing to map and the cast is a no-op.
df = df.replace({'sentiment': {'pos': 1, 'neg': 0}}).astype({'sentiment': 'int64'})

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    bow_df,
    df['sentiment'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Logistic regression on standardised bag-of-words counts.
# 'saga' was picked by the author as the faster solver for large data. It
# shuffles the samples while optimising, so random_state is fixed to make the
# fitted coefficients (and the reported scores) repeatable across runs.
lr_bow = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='saga', random_state=42),
)
lr_bow.fit(x_train, y_train)
predict_lr = lr_bow.predict(x_test)

# Multinomial Naive Bayes on the raw (unscaled) counts.
nb_bow = MultinomialNB()
nb_bow.fit(x_train, y_train)
predict_nb = nb_bow.predict(x_test)



In [None]:
# Per-class precision / recall / F1 for the bag-of-words logistic regression.
# Label 0 is reported as 'neg' and label 1 as 'pos'.
print(
    classification_report(
        y_test,
        predict_lr,
        labels=[0, 1],
        target_names=['neg', 'pos'],
    )
)

              precision    recall  f1-score   support

         neg       0.87      0.86      0.86      7470
         pos       0.86      0.87      0.87      7530

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000



In [None]:
# Per-class precision / recall / F1 for the bag-of-words Naive Bayes model.
# Label 0 is reported as 'neg' and label 1 as 'pos'.
print(
    classification_report(
        y_test,
        predict_nb,
        labels=[0, 1],
        target_names=['neg', 'pos'],
    )
)

              precision    recall  f1-score   support

         neg       0.85      0.85      0.85      7470
         pos       0.85      0.85      0.85      7530

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



# Using TF-IDF Model


In [None]:
# TF-IDF features with the vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on the whole combined frame, i.e.
# before the train/test split that happens in a later cell, so the IDF weights
# are computed with the held-out rows included.
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = vectorizer.fit_transform(df['Text'])

# Densify the sparse matrix and wrap it in a DataFrame with default integer labels.
tfidf_df = pd.DataFrame(tfidf_features.toarray())

In [None]:
# Hold out 30% of the rows for testing.
# The seed matches the bag-of-words split (random_state=42): both feature
# frames have one row per review in the same order, so an identical seed puts
# the same reviews in the test set and the model scores are directly comparable.
x_train, x_test, y_train, y_test = train_test_split(
    tfidf_df,
    df['sentiment'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Logistic regression on standardised TF-IDF features.
# 'saga' shuffles the samples while optimising, so random_state is fixed to
# make the fitted coefficients (and the reported scores) repeatable across runs.
lr_tfidf = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='saga', random_state=42),
)
lr_tfidf.fit(x_train, y_train)
predict_lr = lr_tfidf.predict(x_test)

# Multinomial Naive Bayes on the unscaled TF-IDF weights.
nb_tfidf = MultinomialNB()
nb_tfidf.fit(x_train, y_train)
predict_nb = nb_tfidf.predict(x_test)

In [None]:
# Evaluation for LR (TF-IDF features).
# target_names labels the rows 'neg'/'pos' (0 -> neg, 1 -> pos), matching the
# bag-of-words reports instead of printing bare 0/1.
print(
    classification_report(
        y_test,
        predict_lr,
        labels=[0, 1],
        target_names=['neg', 'pos'],
    )
)

              precision    recall  f1-score   support

           0       0.89      0.86      0.88      7467
           1       0.87      0.90      0.88      7533

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000



In [None]:
# Evaluation for NB (TF-IDF features).
# target_names labels the rows 'neg'/'pos' (0 -> neg, 1 -> pos), matching the
# bag-of-words reports instead of printing bare 0/1.
print(
    classification_report(
        y_test,
        predict_nb,
        labels=[0, 1],
        target_names=['neg', 'pos'],
    )
)

              precision    recall  f1-score   support

           0       0.86      0.84      0.85      7467
           1       0.84      0.86      0.85      7533

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000



# Using Hashing Model

In [None]:
# Hashed term features in 10000 buckets. alternate_sign=False keeps every
# value non-negative, which MultinomialNB requires of its inputs.
hashvectorizer = HashingVectorizer(n_features=10000, alternate_sign=False)
hash_features = hashvectorizer.fit_transform(df['Text'])

# Densify the sparse matrix and wrap it in a DataFrame with default integer labels.
hash_df = pd.DataFrame(hash_features.toarray())

# Preview the first five rows.
hash_df.head()

In [None]:
# Hold out 30% of the rows for testing.
# The seed matches the bag-of-words split (random_state=42): both feature
# frames have one row per review in the same order, so an identical seed puts
# the same reviews in the test set and the model scores are directly comparable.
x_train, x_test, y_train, y_test = train_test_split(
    hash_df,
    df['sentiment'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Logistic regression on standardised hashed features.
# - The output recorded under this cell shows a non-convergence warning, so the
#   iteration cap is raised above the library default; increase it further if
#   the warning still appears.
# - 'sag' shuffles the samples while optimising, so random_state is fixed to
#   make the fit repeatable across runs.
lr_hash = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='sag', max_iter=500, random_state=42),
)
lr_hash.fit(x_train, y_train)
predict_lr = lr_hash.predict(x_test)

# Multinomial Naive Bayes on the unscaled (non-negative) hashed features.
nb_hash = MultinomialNB()
nb_hash.fit(x_train, y_train)
predict_nb = nb_hash.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Evaluation for LR (hashed features).
# target_names labels the rows 'neg'/'pos' (0 -> neg, 1 -> pos), matching the
# bag-of-words reports instead of printing bare 0/1.
print(
    classification_report(
        y_test,
        predict_lr,
        labels=[0, 1],
        target_names=['neg', 'pos'],
    )
)

              precision    recall  f1-score   support

           0       0.83      0.82      0.83      7467
           1       0.82      0.83      0.83      7533

    accuracy                           0.83     15000
   macro avg       0.83      0.83      0.83     15000
weighted avg       0.83      0.83      0.83     15000



In [None]:
# Evaluation for NB (hashed features).
# target_names labels the rows 'neg'/'pos' (0 -> neg, 1 -> pos), matching the
# bag-of-words reports instead of printing bare 0/1.
print(
    classification_report(
        y_test,
        predict_nb,
        labels=[0, 1],
        target_names=['neg', 'pos'],
    )
)

              precision    recall  f1-score   support

           0       0.73      0.78      0.75      7467
           1       0.77      0.71      0.74      7533

    accuracy                           0.75     15000
   macro avg       0.75      0.75      0.75     15000
weighted avg       0.75      0.75      0.75     15000

