<a href="https://colab.research.google.com/github/sumukhbhat12/Natural-Language-Processing-Course/blob/main/NLTP_Course_simple_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The dataset used is IMDB 50k movie reviews dataset available in kaggle

https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [4]:
import pandas as pd
import numpy as np

**import the csv dataset using pd.read_csv() method**

In [15]:
import csv

dataset = pd.read_csv('/content/IMDB Dataset.csv', on_bad_lines=None)

print(dataset.shape)

print(dataset.head(10))

(50000, 2)
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
5  Probably my all-time favorite movie, a story o...  positive
6  I sure would like to see a resurrection of a u...  positive
7  This show was an amazing, fresh & innovative i...  negative
8  Encouraged by the positive comments about this...  negative
9  If you like original gut wrenching laughter yo...  positive


check for null values

In [16]:
print(dataset.isnull().sum())
print(dataset['sentiment'].value_counts())

review       0
sentiment    0
dtype: int64
positive    25000
negative    25000
Name: sentiment, dtype: int64


**Text Normalization**

imports

In [17]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud, STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

import spacy
import re, string, unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob, Word
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from bs4 import BeautifulSoup

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
#Tokenization of text
tokenizers = ToktokTokenizer()

#English Stopwords
stopwords = nltk.corpus.stopwords.words('english')

# print(stopwords)

Remove the noise from the text

In [20]:
def noiseremoval_text(text):
  #BeautifulSoup is used to remove the html or xml tags from the data
  soup = BeautifulSoup(text, "html.parser")
  text = soup.get_text()
  text = re.sub('\[[^]]*\]', '', text)
  return text

In [21]:
dataset['review'] = dataset['review'].apply(noiseremoval_text)

  soup = BeautifulSoup(text, "html.parser")


**Stemming**

In [22]:
def stemmer(text):
  ps = PorterStemmer()
  text = ' '.join([ps.stem(word) for word in text.split()])
  return text

In [23]:
dataset.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


**Remove Stopwords**

In [24]:
def removing_stopwords(text, is_lower_case=False):
  tokenizers = ToktokTokenizer()
  tokens = tokenizers.tokenize(text)
  tokens = [i.strip() for i in tokens]
  filtokens = [i for i in tokens if i.lower() not in stopwords]
  filtered_text = ' '.join(filtokens)
  return filtered_text

In [25]:
dataset['review'] = dataset['review'].apply(removing_stopwords)

In [27]:
dataset['review'] = dataset['review'].apply(stemmer)

In [29]:
dataset.head(10)

Unnamed: 0,review,sentiment
0,one review mention watch 1 oz episod ' hooked....,positive
1,wonder littl production. film techniqu unassum...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic ' famili littl boy ( jake ) think ' zomb...,negative
4,"petter mattei ' "" love time money "" visual stu...",positive
5,"probabl all-tim favorit movi , stori selfless ...",positive
6,sure would like see resurrect date seahunt ser...,positive
7,"show amaz , fresh &amp; innov idea 70 ' first ...",negative
8,encourag posit comment film look forward watch...,negative
9,like origin gut wrench laughter like movie. yo...,positive


**Test Train Split**

In [30]:
train_data = dataset.review[:30000]

test_data = dataset.review[30000:]

**Bag of Words**

In [31]:
# Count Vectorizer for bag of words

cv = CountVectorizer(min_df=0, max_df=1,binary=False, ngram_range=(1,3))

cv_train = cv.fit_transform(train_data)

cv_test = cv.transform(test_data)

print(cv_train.shape)
print(cv_test.shape)

(30000, 4772003)
(20000, 4772003)


**TF-IDF**

In [32]:
# TF-IDF vectorizer

tf = TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1,3))

tf_train = tf.fit_transform(train_data)
tf_test = tf.transform(test_data)

print(tf_train.shape)
print(tf_test.shape)

(30000, 4772003)
(20000, 4772003)


**Label Encoding**

In [34]:
# Label the sentiment data
label = LabelBinarizer()

sentiment_data = label.fit_transform(dataset['sentiment'])
print(sentiment_data)

[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]


In [36]:
train_sentiment_data = dataset.sentiment[:30000]
test_sentiment_data = dataset.sentiment[30000:]

**Train the Logistic Regression model for CountVectorizer**

In [39]:
# Train the model
logistic = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

# fitting the model for bag of words
lr_bow = logistic.fit(cv_train,train_sentiment_data)
print(lr_bow)


LogisticRegression(C=1, max_iter=500, random_state=42)


**Predicting the model**

In [40]:
#For bag of words
lr_bow_predict = logistic.predict(cv_test)
print(lr_bow_predict)



['negative' 'negative' 'negative' ... 'negative' 'negative' 'negative']


**Accuracy Score of bag of words**

In [43]:
lr_bow_score = accuracy_score(test_sentiment_data, lr_bow_predict)
print(lr_bow_score)

0.5984


**Train the Logistic Regression model for TF-IDF**

In [41]:
#fitting the model for tfidf features
lr_tfidf = logistic.fit(tf_train,train_sentiment_data)
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)


**Predicting the model**

In [42]:
#For TF-IDF
lr_tfidf_predict = logistic.predict(tf_test)
print(lr_tfidf_predict)

['negative' 'negative' 'positive' ... 'negative' 'positive' 'positive']


**Accuracy Score of TF-IDF**

In [44]:
lr_tfidf_score = accuracy_score(test_sentiment_data, lr_tfidf_predict)
print(lr_tfidf_score)

0.73875
