IMPORTING MODULES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression


In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

IMPORTING DATASET

In [3]:
imdb_data = pd.read_csv("IMDB-Dataset.csv")

In [4]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
imdb_data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [6]:
imdb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [7]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


DROPPING DUPLICATES AND NULL VALUES

In [8]:
imdb_data.drop_duplicates(inplace=True)
imdb_data.dropna(inplace=True)
imdb_data.shape

(49582, 2)

In [9]:

imdb_data.sentiment.value_counts()

positive    24884
negative    24698
Name: sentiment, dtype: int64

REPLACE SEMTIMENT VALUES "POSITIVE" AND "NEGATIVE" AS "1" AND "0"

In [10]:
imdb_data.replace({'positive':1, 'negative':0}, inplace=True)
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [11]:
stop_words =  stopwords.words('english')

CLEANING REVIEWS

In [12]:
def clean_review(text, remove_stopwords=True, lemmatize_words=True):
    #removing html tags and special characters
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text =  re.sub(r'http\S+',' link ', text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text)
    #Removing punctuations
    text = ''.join([c for c in text if c not in punctuation])
    #Converting to lowercase
    text=text.lower()
    #Removing stopwords
    if remove_stopwords:
        text=word_tokenize(text)
        text=[w for w in text if w not in stop_words]
        text = " ".join(text)
    #Lemmatization
    if lemmatize_words:
        text = word_tokenize(text)
        lemmatized_words = [WordNetLemmatizer().lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    return (text)


CLEANED DATA

In [14]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [15]:
imdb_data["cleaned_review"] = imdb_data["review"].apply(clean_review)

In [16]:
imdb_data.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,1,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,1,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visually stunnin...


In [17]:
X = imdb_data["cleaned_review"]
y = imdb_data.sentiment.values

SPLITTING DATA INTO TRAIN AND TEST SETS

In [18]:
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.20,)

MODEL BUILDING,TRAINING, VALIDATION AND ACCURACY SCORE

In [20]:
logisticregression_classifier = Pipeline(steps=[
                               ('pre_processing',TfidfVectorizer(lowercase=False)),
                                 ('LogisticRegression_classifier',LogisticRegression())
                                 ])


In [21]:
logisticregression_classifier.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
y_pred_lr = logisticregression_classifier.predict(X_test)

In [23]:
print(accuracy_score(y_test,y_pred_lr))

0.8944237168498538


Logistic regression has highest accuracy score
