In [1]:
import pandas as pd
import numpy as np
import os
import re #used as a regular expression to find particular patterns and process it
import sys
#visualization library
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [2]:
# Location of the IMDB reviews CSV. The default is the original author's local path;
# set the IMDB_DATASET_PATH environment variable to run the notebook on another machine.
DATA_PATH = os.environ.get(
    "IMDB_DATASET_PATH",
    "C:/Users/visha/Downloads/Mountain Analytics IMDB Dataset 1.csv",
)
data=pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# (rows, columns) of the loaded frame
data.shape

(50000, 2)

In [4]:
data.describe() # summary of the dataset (count / unique / top / freq for each column)

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
# count how many reviews carry each sentiment label
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [6]:
data.isnull().sum() # number of null values per column (zero for both columns here)

review       0
sentiment    0
dtype: int64

In [7]:
data['sentiment'].unique() # distinct label values present in the column

array(['positive', 'negative'], dtype=object)

In [8]:
labeling = {
    'positive': 1,
    'negative': 0,
}

# Convert the categorical sentiment into integers.
# The guard keeps this cell safe to re-run: once the column already holds only 0/1,
# looking those values up in `labeling` would raise KeyError.
if not data['sentiment'].isin(list(labeling.values())).all():
    data['sentiment'] = data['sentiment'].apply(lambda x: labeling[x])

# (As the isnull() check in the earlier cell showed, no column has missing values.)
# Output first ten rows
data.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
9,If you like original gut wrenching laughter yo...,1


In [9]:
# check how many duplicate rows there are (True = row repeats an earlier row)
data.duplicated().value_counts()

False    49582
True       418
dtype: int64

# Data Preprocessing

First, we use regular expressions to make the review text suitable for analysis.
The intended cleaning steps are: remove punctuation marks, remove HTML tags, remove URLs, remove characters that are not letters or digits, collapse successive whitespace, convert the text to lower case, and strip whitespace from the beginning and the end of each review.

In [10]:
# Drop exact duplicate rows, keeping the first occurrence of each.
data = data.drop_duplicates()


In [11]:
# re-check for duplicate rows after dropping them
data.duplicated().value_counts()

False    49582
dtype: int64

In [12]:
# inspect the encoded label column after de-duplication
data['sentiment']

0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 49582, dtype: int64

In [None]:
import nltk # importing libraries for cleanning text
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from bs4 import BeautifulSoup

# function to clean whole text
import re

def remove_url(text):
    """Remove URLs from ``text``.

    Matches ``http://...``, ``https://...`` and bare ``www....`` links, each up to
    the next whitespace character, and deletes them.

    Args:
        text: Input string.

    Returns:
        The string with every matched URL removed.
    """
    # `https?` also covers plain http:// links, which the earlier pattern left in place.
    url_tag = re.compile(r'https?://\S+|www\.\S+')
    return url_tag.sub(r'', text)

def remove_html(text):
    """Replace HTML tags in ``text`` with a single space.

    A tag is anything matching ``<...>`` (shortest match, not spanning newlines).
    Each tag becomes a space rather than the empty string so that words separated
    only by markup (e.g. ``"me.<br /><br />The"``) are not glued together.

    Args:
        text: Input string.

    Returns:
        The string with every tag replaced by one space.
    """
    html_tag = re.compile(r'<.*?>')
    return html_tag.sub(' ', text)



def remove_punctuation(text):
    """Delete every character that is neither a word character (``\\w``) nor whitespace."""
    return re.sub(r'[^\w\s]', r'', text)

def remove_special_character(text):
    """Keep only ASCII letters, digits and whitespace; delete every other character."""
    return re.sub(r'[^a-zA-Z0-9\s]', r'', text)
def remove_emojis(text):
    """Remove emoji characters from ``text``.

    Deletes runs of characters in the four Unicode ranges listed in the pattern
    below (emoticons, symbols & pictographs, transport & map symbols, flags).

    Args:
        text: Input string.

    Returns:
        The string with matching characters removed.
    """
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "]+", flags=re.UNICODE)
    # The result must be returned; without this the function yields None
    # and every later cleaning step receives None instead of a string.
    return emoji_pattern.sub(r'', text)
def remove_numbers(text):
    """Remove every run of digits from ``text``.

    Args:
        text: Input string.

    Returns:
        The string with all digit sequences deleted.
    """
    # The previous body called remove_numbers(text) on itself unconditionally,
    # which recursed until RecursionError.
    return re.sub(r'\d+', '', text)
    
def clean_text(text):
    """Run the full cleaning pipeline on one review.

    Steps, in order: strip URLs, strip HTML tags, strip punctuation, strip
    non-alphanumeric characters, strip emojis, strip digits, then collapse runs
    of whitespace to a single space, trim both ends and lower-case the result.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string.
    """
    text = remove_url(text)
    text = remove_html(text)
    text = remove_punctuation(text)
    text = remove_special_character(text)
    text = remove_emojis(text)
    text = remove_numbers(text)
    # The preprocessing notes in this notebook also list whitespace collapsing,
    # stripping and lower-casing; the removals above leave gaps behind, so do it last.
    text = re.sub(r'\s+', ' ', text).strip().lower()

    return text
data['sentiment'] = data['review'].apply(clean_text)

In [None]:
data['processed'] = data['sentiment'].apply(lambda x: clean_text(x)) # how the data looks like now
data

In [None]:
#Tokenization of text
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer=ToktokTokenizer() # shared instance read as a global by remove_stopwords and simple_stemmer; NOTE: the name `tokenizer` is rebound to a Keras Tokenizer in the Tokenization section

#Setting English stopwords (presumably needs the NLTK 'stopwords' corpus to be available locally — confirm)
stopword_list=nltk.corpus.stopwords.words('english')

In [None]:
from nltk.tokenize import word_tokenize,sent_tokenize

# Stopwords can be removed either before or after stemming.
# NOTE(review): the original note here said stemming was done before stopword filtering, but in this notebook
# remove_stopwords is applied to data['review'] in this cell, before simple_stemmer is defined or called.

# `stop` is only printed here; remove_stopwords below reads `stopword_list`, not this set.
stop=set(stopwords.words('english'))
print(stop)

#Removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Drop English stopwords from ``text``.

    The text is split with the module-level ``tokenizer``; each token is stripped
    and kept only if it is not in the module-level ``stopword_list``.

    Args:
        text: Input string.
        is_lower_case: If True, tokens are compared as-is; otherwise each token
            is lower-cased for the comparison only (its original case is kept
            in the output).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Build a set once per call so each membership test is O(1)
    # instead of a linear scan of the stopword list per token.
    stop_words = set(stopword_list)
    kept_tokens = []
    for token in tokenizer.tokenize(text):
        token = token.strip()
        probe = token if is_lower_case else token.lower()
        if probe not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)
#Apply function on review column
# NOTE(review): this overwrites the raw 'review' text in place and does not touch 'processed';
# the word clouds read 'processed' while the length plots and tokenization read 'review' — confirm this is intended.
data['review']=data['review'].apply(remove_stopwords)

In [None]:
#Stemming and Lemmatization
#Stemming is rule-based, it omits the last few letters like 'ing', 'ed', 'es' and more. It is fast but may create strange words. Lemmatizing is dictionary-based, where it translates all words to the root form, like 'went' to 'go', 'going' to 'go' and more. Generally we prefer lemmatizing, but it might take some time in large datasets.
from nltk.stem import WordNetLemmatizer,SnowballStemmer
def simple_stemmer(text):
    """Stem each token of ``text`` with the English Snowball stemmer and re-join with spaces."""
    stemmer = SnowballStemmer(language='english')
    stemmed_tokens = map(stemmer.stem, tokenizer.tokenize(text))
    return ' '.join(stemmed_tokens)

In [None]:
# NOTE(review): this cell re-defines simple_stemmer, which the previous cell already defines with the
# same behaviour; the later definition silently replaces the earlier one, so one of the two is redundant.
def simple_stemmer(text):
    ps = SnowballStemmer(language='english')
    return ' '.join([ps.stem(word) for word in tokenizer.tokenize(text)])

In [None]:
# Cleaned text of the review with index label 1
data.loc[1, 'processed']

In [None]:
# Time one stemming call on the review with index label 1.
%time simple_stemmer(data['processed'][1])

In [None]:
#Lemmatizer
from nltk.tag import pos_tag
def lemmatize_all(sentence):
    """Yield the lemmatized form of each token in ``sentence``.

    Tokens are POS-tagged; tags starting with NN, VB or JJ are lemmatized as
    noun, verb or adjective respectively, and all other tokens are yielded unchanged.
    """
    lemmatizer = WordNetLemmatizer()
    # two-letter POS-tag prefix -> `pos` argument passed to the lemmatizer
    pos_by_prefix = {'NN': 'n', 'VB': 'v', 'JJ': 'a'}
    for token, tag in pos_tag(word_tokenize(sentence)):
        pos = pos_by_prefix.get(tag[:2])
        if pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=pos)
            
def lemmatize_text(text):
    """Return ``text`` with every token lemmatized, joined by single spaces."""
    lemmas = lemmatize_all(text)
    return ' '.join(lemmas)

In [None]:
# Cleaned text of the review with index label 1 (input for the lemmatizer timing below)
data.loc[1, 'processed']

In [None]:
# Time one lemmatization call on the same review, for comparison with the stemmer timing.
%time lemmatize_text(data['processed'][1])

# EDA

In [None]:
# Bar chart of the number of rows per sentiment value (one bar per remaining column).
rows_per_sentiment = data.groupby('sentiment').count()
rows_per_sentiment.plot.bar()

In [None]:
!pip install plotly

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from matplotlib import style
style.use('ggplot')

In [None]:
#def no_of_words(text):
    #words= text.split()
   # word_count = len(words)
   # return word_count

#data['word count'] = data['review'].apply(no_of_words)
#data.head()


In [None]:
# Number of whitespace-separated words in each review
words_per_review = [len(text.split()) for text in data['review']]
review_len = pd.Series(words_per_review)

# Box plot of the review-length distribution
review_len.plot.box()

In [None]:
# Histogram of review lengths (word counts held in review_len).
sns.set_theme(context='notebook', style='darkgrid', palette='deep',
              font='sans-serif', font_scale=1, color_codes=True, rc=None)

fig, ax = plt.subplots(figsize=(10, 12))
sns.histplot(review_len, ax=ax)

In [None]:
# Word count per review, stored as a new 'length' column.
data['length'] = data['review'].str.split().apply(len)
positive_lengths = data.loc[data['sentiment'] == 1, 'length']
describe = positive_lengths.describe().to_frame().round(2)

# Right panel: histogram of lengths for positive reviews.
fig = plt.figure(figsize=(14, 7))
ax1 = fig.add_subplot(122)
sns.histplot(positive_lengths, ax=ax1, color='green')

# Left panel: summary statistics rendered as a table on an axis-free subplot.
ax2 = fig.add_subplot(121)
ax2.axis('off')
font_size = 14
bbox = [0, 0, 1, 1]
table = ax2.table(
    cellText=describe.values,
    rowLabels=describe.index,
    colLabels=describe.columns,
    bbox=bbox,
)
table.set_fontsize(font_size)
fig.suptitle('Distribution of text length for positive sentiment reviews.', fontsize=16)

plt.show()


In [None]:
# Relies on the 'length' column created in the positive-review cell.
negative_lengths = data.loc[data['sentiment'] == 0, 'length']
describe = negative_lengths.describe().to_frame().round(2)

# Right panel: histogram of lengths for negative reviews.
fig = plt.figure(figsize=(14, 7))
ax1 = fig.add_subplot(122)
sns.histplot(negative_lengths, ax=ax1, color='red')

# Left panel: summary statistics rendered as a table on an axis-free subplot.
ax2 = fig.add_subplot(121)
ax2.axis('off')
font_size = 14
bbox = [0, 0, 1, 1]
table = ax2.table(
    cellText=describe.values,
    rowLabels=describe.index,
    colLabels=describe.columns,
    bbox=bbox,
)
table.set_fontsize(font_size)
fig.suptitle('Distribution of text length for Negative sentiment reviews.', fontsize=16)

plt.show()

In [None]:
# Word cloud over the cleaned text of all reviews.
all_words = " ".join(data['processed'])

cloud_builder = WordCloud(width=800, height=500, random_state=42, max_font_size=100)
wordcloud = cloud_builder.generate(all_words)

# Render the cloud as an image without axes.
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the cleaned text of negative reviews (sentiment == 0).
from wordcloud import WordCloud

negative_text = " ".join(data.loc[data['sentiment'] == 0, 'processed'])
wc = WordCloud(max_words=2000, width=1600, height=800, random_state=42).generate(negative_text)

plt.figure(figsize=(15, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# frequent words visualization for +ve

from wordcloud import WordCloud
plt.figure(figsize = (15,8)) # Positive Review Text
# random_state is fixed, as in the other word-cloud cells, so the layout is reproducible across runs.
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800, random_state=42).generate(" ".join(data[data.sentiment == 1].processed))
plt.imshow(wc , interpolation = 'bilinear')
plt.axis('off')
plt.show()

# Prepare data for training

In [None]:
# 'sentiment' is already encoded as 0/1 by the labelling cell near the top of the notebook.
# Mapping the string labels again would look up 0/1 in a dict keyed by 'positive'/'negative'
# and turn every label into NaN, so only map when the column still holds the string labels.
label_map = {'positive':1,'negative':0}
if data['sentiment'].isin(list(label_map)).all():
    data['sentiment'] = data['sentiment'].map(label_map)

# Random 80/20 split; the test set is every row not sampled for training.
train_data = data.sample(frac=0.8,random_state=100)
test_data = data.drop(train_data.index)

print(f"Train data shape: {train_data.shape}")
print(f"Test  data shape: {test_data.shape}")


# Tokenization

In [None]:
import tensorflow as tf
print(tf.__version__)

# Keep the labels before train_data / test_data are rebound to padded arrays below;
# otherwise the targets for the two splits are lost.
train_labels = train_data['sentiment'].to_numpy()
test_labels = test_data['sentiment'].to_numpy()

# NOTE: this rebinds the global `tokenizer` (previously the ToktokTokenizer used by
# remove_stopwords / simple_stemmer); re-run the Toktok cell before calling those helpers again.
tokenizer  = tf.keras.preprocessing.text.Tokenizer(num_words=8000)
# Build the vocabulary from the training reviews only, so that word statistics
# from the held-out test reviews do not leak into the features.
tokenizer.fit_on_texts(train_data['review'].values)

word_index = tokenizer.word_index
nb_words = len(word_index) + 1  # +1 because Keras word indices start at 1

train_seq = tokenizer.texts_to_sequences(train_data["review"])
test_seq = tokenizer.texts_to_sequences(test_data["review"])

# Pad / truncate every sequence to 100 tokens. The names train_data / test_data now hold arrays.
train_data = tf.keras.preprocessing.sequence.pad_sequences(train_seq, maxlen=100)
test_data = tf.keras.preprocessing.sequence.pad_sequences(test_seq, maxlen=100)

print(f"Train data shape: {train_data.shape}")
print(f"Test  data shape: {test_data.shape}")

# Model Building

In [None]:
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# class balance of the label column before splitting for the sklearn models
data['sentiment'].value_counts()


In [None]:
# Positional split: the first 40000 rows are the training set, the remaining rows the test set.
n_train = 40000
train_reviews, test_reviews = data['review'].iloc[:n_train], data['review'].iloc[n_train:]
train_sentiments, test_sentiments = data['sentiment'].iloc[:n_train], data['sentiment'].iloc[n_train:]
print(train_reviews.shape,train_sentiments.shape)
print(test_reviews.shape,test_sentiments.shape)


In [None]:
# Feature text (X) and target labels (Y)
X, Y = data['review'], data['sentiment']