# IMDB Sentiment Analysis Using Bag of Words, TF-IDF, and Word2Vec

The IMDB dataset contains 50K movie reviews for natural language processing and text analytics.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. The dataset consists of 25,000 highly polar movie reviews for training and 25,000 for testing. We are going to predict whether each review is positive or negative using classification ML algorithms.

<center><img src="https://upload.wikimedia.org/wikipedia/commons/thumb/6/69/IMDB_Logo_2016.svg/640px-IMDB_Logo_2016.svg.png"></center>

# Importing Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the reviews CSV from the Kaggle input directory and preview the first rows
df = pd.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head()

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

Now let's convert the sentiment column to numerical values (1 for positive, 0 for negative) with a simple Python list comprehension, shown below.

In [None]:
# Encode the labels numerically: 'positive' becomes 1, every other value becomes 0
df['sentiment'] = [int(label == 'positive') for label in df['sentiment']]
df.head()

# Initial Data Analysis

In [None]:
import seaborn as sns

In [None]:
# Plot the number of reviews per sentiment class.
# The column is passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as the variable to count.
sns.countplot(x='sentiment', data=df)

In [None]:
# Number of reviews in each sentiment class
df['sentiment'].value_counts()

As we have an equal number of positive and negative reviews, we can say that the dataset is very well balanced.

# Text Preprocessing

In [None]:
import re
import nltk
# Fetch the NLTK stop-word lists used by the preprocessing steps below
nltk.download('stopwords')

As this is sentiment analysis we can use Stemming instead of Lemmatization.

As a rule of thumb for when to use stemming versus lemmatization, see below:

<b>Stemming</b> = Spam classification, Review classification

<b>Lemmatization</b> = Text summarization, Language translation, chatbot

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance used to reduce words to their stems
ps = PorterStemmer()

Now we are going to apply stemming to all the sentences in the Dataset and we are also going to remove stopwords after converting all the words in the dataset to lowercase. After that we are going to store these sentences in a list named "corpus".

In [None]:
from tqdm import tqdm

# Build the stop-word lookup once. Calling stopwords.words('english') for
# every token rebuilds the whole list each time and tests membership with a
# linear scan, which dominates the runtime of this loop.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the reviews themselves rather than positional labels, so the
# loop does not depend on the DataFrame having a default 0..n-1 index.
for review in tqdm(df['review']):
    # Keep letters only, lower-case, and split into words
    tokens = non_letters.sub(' ', review).lower().split()
    # Drop stop words, then stem what is left
    stemmed = [ps.stem(word) for word in tokens if word not in english_stopwords]
    # Re-join the stems into one cleaned string per review
    corpus.append(' '.join(stemmed))

In [None]:
## corpus  ## Uncomment to inspect the corpus

# 1. Bag of Words Model

In [None]:
## Creating bag of words model
from sklearn.feature_extraction.text import CountVectorizer
# Cap the vocabulary at 2500 terms
cv = CountVectorizer(max_features=2500)
# Learn the vocabulary from the cleaned corpus and materialise the term
# counts as a dense array (one row per review)
X1 = cv.fit_transform(corpus).toarray()

In [None]:
# The sentiment column was already mapped to 0/1 integers earlier in the
# notebook, so it is used directly as the target vector. One-hot encoding it
# with get_dummies and slicing column 1 is redundant, and the resulting dtype
# (uint8 vs. bool) depends on the installed pandas version.
y1 = df['sentiment'].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state fixes the shuffle
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,y1,test_size = 0.20, random_state=0)

In [None]:
# Dimensions of the bag-of-words training feature matrix
X_train1.shape

In [None]:
# Dimensions of the matching training label vector
y_train1.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the bag-of-words counts
model1 = MultinomialNB().fit(X_train1,y_train1)

In [None]:
# Predict sentiment for the held-out bag-of-words test set
y_pred1 = model1.predict(X_test1)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
# Overall accuracy of the bag-of-words model on the test set
print(accuracy_score(y_test1,y_pred1))
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall for each class.
print(classification_report(y_test1,y_pred1))

## Making a New Prediction

In [None]:
new_review = 'I love this movie so much. It\'s really great'
# Clean the review with the same steps used to build the training corpus:
# letters only, lower-case, split into words
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
# Use the full English stop-word list, exactly as in training. The training
# corpus dropped 'not' as a stop word, so keeping it only at prediction time
# would make the two preprocessing pipelines diverge. The set is built once
# instead of once per word inside the comprehension.
all_stopwords = set(stopwords.words('english'))
# Reuse the Porter stemmer `ps` created during preprocessing
new_review = [ps.stem(word) for word in new_review if word not in all_stopwords]
new_review = ' '.join(new_review)
new_corpus = [new_review]
# Vectorise with the fitted CountVectorizer that produced model1's training features
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = model1.predict(new_X_test)
print(new_y_pred)

# 2. TF-IDF

In [None]:
## Creating tf-idf model
from sklearn.feature_extraction.text import TfidfVectorizer
# Cap the vocabulary at 2500 terms
tv = TfidfVectorizer(max_features=2500)
# Learn vocabulary and IDF weights from the cleaned corpus and materialise
# the TF-IDF scores as a dense array (one row per review)
X2 = tv.fit_transform(corpus).toarray()

In [None]:
# The sentiment column was already mapped to 0/1 integers earlier in the
# notebook, so it is used directly as the target vector. One-hot encoding it
# with get_dummies and slicing column 1 is redundant, and the resulting dtype
# (uint8 vs. bool) depends on the installed pandas version.
y2 = df['sentiment'].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state fixes the shuffle
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,y2, test_size=0.20, random_state=0)

In [None]:
# Dimensions of the TF-IDF training feature matrix
X_train2.shape

In [None]:
# Dimensions of the matching training label vector
y_train2.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF features
model2 = MultinomialNB().fit(X_train2, y_train2)

In [None]:
# Predict sentiment for the held-out TF-IDF test set
y_pred2 = model2.predict(X_test2)

In [None]:
# Overall accuracy of the TF-IDF model on the test set
print(accuracy_score(y_test2,y_pred2))
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall for each class.
print(classification_report(y_test2,y_pred2))

## Making a New Prediction

In [None]:
new_review = 'I Hate this movie so much. It\'s ok.'
# Clean the review with the same steps used to build the training corpus:
# letters only, lower-case, split into words
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
# Use the full English stop-word list, exactly as in training. The training
# corpus dropped 'not' as a stop word, so keeping it only at prediction time
# would make the two preprocessing pipelines diverge. The set is built once
# instead of once per word inside the comprehension.
all_stopwords = set(stopwords.words('english'))
# Reuse the Porter stemmer `ps` created during preprocessing
new_review = [ps.stem(word) for word in new_review if word not in all_stopwords]
new_review = ' '.join(new_review)
new_corpus = [new_review]
# model2 was trained on TF-IDF features, so the review must be vectorised
# with the fitted TfidfVectorizer `tv`. Using the CountVectorizer `cv` here
# would feed raw counts to a model that expects TF-IDF weights.
new_X_test = tv.transform(new_corpus).toarray()
new_y_pred = model2.predict(new_X_test)
print(new_y_pred)

# 3. Word2Vec

In [None]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
# NOTE(review): `wv` is not referenced again in the visible notebook - the
# features below come from model3, which is trained on the reviews themselves.
# Confirm whether this (large) load is still needed.
wv = api.load('word2vec-google-news-300')

In [None]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance for the Word2Vec preprocessing below.
# NOTE(review): only the 'stopwords' resource is downloaded earlier in this
# notebook; presumably the WordNet data is preinstalled - confirm.
lemmatizer = WordNetLemmatizer()

In [None]:
from tqdm import tqdm

# Build the stop-word lookup once. Calling stopwords.words('english') for
# every token rebuilds the whole list each time and tests membership with a
# linear scan, which dominates the runtime of this loop.
english_stopwords = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus1 = []
# Iterate over the reviews themselves rather than positional labels, so the
# loop does not depend on the DataFrame having a default 0..n-1 index.
for review in tqdm(df['review']):
    # Keep letters only, lower-case, and split into words
    tokens = non_letters.sub(' ', review).lower().split()
    # Drop stop words, then lemmatize what is left
    lemmas = [lemmatizer.lemmatize(word) for word in tokens if word not in english_stopwords]
    # Re-join the lemmas into one cleaned string per review
    corpus1.append(' '.join(lemmas))

In [None]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [None]:
# Tokenise every cleaned review into exactly one word list, so that `words`
# stays aligned row-for-row with df and therefore with the labels.
# corpus1 contains only letters and spaces, so sentence splitting has nothing
# to split on. Worse, a review that cleaned down to an empty string would
# yield no sentence at all and silently shift every later row.
words = [simple_preprocess(review) for review in corpus1]

In [None]:
#words  ## to see the nested words list

In [None]:
import gensim
# Train a Word2Vec model on the tokenised reviews: context window of 5,
# ignoring words that occur fewer than 2 times
model3 = gensim.models.Word2Vec(words,window=5,min_count=2)

In [None]:
## model3.wv.index_to_key

In [None]:
# Number of token lists (one per entry in `words`) the model saw during training
model3.corpus_count

In [None]:
# Number of training epochs configured on the model
model3.epochs

In [None]:
def avg_word2vec(doc, model=None):
    """Return the mean Word2Vec embedding of the in-vocabulary tokens of ``doc``.

    Args:
        doc: Iterable of string tokens (one tokenised review).
        model: Object exposing trained word vectors as ``.wv``. Defaults to
            the module-level ``model3``.

    Returns:
        A 1-D array of length ``wv.vector_size``. When no token of ``doc`` is
        in the vocabulary the result is all zeros rather than a NaN scalar,
        so stacked feature rows always have the same shape.
    """
    keyed_vectors = (model3 if model is None else model).wv
    # `word in keyed_vectors` is a hash lookup; testing membership against the
    # `index_to_key` list is a linear scan of the vocabulary for every token.
    vectors = [keyed_vectors[word] for word in doc if word in keyed_vectors]
    if not vectors:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [None]:
# One averaged embedding per tokenised review, in corpus order
X3 = [avg_word2vec(tokens) for tokens in tqdm(words)]

In [None]:
# X3 is still a plain Python list at this point
type(X3)

In [None]:
# Stack the per-review vectors into a single NumPy array
X_new = np.array(X3)

In [None]:
# Dimensions of the Word2Vec feature matrix
X_new.shape

In [None]:
# The sentiment column was already mapped to 0/1 integers earlier in the
# notebook, so it is used directly as the target vector. One-hot encoding it
# with get_dummies and slicing column 1 is redundant, and the resulting dtype
# (uint8 vs. bool) depends on the installed pandas version.
y3 = df['sentiment'].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; random_state fixes the shuffle
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_new,y3, test_size=0.20, random_state=0)

In [None]:
# Dimensions of the Word2Vec training feature matrix
X_train3.shape

In [None]:
# Dimensions of the matching training label vector
y_train3.shape

In [None]:
from sklearn.svm import SVC
# Fit an RBF-kernel support vector classifier on the averaged Word2Vec features
model4 = SVC(kernel='rbf', random_state=0).fit(X_train3, y_train3)

In [None]:
# Predict sentiment for the held-out Word2Vec test set
y_pred3 = model4.predict(X_test3)

In [None]:
# Overall accuracy of the Word2Vec + SVC model on the test set
print(accuracy_score(y_test3,y_pred3))
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps the reported precision and recall for each class.
print(classification_report(y_test3,y_pred3))

## Making a New Prediction

In [None]:
new_review = 'The Dr.Strange MOM movie was great.'
# Clean the review with the same steps used to build corpus1:
# letters only, lower-case, split into words
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
# Use the full English stop-word list, exactly as in training. The training
# corpus dropped 'not' as a stop word, so keeping it only at prediction time
# would make the two preprocessing pipelines diverge. The set is built once
# instead of once per word inside the comprehension.
all_stopwords = set(stopwords.words('english'))
new_review = [lemmatizer.lemmatize(word) for word in new_review if word not in all_stopwords]
new_review = ' '.join(new_review)
new_corpus = [new_review]

# One token list per review. The cleaned text has no punctuation left, so
# sentence splitting would add nothing.
new_words = [simple_preprocess(review) for review in new_corpus]

# Average the word vectors of each token list to get one feature row per review
new_X3 = [avg_word2vec(tokens) for tokens in new_words]

new_X = np.array(new_X3)
new_y_pred = model4.predict(new_X)
print(new_y_pred)