In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
# Load the labelled review dataset directly from its public GitHub location.
IMDB_URL = 'https://raw.githubusercontent.com/skathirmani/datasets/master/imdb_sentiment.csv'
imdb = pd.read_csv(IMDB_URL)

In [3]:
# Preview the first five rows to check the columns that were loaded.
imdb.head(n=5)

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


# Sentiment analysis using Naive Bayes

# Supervised text/document classification

In [5]:
# Count missing reviews before doing any text processing.
imdb['review'].isna().sum()

0

In [10]:
# Normalise the raw text: lower-case it, then drop every character that is not
# a-z or a space. `regex=True` must be explicit: pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the text untouched.
docs = imdb['review'].str.lower().str.replace('[^a-z ]', '', regex=True)

# NOTE: the NLTK 'stopwords' corpus and the tokenizer data used by
# word_tokenize must already be available locally (see nltk.download).
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend([])  # no-op as written; presumably a hook for extra stop words
stopword_set = frozenset(stopwords)  # constant-time membership tests in clean_doc
stemmer = nltk.stem.PorterStemmer()


def clean_doc(doc):
    """Return `doc` with stop words removed and the remaining words stemmed.

    Parameters
    ----------
    doc : str
        A single document (here: already lower-cased and stripped to a-z/space).

    Returns
    -------
    str
        The Porter-stemmed, non-stop-word tokens joined by single spaces.
    """
    words = nltk.word_tokenize(doc)
    words_clean = [stemmer.stem(word) for word in words if word not in stopword_set]
    return ' '.join(words_clean)


docs_clean = docs.apply(clean_doc)

- corpus: a collection of documents
- document: a collection of terms/sentences
- term: a sequence of one or more words (1 word: unigram, 2 words: bigram, 3 words: trigram)

In [14]:
# Conversion of the text column to numeric values: Document Term Matrix
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned documents; the fixed seed keeps the split reproducible.
train, test = train_test_split(docs_clean, test_size=0.2, random_state=1)

# Learn the vocabulary from the training split only. min_df=2 drops terms that
# occur in fewer than two training documents.
vectorizer = CountVectorizer(min_df=2)
vectorizer.fit(train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense term-count frames with one column per vocabulary term.
train_x = pd.DataFrame(vectorizer.transform(train).toarray(), columns=feature_names)
test_x = pd.DataFrame(vectorizer.transform(test).toarray(), columns=feature_names)
train_x.shape

(598, 808)

**Note**: The `review` column, which is a text column, has been converted into numerical columns (a document-term matrix).
So we have 808 unique terms from the training reviews (ignoring stopwords and rarely occurring terms, i.e. those appearing in fewer than two training documents).

In [16]:
# Select the labels for each split by reusing the row labels carried by `train` and `test`.
sentiment_labels = imdb['sentiment']
train_y = sentiment_labels.loc[train.index]
test_y = sentiment_labels.loc[test.index]

In [19]:
# Model Building
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score
gau_model =GaussianNB()
gau_model.fit(train_x,train_y)
test_pred =gau_model.predict(test_x)
print(accuracy_score(test_y,test_pred))

0.5933333333333334


In [20]:
# Fit multinomial naive Bayes on the current training features and score it on the held-out split.
mn_model = MultinomialNB().fit(train_x, train_y)
test_pred = mn_model.predict(test_x)
print(accuracy_score(y_true=test_y, y_pred=test_pred))

0.7533333333333333


# Tfidf vectorization

In [21]:
# Rebuild the features with TF-IDF weights instead of raw counts; the vocabulary
# is again learned from the training split only (min_df=2 drops terms that occur
# in fewer than two training documents).
vectorizer = TfidfVectorizer(min_df=2)
vectorizer.fit(train)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old name on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

train_x = pd.DataFrame(vectorizer.transform(train).toarray(), columns=feature_names)
test_x = pd.DataFrame(vectorizer.transform(test).toarray(), columns=feature_names)

In [22]:
# Refit multinomial naive Bayes on whatever train_x/test_x currently hold and report held-out accuracy.
mn_model = MultinomialNB().fit(train_x, train_y)
test_pred = mn_model.predict(test_x)
print(accuracy_score(y_true=test_y, y_pred=test_pred))

0.7866666666666666


# Sentiment analysis using VADER

In [23]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Single shared analyzer instance, reused for every review.
sentiment = SentimentIntensityAnalyzer()


def get_sentiment(doc):
    """Return 1 when the VADER compound score of `doc` is strictly positive, else 0."""
    compound = sentiment.polarity_scores(doc)['compound']
    return 1 if compound > 0 else 0


# Score the raw (uncleaned) review text of every row and compare with the true labels.
sent_pred = imdb['review'].map(get_sentiment)
print(accuracy_score(imdb['sentiment'], sent_pred))

0.7941176470588235
