## Exercise 3.2: Sentiment Analysis and Preprocessing Text

### Part 1: Using the TextBlob Sentiment Analyzer

In [1]:
# Import all the required packages

import pandas as pd 
import numpy as np
import re as re
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import html

In [3]:
# 1. Import the movie review data as a data frame and ensure that the data is loaded properly.
# From the given data source download the data and extraxt and use the file : labeledTrainData.tsv

movie_reviews = pd.read_csv("/Users/Jagadeesh/Desktop/SAI_MS/SAI_550/week-3/labeledTrainData.tsv",sep="\t")
movie_reviews.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [4]:
movie_reviews.tail()

Unnamed: 0,id,sentiment,review
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...
24999,8478_8,1,I saw this movie as a child and it broke my he...


In [5]:
# 2. How many of each positive and negative reviews are there?

#Display the total number for each sentiment, positive and negative
movie_reviews.sentiment.value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [6]:
# 3. Use TextBlob to classify each movie review as positive or negative. 
# Assume that a polarity score greater than or equal to zero is a positive sentiment and less than 0 is a negative sentiment.

from textblob import TextBlob

# Create a new column 'senti_score' using the TextBlob polarity score from the 'review' column of movie_reviews
movie_reviews['senti_score'] = movie_reviews['review'].apply(lambda text: TextBlob(text).sentiment.polarity)

In [7]:
#verify if the new column 'senti_score' is there in the data frame 'movie_reviews'.
movie_reviews.head()

Unnamed: 0,id,sentiment,review,senti_score
0,5814_8,1,With all this stuff going down at the moment w...,0.001277
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941
3,3630_4,0,It must be assumed that those who praised this...,0.134753
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842


In [8]:
# Create a function to analyze the polarity scores to determine positivity, neutrality or negativity
# if the sentiment score is < zero return '0' , else consider it as '1'. 

def analyzePolarity(senti_score):
  if senti_score < 0:
    return '0'
  else:
    return '1'

In [10]:
# Create a new column by applying the function to the sentiment score

movie_reviews['senti_textblob'] = movie_reviews['senti_score'].apply(analyzePolarity)


In [11]:
##verify if the new column 'senti_textblob' is there in the data frame 'movie_reviews' with the values 0 or 1
movie_reviews.head()

Unnamed: 0,id,sentiment,review,senti_score,senti_textblob
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0


In [12]:
# counts of positive and negative reviews
movie_reviews.senti_textblob.value_counts()

# As per the below results there are : 
#Positive : 19017 reviews
#Negative : 5983 reviews

1    19017
0     5983
Name: senti_textblob, dtype: int64

In [15]:
#4. Check the accuracy of this model. Is this model better than random guessing?

print("Accuracy of positive sentiment prediction by textBlob :", sum((movie_reviews['sentiment'] > 0) & (movie_reviews['senti_score'] >= 0)))
print("Accuracy of negative sentiment prediction by textBlob :", sum((movie_reviews['sentiment'] <= 0) & (movie_reviews['senti_score'] < 0)))

Accuracy of positive sentiment prediction by textBlob : 11824
Accuracy of negative sentiment prediction by textBlob : 5307


Number of positive sentiment prediction from textBlob : 11824
Number of negative sentiment prediction from textBlob : 5307
Total number of reviews from textBlob: 11824 + 5307 = 17131

Total number of samples: 25000

Accuracy of textBlob = (17131/25000)*100 = 68.524%

Accuracy of textBlob model is about 69%. I think this is definitely better than random guessing which would be of 50% accuracy  with either yes or no.

In [16]:
# 5. For up to five points extra credit, use another prebuilt text sentiment analyzer, e.g., VADER, and repeat steps (3) and (4).

#Importing vader module
import nltk
nltk.downloader.download('vader_lexicon')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Jagadeesh\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jagadeesh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jagadeesh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
analyzer_vader = SentimentIntensityAnalyzer()
movie_reviews['negative'] = [analyzer_vader.polarity_scores(x)['neg'] for x in movie_reviews['review']]
movie_reviews['neutral'] = [analyzer_vader.polarity_scores(x)['neu'] for x in movie_reviews['review']]
movie_reviews['positive'] = [analyzer_vader.polarity_scores(x)['pos'] for x in movie_reviews['review']]
movie_reviews['compound'] = [analyzer_vader.polarity_scores(x)['compound'] for x in movie_reviews['review']]
movie_reviews['total'] = movie_reviews['positive'] - movie_reviews['negative']

In [18]:
##verify if the new columns 'negative','neutral','positive', 'compound', 'total'  are there in the data frame 'movie_reviews' with the values 0 or 1
movie_reviews.head()

Unnamed: 0,id,sentiment,review,senti_score,senti_textblob,negative,neutral,positive,compound,total
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,0.13,0.744,0.126,-0.8278,-0.004
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,0.047,0.739,0.214,0.9819,0.167
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,0.142,0.8,0.058,-0.9883,-0.084
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,0.066,0.878,0.056,-0.2189,-0.01
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.119,0.741,0.14,0.796,0.021


In [19]:
# Calculating positive and negative review sentiment analysis count by Vader
print("Number of rows in the data set with positive reviews in dataset as per vader analysis :", sum(movie_reviews['compound'] >= 0))
print("Number of rows in the data set with Negative reviews in dataset as per vader analysis :", sum(movie_reviews['compound'] < 0))

Number of rows in the data set with positive reviews in dataset as per vader analysis : 16475
Number of rows in the data set with Negative reviews in dataset as per vader analysis : 8525


Total number of agreements by VADER: 10657+6682 = 17339

Total number of samples: 25000

Accuracy of VADER = (17339/25000)*100 = 69.356%

Accuracy of VADER model is about 70%. Accuracy of textBlob model is about 69%.
So VADER model is definitely better than random guessing which would be of 50% accuracy with either yes or no and Slight better than textBlob model.

### Part 2: Prepping Text for a Custom Model

In [20]:
# 1. Convert all text to lowercase letters.
# 2. Remove punctuation and special characters from the text.
# 3. Remove stop words.

# creating function to clean the text
def cleantext(text):
    text = BeautifulSoup(text).get_text()  # beautifying text
    letters_only = re.sub("[^a-zA-Z]", " ", text) # clean the html charecters (non text)
    words = letters_only.lower().split()   # convert to lower case text                        
    stops = set(stopwords.words("english")) # setting stop words to remove                  
    main_words = [w for w in words if not w in stops]   
    return( " ".join( main_words )) 

In [21]:
# applying cleantext function on the data frame and creating a new column with clean text name 'Clean_review'
movie_reviews['clean_review'] = movie_reviews['review'].apply(cleantext)

In [22]:
#Validating the new column 'clean_review'
movie_reviews.head()

Unnamed: 0,id,sentiment,review,senti_score,senti_textblob,negative,neutral,positive,compound,total,clean_review
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,0.13,0.744,0.126,-0.8278,-0.004,stuff going moment mj started listening music ...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,0.047,0.739,0.214,0.9819,0.167,classic war worlds timothy hines entertaining ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,0.142,0.8,0.058,-0.9883,-0.084,film starts manager nicholas bell giving welco...
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,0.066,0.878,0.056,-0.2189,-0.01,must assumed praised film greatest filmed oper...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.119,0.741,0.14,0.796,0.021,superbly trashy wondrously unpretentious explo...


In [23]:
# 4. Apply NLTK’s PorterStemmer.

# import the required modules
from nltk.stem import PorterStemmer

In [24]:
# Applying porterstemmer on clean_review
ps = PorterStemmer()
movie_reviews['clean_review'] = movie_reviews['clean_review'].apply(lambda review: ps.stem(review))

In [25]:
#validate
movie_reviews.head()

Unnamed: 0,id,sentiment,review,senti_score,senti_textblob,negative,neutral,positive,compound,total,clean_review
0,5814_8,1,With all this stuff going down at the moment w...,0.001277,1,0.13,0.744,0.126,-0.8278,-0.004,stuff going moment mj started listening music ...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",0.256349,1,0.047,0.739,0.214,0.9819,0.167,classic war worlds timothy hines entertaining ...
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,-0.053941,0,0.142,0.8,0.058,-0.9883,-0.084,film starts manager nicholas bell giving welco...
3,3630_4,0,It must be assumed that those who praised this...,0.134753,1,0.066,0.878,0.056,-0.2189,-0.01,must assumed praised film greatest filmed oper...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,-0.024842,0,0.119,0.741,0.14,0.796,0.021,superbly trashy wondrously unpretentious explo...


In [26]:
# importing word_tokenize

from nltk import word_tokenize

# extracting and prinitng tokenized values sample from clean_review column of movie_reviews data frame
corpora = movie_reviews['clean_review'].values
tokenized = [word_tokenize(corpus) for corpus in corpora]

print(tokenized[1111])

['call', 'episode', 'brilliant', 'feels', 'like', 'little', 'say', 'keeps', 'excellent', 'work', 'season', 'premiere', 'reductive', 'cause', 'never', 'far', 'great', 'sopranos', 'episode', 'far', 'fact', 'title', 'might', 'smug', 'invitation', 'real', 'fans', 'yet', 'join', 'club', 'picking', 'junior', 'left', 'putting', 'bullet', 'nephew', 'gut', 'mistaking', 'crook', 'killed', 'first', 'season', 'story', 'begins', 'tony', 'absolutely', 'fine', 'recollection', 'whatsoever', 'happened', 'attending', 'kind', 'convention', 'speaking', 'normal', 'accent', 'seems', 'something', 'wrong', 'papers', 'apparently', 'tony', 'soprano', 'kevin', 'finnerty', 'least', 'group', 'people', 'think', 'mess', 'sorted', 'leave', 'hotel', 'naturally', 'pure', 'sopranos', 'tradition', 'turns', 'nothing', 'dream', 'tony', 'actually', 'coma', 'doctors', 'uncertain', 'regarding', 'fate', 'family', 'friends', 'worried', 'sick', 'junior', 'refusing', 'believe', 'whole', 'thing', 'actually', 'happened', 'unfortuna

In [27]:
#shape of the modified data frame
movie_reviews.shape

(25000, 11)

In [28]:
# 5. Create a bag-of-words matrix from your stemmed text (output from (4)), where each row is a word-count vector 
#    for a single movie review (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook). 
#    Display the dimensions of your bag-of-words matrix. The number of rows in this matrix should be the same as 
#    the number of rows in your original data frame.

#Creating bag_of_words matrix from clean review
count = CountVectorizer()
bag_of_words = count.fit_transform(movie_reviews['clean_review'])

In [29]:
print(bag_of_words)

  (0, 64191)	1
  (0, 27550)	3
  (0, 43422)	1
  (0, 43192)	11
  (0, 63275)	1
  (0, 38803)	1
  (0, 44457)	2
  (0, 72931)	1
  (0, 46610)	1
  (0, 18752)	1
  (0, 72925)	2
  (0, 74096)	1
  (0, 43666)	2
  (0, 41360)	3
  (0, 72753)	1
  (0, 26876)	1
  (0, 10554)	1
  (0, 33548)	1
  (0, 28842)	2
  (0, 67130)	1
  (0, 53985)	2
  (0, 13974)	2
  (0, 20521)	1
  (0, 40249)	1
  (0, 42722)	1
  :	:
  (24999, 11248)	3
  (24999, 71146)	1
  (24999, 57954)	1
  (24999, 8372)	1
  (24999, 67822)	1
  (24999, 15151)	1
  (24999, 41670)	1
  (24999, 36350)	1
  (24999, 1092)	1
  (24999, 28244)	1
  (24999, 65969)	1
  (24999, 2415)	1
  (24999, 70186)	1
  (24999, 11374)	1
  (24999, 23509)	1
  (24999, 8368)	1
  (24999, 17022)	1
  (24999, 15098)	1
  (24999, 16559)	1
  (24999, 37346)	1
  (24999, 61936)	1
  (24999, 70061)	1
  (24999, 34019)	1
  (24999, 11328)	2
  (24999, 70162)	1


In [30]:
#Size of bag_of_words
bag_of_words 

<25000x75529 sparse matrix of type '<class 'numpy.int64'>'
	with 2446144 stored elements in Compressed Sparse Row format>

In [31]:
# 6. Create a term frequency-inverse document frequency (tf-idf) matrix from your stemmed text, for your movie reviews 
#    (see section 6.9 in the Machine Learning with Python Cookbook). Display the dimensions of your tf-idf matrix. 
#    These dimensions should be the same as your bag-of-words matrix.

# Import tf-idf encoding from sklearn library
from sklearn.feature_extraction.text import TfidfVectorizer

# Define some hiperparameters of encoded
vectorizer = TfidfVectorizer()

# Create the training set with the words encoded as features of the reviews
train_data_features = vectorizer.fit_transform(movie_reviews['clean_review'])

print(train_data_features.shape)

(25000, 75529)


In [32]:
# Import the logistic regression model from sklearn 

from sklearn.linear_model import LogisticRegression

# Define the model
model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
# Train model
model.fit(train_data_features, movie_reviews['sentiment'])

LogisticRegression(multi_class='multinomial', random_state=0)

In [34]:
# Testing the model against entire train data from original trained data

# Read the test data
test = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", \
                   quoting=3 )
print(test.shape)


(25000, 3)


In [35]:
#validate
test.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [37]:
# Clean the text of all reviews in the training set
print("Cleaning and parsing the test set movie reviews...\n")
test['clean_review'] = test['review'].apply(cleantext)

test.head()

Cleaning and parsing the test set movie reviews...



Unnamed: 0,id,sentiment,review,clean_review
0,"""5814_8""",1,"""With all this stuff going down at the moment ...",stuff going moment mj started listening music ...
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ...",classic war worlds timothy hines entertaining ...
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell...",film starts manager nicholas bell giving welco...
3,"""3630_4""",0,"""It must be assumed that those who praised thi...",must assumed praised film greatest filmed oper...
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ...",superbly trashy wondrously unpretentious explo...


In [38]:
# Create the test set with the words encoded as features of the reviews
test_data_features = vectorizer.transform(test['clean_review'])


# Use the logistic regression model to make sentiment label predictions
result = model.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and a "sentiment" column
output = pd.DataFrame( data={"id":test["id"],"original_sentiment":test["sentiment"] ,"sentiment_custom":result})
output.head()

Unnamed: 0,id,original_sentiment,sentiment_custom
0,"""5814_8""",1,1
1,"""2381_9""",1,1
2,"""7759_3""",0,0
3,"""3630_4""",0,0
4,"""9495_8""",1,0


In [39]:
# Calculating number of positive and negative review sentiment analysis count by custom model
print("Number of rows in the data set with positive reviews in dataset per custom model :", sum(output['sentiment_custom'] == 1))
print("Number of rows in the data set with negative reviews in dataset per custom model :", sum(output['sentiment_custom'] == 0))

Number of rows in the data set with positive reviews in dataset per custom model : 12611
Number of rows in the data set with negative reviews in dataset per custom model : 12389


In [40]:
# Calculating Accuracy of custom model where labelled test data and VADER preduction for sentiment are matching
print("Accurate positive sentiment prediction by custom model :", sum((output['original_sentiment'] == 1) & (output['sentiment_custom'] == 1)))
print("Accurate negative sentiment prediction by custom model :", sum((output['original_sentiment'] == 0) & (output['sentiment_custom'] == 0)))

Accurate positive sentiment prediction by custom model : 11997
Accurate negative sentiment prediction by custom model : 11886


Total numbber of agreements by custom model: 11997+11886 = 23883

Total numbber of samples: 25000

Accuracy of Custom Model = (23883/25000)*100 = 95.532%
Accuracy of VADER model is about 70%. 
Accuracy of textBlob model is about 69%.
Accuracy of Custom model is about 95.5%. This is definitely better than random guessing which would be of 50% accuracy with either yes or no.


In [41]:
# Read the test data
test2 = pd.read_csv("/Users/Jagadeesh/Desktop/SAI_MS/SAI_550/week-3/labeledTrainData.tsv", header=0, delimiter="\t", \
                   quoting=3 )
print(test2.shape)



(25000, 3)


In [42]:
# Clean the text of all reviews in the training set
print("Cleaning and parsing the test set movie reviews...\n")
test2['clean_review'] = test2['review'].apply(cleantext)

# Create the test set with the words encoded as features of the reviews
test_data_features = vectorizer.transform(test2['clean_review'])


# Use the logistic regression model to make sentiment label predictions
result = model.predict(test_data_features)

# Copy the results to a pandas dataframe with an "id" column and a "sentiment" column
output_test = pd.DataFrame( data={"id":test2["id"],"sentiment":result})
output_test.head()

Cleaning and parsing the test set movie reviews...



Unnamed: 0,id,sentiment
0,"""5814_8""",1
1,"""2381_9""",1
2,"""7759_3""",0
3,"""3630_4""",0
4,"""9495_8""",0


In [43]:
output_test.to_csv("test_result.csv", index=False, quoting=3 )