In [37]:
import pandas as pd
import numpy as np
import text_normalizer as tn
import model_evaluation_utils as meu
import contractions

import re
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 

In [38]:
# Keep printed NumPy arrays compact: two decimals, wrapped at 80 characters.
np.set_printoptions(linewidth=80, precision=2)

In [39]:
# Load the user-review workbook (read from the current working directory).
dataset = pd.read_excel(io="Film_User_Reviews.xlsx")

In [40]:
# Parse the numeric score out of the raw rating text (e.g. "8/10\n" -> 8).
# Casting to str first keeps this cell idempotent: the original chain failed
# on a re-run because the `.str` accessor is unavailable once the column has
# already been converted to integers.
dataset['Rating'] = (
    dataset['Rating']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)  # first run of digits = the score before the "/"
    .astype(int)
)

In [41]:
# Bucket the integer rating into four sentiment labels:
#   <= 3 -> Low, 4-5 -> Average, 6 -> Median, >= 7 -> High
conditions = [
    (dataset['Rating'] <= 3),
    (dataset['Rating'] > 3) & (dataset['Rating'] <= 5),
    (dataset['Rating'] > 5) & (dataset['Rating'] <= 6),
    (dataset['Rating'] > 6)
    ]

# create a list of the values we want to assign for each condition
values = ['Low', 'Average', 'Median', 'High']

# create a new column and use np.select to assign values to it using our lists as arguments.
# An explicit string default is required: np.select's implicit default is the
# integer 0, which has no common dtype with the string labels (newer NumPy
# raises a TypeError; older NumPy silently produced the string '0').
dataset['Sentiment'] = np.select(conditions, values, default='Unknown')

In [42]:
# Preview the first five rows, including the parsed Rating and derived Sentiment.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,Film Title,Review User,Title,Review,Rating,Sentiment
0,0,Palm Springs,fadlanamin,Weird but Good Weird,I was expecting a conventional rom-com where t...,8,High
1,1,Palm Springs,kjproulx,It's Very Hard to Dislike a Movie like Palm S...,Films that revolve around characters repeating...,9,High
2,2,Palm Springs,cartsghammond,Pure fun,Palm Springs is just such a good time of a mov...,9,High
3,3,Palm Springs,cardsrock,Simply terrific,I'm impressed that people are still able to fi...,8,High
4,4,Palm Springs,Loptimus06,A New Take On Groundhog Day,"Palm Springs is ""One of those infinite time-lo...",8,High


In [43]:
# Raw review texts as a standalone NumPy array (copied out of the DataFrame).
reviews = dataset['Review'].to_numpy(copy=True)

In [46]:
# Build `corpus`: one cleaned, stopword-free, stemmed string per review.
corpus = []

# Created once, outside the loop. The original rebuilt the stopword set for
# every single word and the stemmer for every row, which is needlessly slow.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

# Iterate over every review in the column rather than a hard-coded row count,
# so the cell keeps working if the dataset grows or its index is not 0..n-1.
for raw_review in dataset['Review']:

    # keep letters only, lower-case, then split on whitespace
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()

    # drop stopwords, stem what remains, and rejoin into a single string
    review = ' '.join(ps.stem(word) for word in words
                      if word not in english_stopwords)

    corpus.append(review)

In [47]:
# Rating-derived sentiment labels as a standalone NumPy array (the ground truth).
sentiments = dataset['Sentiment'].to_numpy(copy=True)

In [48]:
# Display the full label array to eyeball how the four classes are distributed.
sentiments

array(['High', 'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'Average', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'Median', 'High', 'Median', 'Average',
       'Median', 'Average', 'High', 'Median', 'Median', 'Average', 'Average',
       'Median', 'Average', 'High', 'Median', 'Median', 'Average', 'Low',
       'Low', 'Low', 'Low', 'Low', 'Low', 'Low', 'Low', 'Low', 'Low', 'Low',
       'Average', 'Low', 'Low', 'Low', 'Median', 'Low', 'Low', 'Average',
       'Average', 'Average', 'High', 'Median', 'High', 'Low', 'Average',
       'Average', 'Average', 'Low', 'Low', 'Low', 'Low', 'Low', 'Low', 'Low',

In [60]:
# Hold out every review from position 85 onward, together with its label.
test_reviews, test_sentiments = reviews[85:], sentiments[85:]
# Positions within the held-out slice of the reviews printed as worked examples.
sample_review_ids = [4, 7, 25, 35]

In [61]:
# Display the held-out review texts.
test_reviews

array(['As I was watching this movie I thought to myself "this really is a bad movie", the acting and the direction isn\'t the greatest.The movie centres on a teenager from a privileged family, supposedly with a high IQ, applying to the best universities in the US. As the movie progresses I couldn\'t help but feel little to no empathy for his character.What a flop.',
       "Brilliant story about a guy, who found everything ready in life, got bored and blew everything just to have fun...\nLet's just change the characters. Instead of rich white guy with wealthy parents, put a hardworking kid from the poor family.. Will he behave the same way? Easy to go wild when you drive fathers' Mercedes and live in the huge house with the pool. Screw it! Let's go crazy, not care about people, environment and other things, let's burn books, schools and other thing.\nMillions of kids nowadays do not have even access to schools and sharing books as treasures. So I do not really understand the message o

In [62]:
import textblob

# Spot-check TextBlob on the hand-picked sample reviews: print each review,
# its rating-derived label, and the polarity score TextBlob assigns to it.
for idx in sample_review_ids:
    review = test_reviews[idx]
    sentiment = test_sentiments[idx]
    print('REVIEW:', review)
    print('Actual Sentiment:', sentiment)
    print('Predicted Sentiment polarity:', textblob.TextBlob(review).sentiment.polarity)
    print('-' * 60)

REVIEW: A 3 for the romance, a 2 for the comedy and a little bit more for the score. the actors are good at talking, but not stalking.its just one of those senior high movie-stuff, not as crazy and stupid as the main stream romcom, but it doesnt reach deep into your heart. its like booksmart, everything happens within 24 hours, and its like''the last nigt of the pranks''.it has a story of love and happiness, but it doesnt fondle above the knee of what the grumpy old man holds as a standard for these kind of movies. so if you like to learn some high school anarchistic pranks, then be my guest, for the rest its for the rainy sunday afternoon. a weak 6.
Actual Sentiment: Median
Predicted Sentiment polarity: 0.09553921568627452
------------------------------------------------------------
REVIEW: Considering the culture of the world in 2020 and how things are shaping, this movie feels like it would have been great had it been made 20 years ago. The problems the characters face are totally u

In [63]:
# TextBlob polarity score for every held-out review, in the same order as test_reviews.
sentiment_polarity = list(
    map(lambda text: textblob.TextBlob(text).sentiment.polarity, test_reviews)
)

In [64]:
# Display the raw polarity scores before they are bucketed into labels.
sentiment_polarity

[0.054500000000000035,
 0.0885942760942761,
 0.5,
 0.125,
 0.09553921568627452,
 -0.3453703703703703,
 -0.008999999999999985,
 0.30185185185185187,
 0.1277777777777778,
 0.21141414141414142,
 0.38636363636363635,
 0.09999999999999999,
 0.21999999999999997,
 0.09175925925925926,
 0.011800334168755214,
 -0.21999999999999997,
 -0.0016741071428571389,
 0.17153679653679657,
 -0.18154761904761904,
 -0.12337662337662336,
 -0.07314814814814813,
 -0.369047619047619,
 0.0,
 -0.03541666666666665,
 0.0,
 -0.20833333333333334,
 -0.203125,
 -0.27037599497276915,
 0.023295454545454522,
 0.25384615384615383,
 0.05,
 0.14537037037037037,
 0.08559027777777778,
 -0.3306175595238095,
 0.5499999999999999,
 0.41875,
 -0.22592592592592592,
 0.15952380952380954,
 0.21000000000000002,
 -0.0634920634920635]

In [65]:
def _polarity_to_label(score):
    """Bucket a polarity score into 'Low', 'Average', 'Median' or 'High'.

    Cut-offs (upper bounds, inclusive): -0.1 -> Low, 0.25 -> Average,
    0.4 -> Median; anything above 0.4 is High.
    """
    if score <= -0.1:
        return 'Low'
    if score <= 0.25:
        return 'Average'
    if score <= 0.4:
        return 'Median'
    return 'High'


# Convert each TextBlob polarity score into a predicted sentiment label.
predicted_sentiments = [_polarity_to_label(score) for score in sentiment_polarity]

In [66]:
from sklearn.metrics import confusion_matrix

# Rows are the actual labels and columns the predicted ones. No `labels=` is
# passed, so scikit-learn orders the classes by sorted label value.
confusion_matrix(y_true=test_sentiments, y_pred=predicted_sentiments)

array([[ 7,  0,  1,  1],
       [ 3,  2,  0,  2],
       [11,  1,  9,  0],
       [ 3,  0,  0,  0]], dtype=int64)

In [67]:
# Prerequisite: the `afinn` package must be installed in the kernel's environment (e.g. `%pip install afinn`).

In [68]:
from afinn import Afinn

In [69]:
# Create the AFINN scorer used by the cells below (via `afn.score`).
# NOTE(review): `emoticons=True` presumably makes emoticons count toward the score — confirm against the afinn docs.
afn = Afinn(emoticons=True)

In [70]:
# Spot-check AFINN on the same hand-picked sample reviews: print each review,
# its rating-derived label, and the score AFINN assigns to it.
for idx in sample_review_ids:
    review = test_reviews[idx]
    sentiment = test_sentiments[idx]
    print('REVIEW:', review)
    print('Actual Sentiment:', sentiment)
    print('Predicted Sentiment polarity:', afn.score(review))
    print('-' * 60)

REVIEW: A 3 for the romance, a 2 for the comedy and a little bit more for the score. the actors are good at talking, but not stalking.its just one of those senior high movie-stuff, not as crazy and stupid as the main stream romcom, but it doesnt reach deep into your heart. its like booksmart, everything happens within 24 hours, and its like''the last nigt of the pranks''.it has a story of love and happiness, but it doesnt fondle above the knee of what the grumpy old man holds as a standard for these kind of movies. so if you like to learn some high school anarchistic pranks, then be my guest, for the rest its for the rainy sunday afternoon. a weak 6.
Actual Sentiment: Median
Predicted Sentiment polarity: 12.0
------------------------------------------------------------
REVIEW: Considering the culture of the world in 2020 and how things are shaping, this movie feels like it would have been great had it been made 20 years ago. The problems the characters face are totally unrelatable to 9