# Mounting GoogleDrive

In [1]:
# Mounting Google-Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Importing Modules

In [2]:
import pandas as pd
import numpy as np
import re
!pip install demoji
import demoji
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split



# Importing Dataset

In [3]:
# Importing Dataset
df = pd.read_csv('/content/gdrive/MyDrive/Capstone/movie_reviews.csv', index_col = 0)
df

Unnamed: 0,Movie,Review,Rating
0,http://www.imdb.com/title/tt0108921,I used to watch this show when I was growing u...,8
1,http://www.imdb.com/title/tt0366707,"After seeing this DVD, I was floored. It is SO...",10
2,http://www.imdb.com/title/tt0096657,This TV series is about a foolish and unconven...,9
3,http://www.imdb.com/title/tt0044079,One would have expected Hitchcock's return to ...,10
4,http://www.imdb.com/title/tt0138541,"But not too hip. And not too wisecracking. ""Ju...",10
...,...,...,...
24995,http://www.imdb.com/title/tt0053054,"Excellent farce! Which, of course, is all it i...",10
24996,http://www.imdb.com/title/tt0079095,This movie earned every one of the ten votes I...,10
24997,http://www.imdb.com/title/tt0303397,Don't be swayed by the naysayers. This is a wo...,8
24998,http://www.imdb.com/title/tt0762091,I gave this movie such a high mark because it ...,10


# Data Pre-Processing

In [4]:
# To remove Emojis and Emoticons
def emojis(tweet):
  dem = demoji.findall(tweet)
  for item in dem.keys():
    tweet = tweet.replace(item, ' ')
  return tweet

In [5]:
# To Remove HTML Tags
def Remove_HTML_Tags(tweet):
  tweet = re.sub(r'<.*?>', '', tweet)
  return tweet

In [6]:
# Importing Dictionary for common misspellings
# http://norvig.com/spell-correct.html
misspell_data = pd.read_csv('/content/gdrive/MyDrive/Capstone/Spell_Check/aspell.txt',
                                 sep = ':', names = ["correction","misspell"])
misspell_data.misspell = misspell_data.misspell.str.strip()
misspell_data.misspell = misspell_data.misspell.str.split(" ")
misspell_data = misspell_data.explode("misspell").reset_index(drop=True)
misspell_data.drop_duplicates("misspell",inplace=True)
miss_corr = dict(zip(misspell_data.misspell, misspell_data.correction))

# Sample of the dict
{v:miss_corr[v] for v in [list(miss_corr.keys())[k] for k in range(20)]}

{'Steffen': 'Stephen',
 'abilitey': 'ability',
 'abouy': 'about',
 'absorbtion': 'absorption',
 'accidently': 'accidentally',
 'accomodate': 'accommodate',
 'acommadate': 'accommodate',
 'acord': 'accord',
 'adultry': 'adultery',
 'aggresive': 'aggressive',
 'alchohol': 'alcohol',
 'alchoholic': 'alcoholic',
 'allieve': 'alive',
 'alright': 'all_right',
 'aquantance': 'acquaintance',
 'equire': 'acquire',
 'nevade': 'Nevada',
 'presbyterian': 'Presbyterian',
 'rsx': 'RSX',
 'susan': 'Susan'}

In [7]:
# To correct misspelled words
def MisspelledCorrection(tweet):
  for x in tweet.split():
    if x in miss_corr.keys():
      tweet = tweet.replace(x, miss_corr[x])
  return tweet

In [8]:
# Import list of common English Contractions
contractions = pd.read_csv("/content/gdrive/MyDrive/Capstone/contractions.csv")
print(contractions)
cont_dic = dict(zip(contractions.Contraction, contractions.Meaning))

    Contraction       Meaning
0        'aight       alright
1         ain't        is not
2         amn't        am not
3        aren't       are not
4         can't        cannot
..          ...           ...
143    y'all're  you all are 
144       you'd     you would
145      you'll      you will
146      you're       you are
147      you've      you have

[148 rows x 2 columns]


In [9]:
# To Convert Contractions to their Meaning
def Cont_to_Meaning(tweet):
  for x in tweet.split():
    if x in cont_dic.keys():
      tweet = tweet.replace(x, cont_dic[x])
  return tweet

In [10]:
# Pre-processing Tweets
def cleaning(tweet):
  # Converting tweet to Lower Case
  tweet = tweet.lower()
  # Remove HTML Tags
  tweet = Remove_HTML_Tags(tweet)
  # Removing Mentions (@...) and Hashtags (#...)
  tweet = re.sub("@[A-Za-z0-9_]+","", tweet)
  tweet = re.sub("#[A-Za-z0-9_]+","", tweet)
  # Removing URL Links
  tweet = re.sub(r'https?:\/\/\S+', '', tweet)
  tweet = re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", '', tweet)
  # Removing Placeholders sometimes found in some data
  tweet = re.sub(r'{link}', '', tweet)
  tweet = re.sub(r"\[video\]", '', tweet)
  # Remove emojis
  tweet = emojis(tweet)
  # Correct Spellings
  tweet = MisspelledCorrection(tweet)
  # Converting Contractions to their Meaning
  tweet = Cont_to_Meaning(tweet)
  return tweet

In [11]:
# Applying cleaning to all tweets
df['Review'] = df.Review.apply(lambda x: cleaning(x))
df

Unnamed: 0,Movie,Review,Rating
0,http://www.imdb.com/title/tt0108921,i used to watch this show when i was growing u...,8
1,http://www.imdb.com/title/tt0366707,"after seeing this dvd, i was floored. it is so...",10
2,http://www.imdb.com/title/tt0096657,this tv series is about a foolish and unconven...,9
3,http://www.imdb.com/title/tt0044079,one would have expected hitchcock's return to ...,10
4,http://www.imdb.com/title/tt0138541,"but not too hip. and not too wisecracking. ""ju...",10
...,...,...,...
24995,http://www.imdb.com/title/tt0053054,"excellent farce! which, of course, is all it i...",10
24996,http://www.imdb.com/title/tt0079095,this movie earned every one of the ten votes i...,10
24997,http://www.imdb.com/title/tt0303397,do not be swayed by the naysayers. this is a w...,8
24998,http://www.imdb.com/title/tt0762091,i gave this movie such a high mark because it ...,10


In [12]:
# Removing blank tweets after pre-processing
df['Review'].replace('', np.nan, inplace = True)
df.dropna(subset = ['Review'], inplace = True)
df

Unnamed: 0,Movie,Review,Rating
0,http://www.imdb.com/title/tt0108921,i used to watch this show when i was growing u...,8
1,http://www.imdb.com/title/tt0366707,"after seeing this dvd, i was floored. it is so...",10
2,http://www.imdb.com/title/tt0096657,this tv series is about a foolish and unconven...,9
3,http://www.imdb.com/title/tt0044079,one would have expected hitchcock's return to ...,10
4,http://www.imdb.com/title/tt0138541,"but not too hip. and not too wisecracking. ""ju...",10
...,...,...,...
24995,http://www.imdb.com/title/tt0053054,"excellent farce! which, of course, is all it i...",10
24996,http://www.imdb.com/title/tt0079095,this movie earned every one of the ten votes i...,10
24997,http://www.imdb.com/title/tt0303397,do not be swayed by the naysayers. this is a w...,8
24998,http://www.imdb.com/title/tt0762091,i gave this movie such a high mark because it ...,10


In [13]:
df["empty"]       = np.nan
df["sadness"]     = np.nan
df["enthusiasm"]  = np.nan
df["neutral"]     = np.nan
df["worry"]       = np.nan
df["surprise"]    = np.nan
df["love"]        = np.nan
df["fun"]         = np.nan
df["hate"]        = np.nan
df["happiness"]   = np.nan
df["boredom"]     = np.nan
df["relief"]      = np.nan
df["anger"]       = np.nan
df

Unnamed: 0,Movie,Review,Rating,empty,sadness,enthusiasm,neutral,worry,surprise,love,fun,hate,happiness,boredom,relief,anger
0,http://www.imdb.com/title/tt0108921,i used to watch this show when i was growing u...,8,,,,,,,,,,,,,
1,http://www.imdb.com/title/tt0366707,"after seeing this dvd, i was floored. it is so...",10,,,,,,,,,,,,,
2,http://www.imdb.com/title/tt0096657,this tv series is about a foolish and unconven...,9,,,,,,,,,,,,,
3,http://www.imdb.com/title/tt0044079,one would have expected hitchcock's return to ...,10,,,,,,,,,,,,,
4,http://www.imdb.com/title/tt0138541,"but not too hip. and not too wisecracking. ""ju...",10,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,http://www.imdb.com/title/tt0053054,"excellent farce! which, of course, is all it i...",10,,,,,,,,,,,,,
24996,http://www.imdb.com/title/tt0079095,this movie earned every one of the ten votes i...,10,,,,,,,,,,,,,
24997,http://www.imdb.com/title/tt0303397,do not be swayed by the naysayers. this is a w...,8,,,,,,,,,,,,,
24998,http://www.imdb.com/title/tt0762091,i gave this movie such a high mark because it ...,10,,,,,,,,,,,,,


In [14]:
df.to_csv('/content/gdrive/MyDrive/Capstone/movie_emotion.csv')

# Model

In [None]:
!pip install transformers
import transformers
from transformers import TFAutoModel, AutoTokenizer
!pip install tokenizers
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
import tensorflow as tf
import tqdm

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 27.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 27.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 41.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyy

In [None]:
import tensorflow as tf
from keras.preprocessing import sequence, text


In [None]:
!pip install pyyaml h5py
import os
checkpoint_path = "/content/gdrive/MyDrive/Capstone/Checkpoints/cp.cpkt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)



In [None]:
latest = tf.train.latest_checkpoint(checkpoint_dir)
latest

'/content/gdrive/MyDrive/Capstone/Checkpoints/cp.cpkt'

In [None]:
from tensorflow import keras
model = keras.models.load_model('/content/gdrive/MyDrive/Capstone/my_model.h5')
model.summary()
model.load_weights(latest)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 160, 160)          4967040   
                                                                 
 spatial_dropout1d (SpatialD  (None, 160, 160)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 250)               411000    
                                                                 
 dense (Dense)               (None, 13)                3263      
                                                                 
Total params: 5,381,303
Trainable params: 5,381,303
Non-trainable params: 0
_________________________________________________________________


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fb3cc24dc90>

In [None]:
token = text.Tokenizer(num_words=None)
max_len = 160

In [None]:
id  = {"empty":       0,
       "sadness":     1,
       "enthusiasm":  2,
       "neutral":     3,
       "worry":       4,
       "surprise":    5,
       "love":        6,
       "fun":         7,
       "hate":        8,
       "happiness":   9,
       "boredom":     10,
       "relief":      11,
       "anger":       12}

In [None]:
def get_sentiment(model,text):
    text = cleaning(text)
    #tokenize
    twt = token.texts_to_sequences([text])
    twt = sequence.pad_sequences(twt, maxlen=max_len, dtype='int32')
    sentiment = model.predict(twt,batch_size=1,verbose = 2)
    sent = np.round(np.dot(sentiment,100).tolist(),0)[0]
    result = pd.DataFrame([id.keys(),sent]).T
    result.columns = ["sentiment","percentage"]
    result=result[result.percentage !=0]
    return result

In [None]:
for index, row in df.iterrows():
  #print(row['Review'])
  result = get_sentiment(model, row['Review'])
  for result_index, result_row in result.iterrows():
    df.loc[index, result_row['sentiment']] = result_row['percentage']

1/1 - 0s - 481ms/epoch - 481ms/step
1/1 - 0s - 84ms/epoch - 84ms/step
1/1 - 0s - 81ms/epoch - 81ms/step
1/1 - 0s - 79ms/epoch - 79ms/step
1/1 - 0s - 81ms/epoch - 81ms/step
1/1 - 0s - 95ms/epoch - 95ms/step
1/1 - 0s - 102ms/epoch - 102ms/step
1/1 - 0s - 77ms/epoch - 77ms/step
1/1 - 0s - 77ms/epoch - 77ms/step
1/1 - 0s - 83ms/epoch - 83ms/step
1/1 - 0s - 94ms/epoch - 94ms/step
1/1 - 0s - 86ms/epoch - 86ms/step
1/1 - 0s - 79ms/epoch - 79ms/step
1/1 - 0s - 79ms/epoch - 79ms/step
1/1 - 0s - 82ms/epoch - 82ms/step
1/1 - 0s - 99ms/epoch - 99ms/step
1/1 - 0s - 124ms/epoch - 124ms/step
1/1 - 0s - 83ms/epoch - 83ms/step
1/1 - 0s - 78ms/epoch - 78ms/step
1/1 - 0s - 96ms/epoch - 96ms/step
1/1 - 0s - 81ms/epoch - 81ms/step
1/1 - 0s - 84ms/epoch - 84ms/step
1/1 - 0s - 78ms/epoch - 78ms/step
1/1 - 0s - 87ms/epoch - 87ms/step
1/1 - 0s - 91ms/epoch - 91ms/step
1/1 - 0s - 77ms/epoch - 77ms/step
1/1 - 0s - 78ms/epoch - 78ms/step
1/1 - 0s - 78ms/epoch - 78ms/step
1/1 - 0s - 80ms/epoch - 80ms/step
1/1 - 0s

KeyboardInterrupt: ignored

In [None]:
df

Unnamed: 0,Movie,Review,Rating,empty,sadness,enthusiasm,neutral,worry,surprise,love,fun,hate,happiness,boredom,relief,anger
0,http://www.imdb.com/title/tt0246299,"one of, if not the most visually beautiful fil...",9,,,,,,,,,,,,,
1,http://www.imdb.com/title/tt0153998,i loved the great lighting and was warmed by t...,8,,,,,,,,,,,,,
2,http://www.imdb.com/title/tt0053270,"the remarkable, sometimes infuriating, often b...",8,,,,,,,,,,,,,
3,http://www.imdb.com/title/tt0080716,although at first glance this movie looks like...,8,,,,,,,,,,,,,
4,http://www.imdb.com/title/tt0310778,"in paris, a few months before the nazi invasio...",9,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7993,http://www.imdb.com/title/tt0048021,fantastic movie! one of the best film noir mov...,10,,,,,,,,,,,,,
7994,http://www.imdb.com/title/tt0048021,"the fact that after 50 years, it is still a hi...",8,,,,,,,,,,,,,
7995,http://www.imdb.com/title/tt0033727,mr. bug goes to town was one of those films th...,8,,,,,,,,,,,,,
7996,http://www.imdb.com/title/tt0223005,"even the trailer for this movie makes me cry, ...",10,,,,,,,,,,,,,


In [None]:
result = get_sentiment(model,"I hate this game so much,It make me angry all the time ")
print(result)

1/1 - 0s - 484ms/epoch - 484ms/step
     sentiment percentage
0        empty       11.0
1      sadness        2.0
2   enthusiasm        2.0
3      neutral       66.0
4        worry        4.0
5     surprise        3.0
6         love        3.0
7          fun        1.0
8         hate        1.0
9    happiness        3.0
11      relief        3.0
