In [1]:
# Mounting Google-Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Pre-Processing

In [2]:
import pandas as pd
import numpy as np
import re
!pip install demoji
import demoji
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split



In [3]:
# Importing Dataset
df = pd.read_csv('/content/gdrive/MyDrive/Capstone/text_emotion.csv')
df

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,@JohnLloydTaylor
39996,1753919001,love,drapeaux,Happy Mothers Day All my love
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [4]:
# To remove Emojis and Emoticons
def emojis(tweet):
  dem = demoji.findall(tweet)
  for item in dem.keys():
    tweet = tweet.replace(item, ' ')
  return tweet

In [5]:
# Importing Dictionary for common misspellings
# http://norvig.com/spell-correct.html
misspell_data = pd.read_csv('/content/gdrive/MyDrive/Capstone/Spell_Check/aspell.txt',
                                 sep = ':', names = ["correction","misspell"])
misspell_data.misspell = misspell_data.misspell.str.strip()
misspell_data.misspell = misspell_data.misspell.str.split(" ")
misspell_data = misspell_data.explode("misspell").reset_index(drop=True)
misspell_data.drop_duplicates("misspell",inplace=True)
miss_corr = dict(zip(misspell_data.misspell, misspell_data.correction))

# Sample of the dict
{v:miss_corr[v] for v in [list(miss_corr.keys())[k] for k in range(20)]}

{'Steffen': 'Stephen',
 'abilitey': 'ability',
 'abouy': 'about',
 'absorbtion': 'absorption',
 'accidently': 'accidentally',
 'accomodate': 'accommodate',
 'acommadate': 'accommodate',
 'acord': 'accord',
 'adultry': 'adultery',
 'aggresive': 'aggressive',
 'alchohol': 'alcohol',
 'alchoholic': 'alcoholic',
 'allieve': 'alive',
 'alright': 'all_right',
 'aquantance': 'acquaintance',
 'equire': 'acquire',
 'nevade': 'Nevada',
 'presbyterian': 'Presbyterian',
 'rsx': 'RSX',
 'susan': 'Susan'}

In [6]:
# To correct misspelled words
def MisspelledCorrection(tweet):
  for x in tweet.split():
    if x in miss_corr.keys():
      tweet = tweet.replace(x, miss_corr[x])
  return tweet

In [7]:
# Import list of common English Contractions
contractions = pd.read_csv("/content/gdrive/MyDrive/Capstone/contractions.csv")
print(contractions)
cont_dic = dict(zip(contractions.Contraction, contractions.Meaning))

    Contraction       Meaning
0        'aight       alright
1         ain't        is not
2         amn't        am not
3        aren't       are not
4         can't        cannot
..          ...           ...
143    y'all're  you all are 
144       you'd     you would
145      you'll      you will
146      you're       you are
147      you've      you have

[148 rows x 2 columns]


In [8]:
# To Convert Contractions to their Meaning
def Cont_to_Meaning(tweet):
  for x in tweet.split():
    if x in cont_dic.keys():
      tweet = tweet.replace(x, cont_dic[x])
  return tweet

In [9]:
# Pre-processing Tweets
def cleaning(tweet):
  # Converting tweet to Lower Case
  tweet = tweet.lower()
  # Removing Mentions (@...) and Hashtags (#...)
  tweet = re.sub("@[A-Za-z0-9_]+","", tweet)
  tweet = re.sub("#[A-Za-z0-9_]+","", tweet)
  # Removing URL Links
  tweet = re.sub(r'https?:\/\/\S+', '', tweet)
  tweet = re.sub(r"www\.[a-z]?\.?(com)+|[a-z]+\.(com)", '', tweet)
  # Removing Placeholders sometimes found in some data
  tweet = re.sub(r'{link}', '', tweet)
  tweet = re.sub(r"\[video\]", '', tweet)
  # Remove emojis
  tweet = emojis(tweet)
  # Correct Spellings
  tweet = MisspelledCorrection(tweet)
  # Converting Contractions to their Meaning
  tweet = Cont_to_Meaning(tweet)
  return tweet

In [10]:
# Applying cleaning to all tweets
df['content'] = df.content.apply(lambda x: cleaning(x))
df

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,i know i was listenin to bad habit earlier a...
1,1956967666,sadness,wannamama,layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends soon!
4,1956968416,neutral,xkilljoyx,we want to trade with someone who has houston...
...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,
39996,1753919001,love,drapeaux,happy mothers day all my love
39997,1753919005,love,JenniRox,happy mother's day to all the mommies out ther...
39998,1753919043,happiness,ipdaman1,wassup beautiful!!! follow me!! peep out my ...


In [11]:
# Removing blank tweets after pre-processing
df['content'].replace('', np.nan, inplace = True)
df.dropna(subset = ['content'], inplace = True)
df

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,i know i was listenin to bad habit earlier a...
1,1956967666,sadness,wannamama,layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends soon!
4,1956968416,neutral,xkilljoyx,we want to trade with someone who has houston...
...,...,...,...,...
39994,1753918900,happiness,courtside101,succesfully following tayla!!
39996,1753919001,love,drapeaux,happy mothers day all my love
39997,1753919005,love,JenniRox,happy mother's day to all the mommies out ther...
39998,1753919043,happiness,ipdaman1,wassup beautiful!!! follow me!! peep out my ...


In [12]:
# Available Emotions
df['sentiment'].value_counts()

neutral       8582
worry         8455
happiness     5208
sadness       5162
love          3841
surprise      2187
fun           1776
relief        1526
hate          1323
empty          817
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [13]:
# Assigning numerical IDs to each sentiment
id  = {"empty":       0,
       "sadness":     1,
       "enthusiasm":  2,
       "neutral":     3,
       "worry":       4,
       "surprise":    5,
       "love":        6,
       "fun":         7,
       "hate":        8,
       "happiness":   9,
       "boredom":     10,
       "relief":      11,
       "anger":       12}

In [14]:
# Updating Sentiment to their respective IDs
df["sentiment_id"] = df['sentiment'].map(id)
df

Unnamed: 0,tweet_id,sentiment,author,content,sentiment_id
0,1956967341,empty,xoshayzers,i know i was listenin to bad habit earlier a...,0
1,1956967666,sadness,wannamama,layin n bed with a headache ughhhh...waitin o...,1
2,1956967696,sadness,coolfunky,funeral ceremony...gloomy friday...,1
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends soon!,2
4,1956968416,neutral,xkilljoyx,we want to trade with someone who has houston...,3
...,...,...,...,...,...
39994,1753918900,happiness,courtside101,succesfully following tayla!!,9
39996,1753919001,love,drapeaux,happy mothers day all my love,6
39997,1753919005,love,JenniRox,happy mother's day to all the mommies out ther...,6
39998,1753919043,happiness,ipdaman1,wassup beautiful!!! follow me!! peep out my ...,9


In [15]:
label_encoder = LabelEncoder()
int_encoded = label_encoder.fit_transform(df.sentiment_id)

onehot_encoder = OneHotEncoder(sparse = False)
int_encoded = int_encoded.reshape(len(int_encoded), 1)
Y = onehot_encoder.fit_transform(int_encoded)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df.content, Y,
                                                    random_state = 2000,
                                                    test_size = 0.2,
                                                    shuffle = True)

## LSTM

In [17]:
!pip install transformers
!pip install tensorflow
!pip install keras
import transformers
from transformers import TFAutoModel, AutoTokenizer
!pip install tokenizers
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
import tensorflow as tf
import keras
from tensorflow.keras.layers import LSTM

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 12.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 51.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 61.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 37.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found e

In [18]:
token = text.Tokenizer(num_words=None)
max_len = 160
Epoch = 5
token.fit_on_texts(list(X_train) + list(X_test))
X_train_pad = sequence.pad_sequences(token.texts_to_sequences(X_train), maxlen=max_len)
X_test_pad = sequence.pad_sequences(token.texts_to_sequences(X_test), maxlen=max_len)

In [19]:
w_idx = token.word_index

In [20]:
embed_dim = 160
lstm_out = 250

model = Sequential()
model.add(Embedding(len(w_idx) +1 , embed_dim,input_length = X_test_pad.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(keras.layers.core.Dense(13, activation='softmax'))
#adam rmsprop 
model.compile(loss = "categorical_crossentropy", optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 160, 160)          4967040   
                                                                 
 spatial_dropout1d (SpatialD  (None, 160, 160)         0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 250)               411000    
                                                                 
 dense (Dense)               (None, 13)                3263      
                                                                 
Total params: 5,381,303
Trainable params: 5,381,303
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
batch_size = 32

In [22]:
!pip install pyyaml h5py
import os
checkpoint_path = "/content/gdrive/MyDrive/Capstone/Checkpoints/cp.cpkt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)




In [23]:
model.fit(X_train_pad, y_train,
          epochs = Epoch,
          batch_size=batch_size,
          validation_data=(X_test_pad, y_test),
          callbacks=[cp_callback])

Epoch 1/5
Epoch 1: saving model to /content/gdrive/MyDrive/Capstone/Checkpoints/cp.cpkt
Epoch 2/5
Epoch 2: saving model to /content/gdrive/MyDrive/Capstone/Checkpoints/cp.cpkt
Epoch 3/5
Epoch 3: saving model to /content/gdrive/MyDrive/Capstone/Checkpoints/cp.cpkt
Epoch 4/5
Epoch 4: saving model to /content/gdrive/MyDrive/Capstone/Checkpoints/cp.cpkt
Epoch 5/5
Epoch 5: saving model to /content/gdrive/MyDrive/Capstone/Checkpoints/cp.cpkt


<keras.callbacks.History at 0x7f0168058a50>

In [24]:
def get_sentiment(model,text):
    text = cleaning(text)
    #tokenize
    twt = token.texts_to_sequences([text])
    twt = sequence.pad_sequences(twt, maxlen=max_len, dtype='int32')
    sentiment = model.predict(twt,batch_size=1,verbose = 2)
    sent = np.round(np.dot(sentiment,100).tolist(),0)[0]
    result = pd.DataFrame([id.keys(),sent]).T
    result.columns = ["sentiment","percentage"]
    result=result[result.percentage !=0]
    return result

In [25]:
result = get_sentiment(model,"Had an absolutely brilliant day ðŸ˜ loved seeing an old friend and reminiscing")
print(result)
result = get_sentiment(model,"The pain my heart feels is just too much for it to bear. Nothing eases this pain. I can’t hold myself back. I really miss you")
print(result)
result = get_sentiment(model,"I hate this game so much,It make me angry all the time ")
print(result)

1/1 - 0s - 364ms/epoch - 364ms/step
    sentiment percentage
1     sadness        3.0
2  enthusiasm        1.0
3     neutral        5.0
6        love       60.0
7         fun        6.0
9   happiness       24.0
1/1 - 0s - 80ms/epoch - 80ms/step
  sentiment percentage
1   sadness       68.0
4     worry       25.0
5  surprise        1.0
6      love        2.0
8      hate        2.0
1/1 - 0s - 87ms/epoch - 87ms/step
   sentiment percentage
1    sadness        7.0
3    neutral        1.0
4      worry        5.0
8       hate       85.0
10   boredom        1.0


In [26]:
result = get_sentiment(model,"I hate this game so much,It make me angry all the time ")
print(result)

1/1 - 0s - 80ms/epoch - 80ms/step
   sentiment percentage
1    sadness        7.0
3    neutral        1.0
4      worry        5.0
8       hate       85.0
10   boredom        1.0


# Importing Dataset

In [27]:
# Importing Dataset
df = pd.read_csv('/content/gdrive/MyDrive/Capstone/movie_emotion.csv', index_col = 0)
df

Unnamed: 0,Movie,Review,Rating,empty,sadness,enthusiasm,neutral,worry,surprise,love,fun,hate,happiness,boredom,relief,anger
0,http://www.imdb.com/title/tt0108921,i used to watch this show when i was growing u...,8,,,,,,,,,,,,,
1,http://www.imdb.com/title/tt0366707,"after seeing this dvd, i was floored. it is so...",10,,,,,,,,,,,,,
2,http://www.imdb.com/title/tt0096657,this tv series is about a foolish and unconven...,9,,,,,,,,,,,,,
3,http://www.imdb.com/title/tt0044079,one would have expected hitchcock's return to ...,10,,,,,,,,,,,,,
4,http://www.imdb.com/title/tt0138541,"but not too hip. and not too wisecracking. ""ju...",10,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,http://www.imdb.com/title/tt0053054,"excellent farce! which, of course, is all it i...",10,,,,,,,,,,,,,
24996,http://www.imdb.com/title/tt0079095,this movie earned every one of the ten votes i...,10,,,,,,,,,,,,,
24997,http://www.imdb.com/title/tt0303397,do not be swayed by the naysayers. this is a w...,8,,,,,,,,,,,,,
24998,http://www.imdb.com/title/tt0762091,i gave this movie such a high mark because it ...,10,,,,,,,,,,,,,


# Model

In [28]:
df

Unnamed: 0,Movie,Review,Rating,empty,sadness,enthusiasm,neutral,worry,surprise,love,fun,hate,happiness,boredom,relief,anger
0,http://www.imdb.com/title/tt0108921,i used to watch this show when i was growing u...,8,,,,,,,,,,,,,
1,http://www.imdb.com/title/tt0366707,"after seeing this dvd, i was floored. it is so...",10,,,,,,,,,,,,,
2,http://www.imdb.com/title/tt0096657,this tv series is about a foolish and unconven...,9,,,,,,,,,,,,,
3,http://www.imdb.com/title/tt0044079,one would have expected hitchcock's return to ...,10,,,,,,,,,,,,,
4,http://www.imdb.com/title/tt0138541,"but not too hip. and not too wisecracking. ""ju...",10,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,http://www.imdb.com/title/tt0053054,"excellent farce! which, of course, is all it i...",10,,,,,,,,,,,,,
24996,http://www.imdb.com/title/tt0079095,this movie earned every one of the ten votes i...,10,,,,,,,,,,,,,
24997,http://www.imdb.com/title/tt0303397,do not be swayed by the naysayers. this is a w...,8,,,,,,,,,,,,,
24998,http://www.imdb.com/title/tt0762091,i gave this movie such a high mark because it ...,10,,,,,,,,,,,,,


In [29]:
result = get_sentiment(model,"I hate this game so much,It make me angry all the time ")
print(result)

1/1 - 0s - 158ms/epoch - 158ms/step
   sentiment percentage
1    sadness        7.0
3    neutral        1.0
4      worry        5.0
8       hate       85.0
10   boredom        1.0


In [30]:
for index, row in df.iterrows():
  #print(row['Review'])
  result = get_sentiment(model, row['Review'])
  for result_index, result_row in result.iterrows():
    df.loc[index, result_row['sentiment']] = result_row['percentage']

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1/1 - 0s - 80ms/epoch - 80ms/step
1/1 - 0s - 87ms/epoch - 87ms/step
1/1 - 0s - 80ms/epoch - 80ms/step
1/1 - 0s - 91ms/epoch - 91ms/step
1/1 - 0s - 87ms/epoch - 87ms/step
1/1 - 0s - 82ms/epoch - 82ms/step
1/1 - 0s - 82ms/epoch - 82ms/step
1/1 - 0s - 83ms/epoch - 83ms/step
1/1 - 0s - 83ms/epoch - 83ms/step
1/1 - 0s - 81ms/epoch - 81ms/step
1/1 - 0s - 81ms/epoch - 81ms/step
1/1 - 0s - 81ms/epoch - 81ms/step
1/1 - 0s - 83ms/epoch - 83ms/step
1/1 - 0s - 79ms/epoch - 79ms/step
1/1 - 0s - 83ms/epoch - 83ms/step
1/1 - 0s - 81ms/epoch - 81ms/step
1/1 - 0s - 84ms/epoch - 84ms/step
1/1 - 0s - 86ms/epoch - 86ms/step
1/1 - 0s - 79ms/epoch - 79ms/step
1/1 - 0s - 86ms/epoch - 86ms/step
1/1 - 0s - 83ms/epoch - 83ms/step
1/1 - 0s - 81ms/epoch - 81ms/step
1/1 - 0s - 87ms/epoch - 87ms/step
1/1 - 0s - 90ms/epoch - 90ms/step
1/1 - 0s - 77ms/epoch - 77ms/step
1/1 - 0s - 83ms/epoch - 83ms/step
1/1 - 0s - 80ms/epoch - 80ms/step
1/1 - 0s - 85ms/e

In [31]:
df

Unnamed: 0,Movie,Review,Rating,empty,sadness,enthusiasm,neutral,worry,surprise,love,fun,hate,happiness,boredom,relief,anger
0,http://www.imdb.com/title/tt0108921,i used to watch this show when i was growing u...,8,,2.0,1.0,2.0,9.0,3.0,22.0,1.0,,40.0,,20.0,
1,http://www.imdb.com/title/tt0366707,"after seeing this dvd, i was floored. it is so...",10,1.0,,,3.0,,10.0,,37.0,,48.0,,1.0,
2,http://www.imdb.com/title/tt0096657,this tv series is about a foolish and unconven...,9,,,2.0,82.0,1.0,,,3.0,,10.0,,1.0,
3,http://www.imdb.com/title/tt0044079,one would have expected hitchcock's return to ...,10,1.0,,,1.0,,,,,95.0,,2.0,,
4,http://www.imdb.com/title/tt0138541,"but not too hip. and not too wisecracking. ""ju...",10,,3.0,4.0,2.0,14.0,4.0,1.0,1.0,,65.0,,6.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,http://www.imdb.com/title/tt0053054,"excellent farce! which, of course, is all it i...",10,,2.0,,,2.0,,,,,20.0,,76.0,
24996,http://www.imdb.com/title/tt0079095,this movie earned every one of the ten votes i...,10,,7.0,2.0,4.0,9.0,4.0,18.0,1.0,,12.0,,43.0,
24997,http://www.imdb.com/title/tt0303397,do not be swayed by the naysayers. this is a w...,8,,61.0,,2.0,11.0,3.0,1.0,1.0,3.0,17.0,,1.0,
24998,http://www.imdb.com/title/tt0762091,i gave this movie such a high mark because it ...,10,4.0,9.0,3.0,16.0,24.0,20.0,4.0,1.0,14.0,2.0,1.0,1.0,1.0


In [32]:
df.to_csv('/content/gdrive/MyDrive/Capstone/movie_review_emotion.csv')