In [1]:
# Make Google Drive visible to this Colab runtime; the dataset paths used
# further down all live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import time
from os import listdir

import pandas as pd
from keras import layers
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [3]:
def process_docs(directory):
  """Read every tab-separated file in ``directory`` into a single DataFrame.

  Each file is parsed with ``header=0`` and explicit ``names``, so the first
  line of every file is consumed as a header row and replaced by the column
  names ``id``, ``tweet``, ``tweettype`` and ``score``.

  Files are read in sorted filename order so the row order of the result does
  not depend on the order in which the filesystem happens to list them.

  Args:
    directory: Path (string) of the folder whose files should be loaded.

  Returns:
    The per-file frames concatenated row-wise. Each file keeps its own row
    index (no re-indexing), so index values repeat across files.

  Raises:
    ValueError: If ``directory`` contains no files.
  """
  frames = []
  # listdir() returns entries in arbitrary order; sort for reproducibility.
  for filename in sorted(listdir(directory)):
    print(filename, "Imported already!")
    # create the full path of the file to open
    path = directory + '/' + filename
    frames.append(pd.read_csv(path,
                              sep="\t",
                              header=0,
                              names=["id", "tweet", "tweettype", "score"]))
  if not frames:
    # pd.concat([]) would fail with a message that does not name the folder.
    raise ValueError("No files found in directory: {}".format(directory))
  return pd.concat(frames)

## Crowdflower


In [4]:
# Load the Crowdflower tweets from Drive and show the frame.
cf = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/CZ4042/project/text_emotion.csv'
)
cf

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,@JohnLloydTaylor
39996,1753919001,love,drapeaux,Happy Mothers Day All my love
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [5]:
# One-hot encode the sentiment label: one indicator column per distinct label.
y = pd.get_dummies(cf.loc[:, 'sentiment'])
# Report container type, shape and rank, one per line.
print(type(y), y.shape, y.ndim, sep='\n')
y.head()

<class 'pandas.core.frame.DataFrame'>
(40000, 13)
2


Unnamed: 0,anger,boredom,empty,enthusiasm,fun,happiness,hate,love,neutral,relief,sadness,surprise,worry
0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0


In [6]:
# Convert the one-hot frame to a plain NumPy array for Keras.
# NOTE: this rebinds `y`, so the cell cannot be re-run without first
# re-running the cell that builds the one-hot DataFrame.
y = y.to_numpy()
print("properties of y")
print(
    f"type : {type(y)}, dimensions : {y.ndim}, shape : {y.shape}, "
    f"total no. of elements : {y.size}, data type of each element: {y.dtype}, "
    f"size of each element {y.itemsize} bytes"
)

properties of y
type : <class 'numpy.ndarray'>, dimensions : 2, shape : (40000, 13), total no. of elements : 520000, data type of each element: uint8, size of each element 1 bytes


In [7]:
# Distinct sentiment labels, in order of first appearance in the data.
list_of_classes = list(cf['sentiment'].drop_duplicates())

In [8]:
# Drop everything except the tweet text; labels were already encoded into `y`.
# NOTE: rebinding `cf` means this cell fails if run twice in a row.
cf = cf.drop(columns=['author', 'tweet_id', 'sentiment'])

In [9]:
# Raw tweet texts as a 1-D array. Selecting the column by name (rather than
# flattening the whole frame) keeps `x` aligned one-to-one with the rows of
# `y` even if `cf` still carries other columns.
x = cf['content'].to_numpy()

In [10]:
# Length of the longest tweet. This is len() of the raw string, i.e. a
# character count, not a token count.
print("Max. sentence length is: ", max(len(text) for text in cf['content']))

Max. sentence length is:  167


In [11]:
# --- Vocabulary / input shape ---
max_features = 20000      # Tokenizer num_words and Embedding input size
max_text_length = 170     # pad_sequences maxlen and Embedding input_length
embedding_dims = 50       # width of each embedding vector

# --- Network ---
filter_size = 3           # Conv1D kernel_size
num_filters_1 = 250       # Conv1D filters
num_filters_2 = 250       # units of the hidden Dense layer (despite the name)

# --- Training ---
batch_size = 10
epochs = 15

In [12]:
# Build the word index from the tweets, then turn every tweet into a
# fixed-length integer sequence.
# NOTE(review): the tokenizer is fitted on all texts, before the train/test
# split below, so the vocabulary also reflects the held-out rows.
x_tokenizer = Tokenizer(num_words=max_features)
x_tokenizer.fit_on_texts(x.tolist())
x_tokenized = x_tokenizer.texts_to_sequences(x)
x_train_val = pad_sequences(sequences=x_tokenized, maxlen=max_text_length)

In [13]:
# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_train_val,
    y,
    test_size=0.3,
    random_state=1000,
)

In [14]:
# Sanity check: feature/label shapes for the train split, then the test split.
print(f"{x_train.shape} {y_train.shape}\n{x_test.shape} {y_test.shape}")

(28000, 170) (28000, 13)
(12000, 170) (12000, 13)


In [15]:
# 1-D CNN text classifier:
#   embedding -> dropout -> conv -> global max-pool -> dense -> softmax
# The output width is taken from the one-hot label matrix instead of being
# hard-coded, so it always matches the number of classes in the data.
num_classes = y_train.shape[1]

model = Sequential()

model.add(layers.Embedding(max_features, embedding_dims, input_length=max_text_length))
model.add(layers.Dropout(0.2))

model.add(layers.Conv1D(filters=num_filters_1, kernel_size=filter_size, padding='valid', activation='relu', strides=1))
model.add(layers.GlobalMaxPooling1D())

model.add(layers.Dense(num_filters_2))
model.add(layers.Dropout(0.2))
model.add(layers.Activation('relu'))

# Every sample has exactly one label (y is one-hot), so the class scores are
# normalised jointly with softmax rather than with independent sigmoids.
# NOTE(review): softmax outputs are conventionally trained with
# categorical_crossentropy; check the loss passed to model.compile.
model.add(layers.Dense(num_classes))
model.add(layers.Activation('softmax'))

In [16]:
# The targets are one-hot with a single active class per row, i.e. this is a
# single-label multi-class problem, so categorical cross-entropy is used.
# With binary_crossentropy, Keras resolves the 'accuracy' string to
# element-wise binary accuracy, which is inflated by the many correctly
# predicted zeros in every one-hot row; with this loss it resolves to
# categorical accuracy (is the arg-max class correct).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 170, 50)           1000000   
                                                                 
 dropout (Dropout)           (None, 170, 50)           0         
                                                                 
 conv1d (Conv1D)             (None, 168, 250)          37750     
                                                                 
 global_max_pooling1d (Globa  (None, 250)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 250)               62750     
                                                                 
 dropout_1 (Dropout)         (None, 250)               0         
                                                        

In [17]:
# Train on the Crowdflower split and report the elapsed time.
# perf_counter() is a monotonic clock intended for measuring durations;
# `time` is imported with the other modules at the top of the notebook.
# The held-out split is passed as validation data, so it is evaluated after
# every epoch.
start_time = time.perf_counter()
history_cf = model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)
time_taken = time.perf_counter() - start_time
print("\n Time taken: " , time_taken)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

 Time taken:  180.55857467651367


## WASSA

In [4]:
# Load the WASSA train and validation folders (one file per emotion in each).
train, validate = (
    process_docs('/content/drive/MyDrive/CZ4042/project/wassa/train'),
    process_docs('/content/drive/MyDrive/CZ4042/project/wassa/validate'),
)

anger-ratings-0to1.train.txt Imported already!
fear-ratings-0to1.train.txt Imported already!
joy-ratings-0to1.train.txt Imported already!
sadness-ratings-0to1.train.txt Imported already!
anger-ratings-0to1.dev.gold.txt Imported already!
fear-ratings-0to1.dev.gold.txt Imported already!
joy-ratings-0to1.dev.gold.txt Imported already!
sadness-ratings-0to1.dev.gold.txt Imported already!


In [10]:
# Merge both splits, then prune: keep only rows whose score is at least 0.7.
wassa = (
    pd.concat([train, validate])
    .loc[lambda frame: frame['score'] >= 0.7]
)
wassa

Unnamed: 0,id,tweet,tweettype,score
0,10000,How the fu*k! Who the heck! moved my fridge!.....,anger,0.938
1,10001,So my Indian Uber driver just called someone t...,anger,0.896
2,10002,@DPD_UK I asked for my parcel to be delivered ...,anger,0.896
3,10003,so ef whichever butt wipe pulled the fire alar...,anger,0.896
4,10004,Don't join @BTCare they put the phone down on ...,anger,0.896
...,...,...,...,...
35,40821,It feel like we lost a family member🙄😂,sadness,0.708
36,40822,My life went from happy to unhappy..,sadness,0.812
60,40846,Should of stayed in Dubai 😞,sadness,0.708
69,40855,Common app just randomly logged me out as I wa...,sadness,0.833


In [11]:
# One-hot encode the emotion label: one indicator column per distinct label.
y = pd.get_dummies(wassa.loc[:, 'tweettype'])
# Report container type, shape and rank, one per line.
print(type(y), y.shape, y.ndim, sep='\n')
y.head()

<class 'pandas.core.frame.DataFrame'>
(625, 4)
2


Unnamed: 0,anger,fear,joy,sadness
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


In [12]:
# Convert the one-hot frame to a plain NumPy array for Keras.
# NOTE: this rebinds `y`, so the cell cannot be re-run without first
# re-running the cell that builds the one-hot DataFrame.
y = y.to_numpy()
print("properties of y")
print(
    f"type : {type(y)}, dimensions : {y.ndim}, shape : {y.shape}, "
    f"total no. of elements : {y.size}, data type of each element: {y.dtype}, "
    f"size of each element {y.itemsize} bytes"
)

properties of y
type : <class 'numpy.ndarray'>, dimensions : 2, shape : (625, 4), total no. of elements : 2500, data type of each element: uint8, size of each element 1 bytes


In [13]:
# Distinct emotion labels, in order of first appearance in the data.
list_of_classes = list(wassa['tweettype'].drop_duplicates())

In [14]:
# Drop everything except the tweet text; labels were already encoded into `y`.
# NOTE: rebinding `wassa` means this cell fails if run twice in a row.
wassa = wassa.drop(columns=['tweettype', 'id', 'score'])

In [15]:
# Raw tweet texts as a 1-D array. Selecting the column by name (rather than
# flattening the whole frame) keeps `x` aligned one-to-one with the rows of
# `y` even if `wassa` still carries other columns.
x = wassa['tweet'].to_numpy()

In [17]:
# Length of the longest tweet. This is len() of the raw string, i.e. a
# character count, not a token count.
print("Max. sentence length is: ", max(len(text) for text in wassa['tweet']))

Max. sentence length is:  158


In [18]:
# --- Vocabulary / input shape ---
max_features = 20000      # Tokenizer num_words and Embedding input size
max_text_length = 160     # pad_sequences maxlen and Embedding input_length
embedding_dims = 50       # width of each embedding vector

# --- Network ---
filter_size = 3           # Conv1D kernel_size
num_filters_1 = 250       # Conv1D filters
num_filters_2 = 250       # units of the hidden Dense layer (despite the name)

# --- Training ---
batch_size = 10
epochs = 15

In [20]:
# Build the word index from the tweets, then turn every tweet into a
# fixed-length integer sequence.
# NOTE(review): the tokenizer is fitted on all texts, before the train/test
# split below, so the vocabulary also reflects the held-out rows.
x_tokenizer = Tokenizer(num_words=max_features)
x_tokenizer.fit_on_texts(x.tolist())
x_tokenized = x_tokenizer.texts_to_sequences(x)
x_train_val = pad_sequences(sequences=x_tokenized, maxlen=max_text_length)

In [21]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_train_val,
    y,
    test_size=0.2,
    random_state=1000,
)

In [22]:
# Sanity check: feature/label shapes for the train split, then the test split.
print(f"{x_train.shape} {y_train.shape}\n{x_test.shape} {y_test.shape}")

(500, 160) (500, 4)
(125, 160) (125, 4)


In [23]:
# 1-D CNN text classifier:
#   embedding -> dropout -> conv -> global max-pool -> dense -> softmax
# The output width is taken from the one-hot label matrix instead of being
# hard-coded, so it always matches the number of classes in the data.
num_classes = y_train.shape[1]

model = Sequential()

model.add(layers.Embedding(max_features, embedding_dims, input_length=max_text_length))
model.add(layers.Dropout(0.2))

model.add(layers.Conv1D(filters=num_filters_1, kernel_size=filter_size, padding='valid', activation='relu', strides=1))
model.add(layers.GlobalMaxPooling1D())

model.add(layers.Dense(num_filters_2))
model.add(layers.Dropout(0.2))
model.add(layers.Activation('relu'))

# Every sample has exactly one label (y is one-hot), so the class scores are
# normalised jointly with softmax rather than with independent sigmoids.
# NOTE(review): softmax outputs are conventionally trained with
# categorical_crossentropy; check the loss passed to model.compile.
model.add(layers.Dense(num_classes))
model.add(layers.Activation('softmax'))

In [24]:
# The targets are one-hot with a single active class per row, i.e. this is a
# single-label multi-class problem, so categorical cross-entropy is used.
# With binary_crossentropy, Keras resolves the 'accuracy' string to
# element-wise binary accuracy, which is inflated by the many correctly
# predicted zeros in every one-hot row; with this loss it resolves to
# categorical accuracy (is the arg-max class correct).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 160, 50)           1000000   
                                                                 
 dropout (Dropout)           (None, 160, 50)           0         
                                                                 
 conv1d (Conv1D)             (None, 158, 250)          37750     
                                                                 
 global_max_pooling1d (Globa  (None, 250)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 250)               62750     
                                                                 
 dropout_1 (Dropout)         (None, 250)               0         
                                                        

In [25]:
# Train on the WASSA split and report the elapsed time.
# perf_counter() is a monotonic clock intended for measuring durations;
# `time` is imported with the other modules at the top of the notebook.
# The held-out split is passed as validation data, so it is evaluated after
# every epoch.
start_time = time.perf_counter()
history_wassa = model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)
time_taken = time.perf_counter() - start_time
print("\n Time taken: " , time_taken)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

 Time taken:  21.29413890838623


## Emotions

In [18]:
# Load the three Emotions files. Each is read as ';'-separated text with the
# two columns named `col` (text) and `senti` (label).
train, test, valid = (
    pd.read_csv(path, sep=';', engine='python', names=['col', 'senti'])
    for path in (
        '/content/drive/MyDrive/CZ4042/project/train.txt',
        '/content/drive/MyDrive/CZ4042/project/test.txt',
        '/content/drive/MyDrive/CZ4042/project/val.txt',
    )
)

In [19]:
# Stack the three files into one frame. Each part keeps its own row index, so
# index values repeat across parts.
emo = pd.concat((train, test, valid))
emo

Unnamed: 0,col,senti
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
1995,im having ssa examination tomorrow in the morn...,sadness
1996,i constantly worry about their fight against n...,joy
1997,i feel its important to share this info for th...,joy
1998,i truly feel that if you are passionate enough...,joy


In [20]:
# One-hot encode the emotion label: one indicator column per distinct label.
y = pd.get_dummies(emo.loc[:, 'senti'])
# Report container type, shape and rank, one per line.
print(type(y), y.shape, y.ndim, sep='\n')
y.head()

<class 'pandas.core.frame.DataFrame'>
(20000, 6)
2


Unnamed: 0,anger,fear,joy,love,sadness,surprise
0,0,0,0,0,1,0
1,0,0,0,0,1,0
2,1,0,0,0,0,0
3,0,0,0,1,0,0
4,1,0,0,0,0,0


In [21]:
# Convert the one-hot frame to a plain NumPy array for Keras.
# NOTE: this rebinds `y`, so the cell cannot be re-run without first
# re-running the cell that builds the one-hot DataFrame.
y = y.to_numpy()
print("properties of y")
print(
    f"type : {type(y)}, dimensions : {y.ndim}, shape : {y.shape}, "
    f"total no. of elements : {y.size}, data type of each element: {y.dtype}, "
    f"size of each element {y.itemsize} bytes"
)

properties of y
type : <class 'numpy.ndarray'>, dimensions : 2, shape : (20000, 6), total no. of elements : 120000, data type of each element: uint8, size of each element 1 bytes


In [22]:
# Distinct emotion labels, in order of first appearance in the data.
list_of_classes = list(emo['senti'].drop_duplicates())

In [24]:
# Drop the label column, leaving only the text; labels are already in `y`.
# NOTE: rebinding `emo` means this cell fails if run twice in a row.
emo = emo.drop(columns=['senti'])

In [25]:
# Raw texts as a 1-D array. Selecting the column by name (rather than
# flattening the whole frame) keeps `x` aligned one-to-one with the rows of
# `y` even if `emo` still carries other columns.
x = emo['col'].to_numpy()

In [26]:
# Length of the longest text. This is len() of the raw string, i.e. a
# character count, not a token count.
print("Max. sentence length is: ", max(len(text) for text in emo['col']))

Max. sentence length is:  300


In [27]:
# --- Vocabulary / input shape ---
max_features = 20000      # Tokenizer num_words and Embedding input size
max_text_length = 300     # pad_sequences maxlen and Embedding input_length
embedding_dims = 50       # width of each embedding vector

# --- Network ---
filter_size = 3           # Conv1D kernel_size
num_filters_1 = 250       # Conv1D filters
num_filters_2 = 250       # units of the hidden Dense layer (despite the name)

# --- Training ---
batch_size = 10
epochs = 15

In [28]:
# Build the word index from the texts, then turn every text into a
# fixed-length integer sequence.
# NOTE(review): the tokenizer is fitted on all texts, before the train/test
# split below, so the vocabulary also reflects the held-out rows.
x_tokenizer = Tokenizer(num_words=max_features)
x_tokenizer.fit_on_texts(x.tolist())
x_tokenized = x_tokenizer.texts_to_sequences(x)
x_train_val = pad_sequences(sequences=x_tokenized, maxlen=max_text_length)

In [31]:
# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_train_val,
    y,
    test_size=0.2,
    random_state=1000,
)

In [32]:
# Sanity check: feature/label shapes for the train split, then the test split.
print(f"{x_train.shape} {y_train.shape}\n{x_test.shape} {y_test.shape}")

(16000, 300) (16000, 6)
(4000, 300) (4000, 6)


In [33]:
# 1-D CNN text classifier:
#   embedding -> dropout -> conv -> global max-pool -> dense -> softmax
# The output width is taken from the one-hot label matrix instead of being
# hard-coded, so it always matches the number of classes in the data.
num_classes = y_train.shape[1]

model = Sequential()

model.add(layers.Embedding(max_features, embedding_dims, input_length=max_text_length))
model.add(layers.Dropout(0.2))

model.add(layers.Conv1D(filters=num_filters_1, kernel_size=filter_size, padding='valid', activation='relu', strides=1))
model.add(layers.GlobalMaxPooling1D())

model.add(layers.Dense(num_filters_2))
model.add(layers.Dropout(0.2))
model.add(layers.Activation('relu'))

# Every sample has exactly one label (y is one-hot), so the class scores are
# normalised jointly with softmax rather than with independent sigmoids.
# NOTE(review): softmax outputs are conventionally trained with
# categorical_crossentropy; check the loss passed to model.compile.
model.add(layers.Dense(num_classes))
model.add(layers.Activation('softmax'))

In [34]:
# The targets are one-hot with a single active class per row, i.e. this is a
# single-label multi-class problem, so categorical cross-entropy is used.
# With binary_crossentropy, Keras resolves the 'accuracy' string to
# element-wise binary accuracy, which is inflated by the many correctly
# predicted zeros in every one-hot row; with this loss it resolves to
# categorical accuracy (is the arg-max class correct).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 300, 50)           1000000   
                                                                 
 dropout_2 (Dropout)         (None, 300, 50)           0         
                                                                 
 conv1d_1 (Conv1D)           (None, 298, 250)          37750     
                                                                 
 global_max_pooling1d_1 (Glo  (None, 250)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 250)               62750     
                                                                 
 dropout_3 (Dropout)         (None, 250)               0         
                                                      

In [36]:
# Train on the Emotions split and report the elapsed time.
# perf_counter() is a monotonic clock intended for measuring durations;
# `time` is imported with the other modules at the top of the notebook.
# The held-out split is passed as validation data, so it is evaluated after
# every epoch.
start_time = time.perf_counter()
history_emo = model.fit(
    x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)
time_taken = time.perf_counter() - start_time
print("\n Time taken: " , time_taken)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

 Time taken:  81.40405797958374
