In [0]:
import os
import re
import json
import pandas as pd
from pathlib import Path
import sys
import pickle

In [0]:
# Load the pickled tokenizer object from disk.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so this
# must only ever point at a trusted artifact.
tokenizer_file = Path('/sentiment140/tokenizer.pickle').resolve()
tokenizer = pickle.loads(tokenizer_file.read_bytes())

In [0]:
from tensorflow.keras.layers import Input, Embedding, GRU
from tensorflow.keras.layers import Dropout, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Dense
from tensorflow.keras.models import Sequential

In [0]:
# Hyperparameters used to rebuild the network below.
# NOTE(review): these presumably have to match the configuration the saved
# weights were trained with — confirm before changing any of them.

# Vocabulary size is the number of known words plus one (Keras word indices
# start at 1). `tokenizer.num_words` is None when no vocabulary cap was set, and
# min(None, int) raises TypeError, so the cap is applied only when it exists.
input_dim = len(tokenizer.word_index) + 1
if tokenizer.num_words is not None:
    input_dim = min(tokenizer.num_words, input_dim)
embedding_dim = 200
input_length = 100
gru_units = 128
gru_dropout = 0.1
recurrent_dropout = 0.1
dropout = 0.1

In [0]:
# Build the classifier as a single layer stack: embedding -> bidirectional GRU
# (full sequence output) -> max over time -> small dense head -> sigmoid score.
model = Sequential([
    Embedding(
        input_dim=input_dim,
        output_dim=embedding_dim,
        input_shape=(input_length,),
    ),
    Bidirectional(
        GRU(
            gru_units,
            return_sequences=True,
            dropout=gru_dropout,
            recurrent_dropout=recurrent_dropout,
        )
    ),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(dropout),
    Dense(1, activation='sigmoid'),
])



In [0]:
# Load the saved weights from the .h5 file into the model built above.
weights_path = Path('/sentiment_analysis/gru_model.h5').resolve()
model.load_weights(weights_path.as_posix())

In [0]:
import json

# Lookup table read from JSON; later cells index it as relations[query] to get
# the emotion label for a query.
relations_path = Path('query_relations.json')
relations = json.loads(relations_path.read_text())

In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm
import pandas as pd
import numpy as np
import re

In [0]:
# Directory whose entries are listed and read as CSV files by the scoring loop.
dataset_dir = Path('/datasets/tweepy').resolve()

In [0]:
import re
import nltk
from time import time
!pip install emoji
from emoji import demojize
nltk.download('stopwords')

def typecast_str(text):
  """Return the ``str()`` representation of ``text``."""
  as_string = str(text)
  return as_string

def preprocess(texts, quiet=False):
  """Clean a pandas Series of raw texts and return the cleaned Series.

  Steps, in order: cast every value to ``str``, lowercase, drop tokens that
  start with ``http`` or ``@``, replace emoji by their ``:name:`` form,
  normalise apostrophes, blank out every character outside ``a-z ' : _``,
  collapse runs of three or more identical characters to one, expand
  ``can't``/``cannot``/``n't`` negations, and remove English stop words
  (keeping ``not``, ``nor`` and ``no``).

  Args:
    texts: pandas Series of texts (any dtype; values are cast with ``str``).
    quiet: when False, print the elapsed clean-up time.

  Returns:
    pandas Series of cleaned, space-separated strings.
  """
  start = time()
  texts = texts.apply(typecast_str)
  # Lowercasing
  texts = texts.str.lower()

  # Remove special chars.
  # `regex` is passed explicitly on every call: the implicit default of
  # Series.str.replace changed from True to False in pandas 2.0, which would
  # silently turn these patterns into literal strings.
  texts = texts.str.replace(r"(http|@)\S+", "", regex=True)
  texts = texts.apply(demojize)
  # Separate back-to-back emoji names (":a::b:" -> ":a: :b:").
  texts = texts.str.replace(r"::", ": :", regex=False)
  texts = texts.str.replace(r"’", "'", regex=False)
  texts = texts.str.replace(r"[^a-z\':_]", " ", regex=True)

  # Remove repetitions
  pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
  # A compiled pattern is rejected with ValueError when regex=False.
  texts = texts.str.replace(pattern, r"\1", regex=True)

  # Transform short negation form
  texts = texts.str.replace(r"(can't|cannot)", 'can not', regex=True)
  texts = texts.str.replace(r"n't", ' not', regex=False)

  # Remove stop words; negations are kept.
  # A set gives constant-time membership tests for every word of every text.
  stopwords = set(nltk.corpus.stopwords.words('english')) - {'not', 'nor', 'no'}
  texts = texts.apply(
    lambda x: ' '.join([word for word in x.split() if word not in stopwords])
  )

  if not quiet:
    print("Time to clean up: {:.2f} sec".format(time() - start))

  return texts

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/40/8d/521be7f0091fe0f2ae690cc044faf43e3445e0ff33c574eae752dd7e39fa/emoji-0.5.4.tar.gz (43kB)
[K     |███████▌                        | 10kB 21.1MB/s eta 0:00:01[K     |███████████████                 | 20kB 4.0MB/s eta 0:00:01[K     |██████████████████████▋         | 30kB 5.3MB/s eta 0:00:01[K     |██████████████████████████████▏ | 40kB 5.4MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 3.0MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.5.4-cp36-none-any.whl size=42176 sha256=06b2e2c903121a7bd99f4024d9221a19202f1680131075a30d22511cc4b405fb
  Stored in directory: /root/.cache/pip/wheels/2a/a9/0a/4f8e8cce8074232aba240caca3fade315bb49fac68808d1a9c
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.5.4
[nltk_data] Downloading package sto

In [0]:
# Score every CSV in dataset_dir with the model and collect:
#   query_dict - per-file (per-query) summary statistics of the predictions
#   data_dict  - emotion label -> array of all predictions for that emotion
data_dict = {}

query_dict = {
    'query': [],
    'mean': [],
    'max': [],
    'min': [],
    'std': [],
    'count': [],
    'emotion': []
}

dir_files = os.listdir(dataset_dir)

# Prediction arrays are gathered per emotion and joined once after the loop,
# instead of re-copying the growing array on every file.
predictions_by_emotion = {}

with tqdm(total=len(dir_files)) as t:
    for filename in dir_files:
        # The query is the hashtag (up to the first dot) or the colon-delimited
        # emoji name embedded in the file name. Resolve it before reading the
        # file so stray directory entries are skipped instead of crashing.
        matches = re.findall(r'(#[^.]+|:.+:)', filename)
        if not matches:
            tqdm.write('Skipping "' + filename + '": no query found in file name')
            t.update()
            continue
        query = matches[0]

        dataset = pd.read_csv(os.path.join(dataset_dir, filename), engine="python")
        cleaned_texts = preprocess(dataset.text, quiet=True)

        predict_sequences = [text.split() for text in cleaned_texts]
        list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)
        # Pad to the sequence length the model was built with.
        x_predict = pad_sequences(list_tokenized_predict, maxlen=input_length)

        result = model.predict(x_predict)

        emotion = relations[query]
        query_dict['query'].append(query)
        query_dict['mean'].append(np.mean(result))
        query_dict['max'].append(np.amax(result))
        query_dict['min'].append(np.amin(result))
        query_dict['count'].append(len(dataset))
        query_dict['std'].append(np.std(result))
        query_dict['emotion'].append(emotion)

        predictions_by_emotion.setdefault(emotion, []).append(result)

        t.update()

for emotion, chunks in predictions_by_emotion.items():
    data_dict[emotion] = np.concatenate(chunks)

100%|██████████| 33/33 [12:12<00:00, 22.20s/it]


In [0]:
# Per-query statistics as a table, shown as one sub-table per emotion in order
# of first appearance.
df = pd.DataFrame(data=query_dict)
for emotion in df['emotion'].unique():
    rows_for_emotion = df['emotion'] == emotion
    display(df.loc[rows_for_emotion])

Unnamed: 0,query,mean,max,min,std,count,emotion
0,:red_heart:,0.822659,0.997705,0.004259,0.222622,9000,joy
1,:face_with_tears_of_joy:,0.569252,0.998579,0.003522,0.27156,9000,joy
2,:grinning_face_with_smiling_eyes:,0.687733,0.996813,0.00267,0.270469,8800,joy
3,:smiling_face_with_smiling_eyes:,0.767773,0.996919,0.004499,0.246835,9000,joy
17,#happiness,0.915877,0.99732,0.014244,0.138561,10000,joy
18,#joy,0.810568,0.997132,0.012698,0.198349,10000,joy
19,#excited,0.911127,0.997508,0.036122,0.128728,10000,joy


Unnamed: 0,query,mean,max,min,std,count,emotion
4,:face_screaming_in_fear:,0.537975,0.994955,0.000814,0.282369,9000,fear
5,:fearful_face:,0.47187,0.991604,0.001679,0.280883,9000,fear
6,#fear,0.522579,0.987383,0.006821,0.264976,2319,fear
7,:anxious_face_with_sweat:,0.427369,0.992529,0.001351,0.28893,17983,fear
23,#anxious,0.45575,0.993768,0.002243,0.295146,5000,fear
24,#scared,0.242051,0.899232,0.002594,0.202835,372,fear
28,#scary,0.375054,0.981418,0.004209,0.238416,10000,fear
29,#worried,0.121295,0.97589,0.001068,0.13204,10000,fear
30,#afraid,0.393429,0.933373,0.002112,0.265732,10000,fear


Unnamed: 0,query,mean,max,min,std,count,emotion
8,:pouting_face:,0.425764,0.996427,0.002186,0.284386,8995,anger
9,:face_with_steam_from_nose:,0.505153,0.993641,0.00241,0.29282,9000,anger
10,:face_with_symbols_on_mouth:,0.394176,0.99231,0.003269,0.27614,9000,anger
11,:anger_face:,0.396917,0.990181,0.008929,0.24664,1987,anger
20,#pissed,0.144151,0.950584,0.002865,0.139335,5000,anger
21,#angry,0.185236,0.979037,0.000885,0.181135,5000,anger
22,#hateyou,0.470552,0.994972,0.001251,0.309794,5000,anger
25,#mad,0.435405,0.989492,0.000885,0.264409,10000,anger
26,#pissedoff,0.315363,0.992776,0.000955,0.262643,10000,anger
27,#furious,0.395957,0.986565,0.001284,0.274739,10000,anger


Unnamed: 0,query,mean,max,min,std,count,emotion
12,#sad,0.047621,0.841336,0.000559,0.078319,7643,sadness
13,:crying_face:,0.417462,0.99372,0.001329,0.313953,7200,sadness
14,:pensive_face:,0.441474,0.99623,0.000841,0.303467,8200,sadness
15,:loudly_crying_face:,0.520543,0.997415,0.003074,0.306719,9000,sadness
16,:broken_heart:,0.407137,0.995162,0.001961,0.301669,8988,sadness
31,#depressed,0.090225,0.973555,0.000565,0.120993,10000,sadness
32,#depression,0.238804,0.969783,0.000939,0.209054,10000,sadness


In [0]:
# Summary statistics of the pooled predictions of each emotion; every column is
# derived from data_dict in its iteration order.
emotion_dict = {
    'emotion': list(data_dict.keys()),
    'mean': [np.mean(scores) for scores in data_dict.values()],
    'max': [np.amax(scores) for scores in data_dict.values()],
    'min': [np.amin(scores) for scores in data_dict.values()],
    'std': [np.std(scores) for scores in data_dict.values()],
    'count': [len(scores) for scores in data_dict.values()],
}

emotion_df = pd.DataFrame(data=emotion_dict)
display(emotion_df)

Unnamed: 0,emotion,mean,max,min,std,count
0,joy,0.788221,0.998579,0.00267,0.243668,65800
1,fear,0.397052,0.994955,0.000814,0.286923,73674
2,anger,0.380894,0.996427,0.000885,0.283283,73982
3,sadness,0.305161,0.997415,0.000559,0.300929,61031


In [0]:
def get_score_range(mean, threshold=0.7):
  """Return the ``(low, high)`` score interval on the same side as ``mean``.

  Args:
    mean: mean prediction score of a group.
    threshold: cut-off deciding which side of the score scale is returned.
      Defaults to 0.7, the value that was previously hard-coded.

  Returns:
    ``(0.0, mean)`` when ``mean`` is below ``threshold``, otherwise
    ``(mean, 1.0)``.
  """
  if mean < threshold:
    return (0.0, mean)
  return (mean, 1.0)

In [0]:
# Directory whose CSV files are grouped by emotion in the next cell.
# NOTE(review): this is the same path as `dataset_dir` defined earlier in the
# notebook — confirm whether the two are meant to stay in sync.
files_dir = Path('/datasets/tweepy').resolve()

In [0]:
# Build emotion label -> DataFrame holding the rows of every file whose query
# maps to that emotion.
emotion_data_dict = {}

filenames = os.listdir(files_dir)

# Frames are collected per emotion and concatenated once afterwards; calling
# pd.concat inside the loop re-copies all previously loaded rows for each file.
frames_by_emotion = {}

with tqdm(total=len(filenames)) as t:
    for filename in filenames:
        # The query is the hashtag (up to the first dot) or the colon-delimited
        # emoji name embedded in the file name; entries without one are skipped
        # instead of raising IndexError.
        matches = re.findall(r'(#[^.]+|:.+:)', filename)
        if not matches:
            tqdm.write('Skipping "' + filename + '": no query found in file name')
            t.update()
            continue
        query = matches[0]
        emotion = relations[query]

        file_data = pd.read_csv(os.path.join(files_dir, filename), engine="python")
        frames_by_emotion.setdefault(emotion, []).append(file_data)
        t.update()

for emotion, frames in frames_by_emotion.items():
    emotion_data_dict[emotion] = pd.concat(frames)

100%|██████████| 33/33 [00:02<00:00, 14.73it/s]


In [0]:
# For each emotion: score all of its texts, derive the score interval from the
# mean prediction, keep only the rows whose score lies inside that interval and
# tag them with the emotion as `label`.
result_data = []
messages = []
with tqdm(total=len(emotion_data_dict)) as t:
    for emotion, dataset in emotion_data_dict.items():
        t.set_description('Processing "' + emotion + '" data')

        cleaned_texts = preprocess(dataset.text, quiet=True)
        predict_sequences = [text.split() for text in cleaned_texts]
        list_tokenized_predict = tokenizer.texts_to_sequences(predict_sequences)
        # Pad to the sequence length the model was built with.
        x_predict = pad_sequences(list_tokenized_predict, maxlen=input_length)

        result = model.predict(x_predict)
        mean = np.mean(result)
        low, high = get_score_range(mean)
        messages.append(emotion.capitalize() + ": Score Range: {:4f} - {:4f}".format(low, high))

        # The model ends in Dense(1), so predictions come back as one column per
        # row; flatten them to get a plain one-dimensional row mask.
        scores = np.ravel(result)
        in_range = (scores >= low) & (scores <= high)
        # Copy the selection so adding a column never touches the source frame.
        dataset = dataset[in_range].copy()
        dataset.insert(0, 'label', emotion)

        result_data.append(dataset)
        t.update()

for message in messages:
    print(message)

Processing "sadness" data: 100%|██████████| 4/4 [11:09<00:00, 167.33s/it]

Joy: Score Range: 0.788221 - 1.000000
Fear: Score Range: 0.000000 - 0.397052
Anger: Score Range: 0.000000 - 0.380894
Sadness: Score Range: 0.000000 - 0.305161





In [0]:
# Combine the per-emotion frames and write them to dataset.csv in the current
# working directory.
if len(result_data) > 0:
    # After the first run `result_data` is already the combined DataFrame;
    # concatenating again would raise, so only combine while it is still a list.
    if not isinstance(result_data, pd.DataFrame):
        result_data = pd.concat(result_data)

    path = Path('dataset.csv').resolve()
    # index=False is the documented way to omit the row index column.
    result_data.to_csv(path, index=False)

    print('Files saved under "' + path.as_posix() + '"')