# LSTM TRAIN CODE

## Import Libraries

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)
import numpy as np
from numpy import load
import pickle

import tensorflow as tf
import keras
from keras.preprocessing import text, sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, InputLayer, LSTM, GRU, Conv1D, MaxPooling1D, Flatten, Dropout, Activation, GlobalMaxPool1D
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from keras import backend as K
from keras.utils import np_utils

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

import torch # pytorch
import torch.nn as nn
import torch.nn.functional as F
import torchtext.vocab
from torchtext.legacy import data
from torchtext.legacy import datasets

import gensim
from gensim.models import Word2Vec, KeyedVectors

import re
import string
from string import punctuation
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

import time
import statistics

# Open-source Sentiment Analysis libraries
from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# !pip install emoji
# from preprocessing_steps import Preprocess



## Load Google drive

In [None]:
# Mount Google Drive so the notebook can read its data files and write results to it.
# NOTE(review): `google.colab` is only importable inside Google Colab.
from google.colab import auth, files, drive
drive.mount('/content/gdrive')
# Make the project folder the working directory; the relative paths used in later
# cells ("data/...", "data1/...", "remove_emojitag_weights/...") resolve against it.
%cd /content/gdrive/MyDrive/Colab_Project/tweet_sentiment_analysis

In [None]:
# Pick the torch device: CUDA when a GPU is visible to torch, otherwise the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

## Load preprocessed data for lstm model (2020_09_2021_02)

In [None]:
# Labelled training tweets; rows missing either text column are discarded on load.
file_path = "data1/emojitag_remove_finalized_tweets_training_data_2020_09_to_2021_02.csv"
data_df = pd.read_csv(file_path).dropna(subset=["full_text", "processed_text"])
data_copy = data_df.copy()  # full-width snapshot taken before the column selection below
print(len(data_df))

# Keep only the columns used downstream and force processed_text to str.
data_df = data_df[["full_text", "processed_text", "sentiment_Label_AVG"]].assign(
    processed_text=lambda frame: frame["processed_text"].astype(str)
)
data_df.sample(n=2)

## Generate ground truth using textblob and vader

### TextBlob

In [None]:
# start = time.time()
# sentiment_TextBlob = []
# for i in range(len(data_df)):
#     blob = TextBlob(data_df.iloc[i]["full_text"]) # Note: On unprocessed full_text
#     sentiment = "neutral"
#     if blob.sentiment.polarity == 0:
#         sentiment = "neutral"
#     elif blob.sentiment.polarity > 0:
#         sentiment = "positive"
#     elif blob.sentiment.polarity < 0:
#         sentiment = "negative"
        
#     sentiment_TextBlob.append(sentiment)

# # Add in new column called "sentiment_TextBlob" with the sentiment value from TextBlob
# data_df["sentiment_TextBlob"] = sentiment_TextBlob
# print(str(len(data_df)) + " records")
# print("\nTotal time taken in minutes: {:.4f}".format((time.time()-start) / 60))

### Vader

In [None]:
# sia = SentimentIntensityAnalyzer()

# start = time.time()
# sentiment_Vader = []
# for i in range(len(data_df)):
#     # scores_dict = sia.polarity_scores(data_copy.iloc[i]["text"]) # Note: On unprocessed text because it has an exhaustive inbuilt preprocessing
#     scores_dict = sia.polarity_scores(data_df.iloc[i]["full_text"]) # Note: On unprocessed text because it has an exhaustive inbuilt preprocessing
#     sentiment = "neutral"
#     if scores_dict["compound"] == 0:
#         sentiment = "neutral"
#     elif scores_dict["compound"] > 0:
#         sentiment = "positive"
#     elif scores_dict["compound"] < 0:
#         sentiment = "negative"
        
#     sentiment_Vader.append(sentiment)

# # Add in new column called "sentiment_Vader" with the sentiment value from Vader
# data_df["sentiment_Vader"] = sentiment_Vader
# print(str(len(data_df)) + " records")
# print("\nTotal time taken in minutes: {:.4f}".format((time.time()-start) / 60))

### Textblob + Vader

In [None]:
# raw_df = pd.read_csv("data1/emojitag_remove_finalized_tweets_training_data_2020_09_to_2021_02.csv")

# start = time.time()
# sia = SentimentIntensityAnalyzer()
# sentiment_TV = []
# for i in range(len(raw_df)):
#     blob = TextBlob(raw_df.iloc[i]["full_text"]) # Note: On unprocessed full_text
#     scores_dict = sia.polarity_scores(raw_df.iloc[i]["full_text"]) # Note: On unprocessed text because it has an exhaustive inbuilt preprocessing
#     sentiment = "neutral"
#     average_score = (blob.sentiment.polarity + scores_dict["compound"])/2
#     # print(average_score)
#     if average_score > 0.1:
#         sentiment = "positive"
#     elif average_score < -0.1:
#         sentiment = "negative"
#     else: 
#         sentimet = "neutral"
#     sentiment_TV.append(sentiment)
# raw_df["sentiment_TV"] = sentiment_TV

# print(str(len(raw_df)) + " records")
# print("\nTotal time taken in minutes: {:.4f}".format((time.time()-start) / 60))
# raw_df.sample(n=5)


In [None]:
# raw_df.to_csv("data1/emojitag_remove_finalized_tweets_training_data_2020_09_to_2021_02.csv", index=False)

## Check Ground Truth Label Distribution

In [None]:
# sb.set(style='whitegrid')
# sb.countplot(x='sentiment_Label_AVG', data=data_df) # Balanced data

In [None]:
# sb.set(style='whitegrid')
# sb.countplot(x='sentiment_Vader', data=data_df) # Balanced data

In [None]:
# sb.set(style='whitegrid')
# sb.countplot(x='sentiment_TextBlob', data=data_df) # Balanced data

## Split data into train and test data

In [None]:
# Hold out 20% of the labelled tweets as the test set. random_state=0 fixes the
# shuffle so the split is the same on every run. No `stratify` argument is passed,
# so the label proportions of the two parts are not forced to match.
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=0)
print(len(train_df))
train_df.head()

# Word Embeddings

## Use Self-Trained Word2Vec embedding model

### Using 2020_06_2021_02 to generate word2vec model

In [None]:
# Tweet corpus used to train the Word2Vec embeddings; rows missing either text column are dropped.
embedding_train_df = (
    pd.read_csv(
        'data/emojitag_remove_filtered_tweets_filtered_compiled_2020_06_to_2021_02.csv',
        encoding='utf-8',
    )
    .dropna(subset=["full_text", "processed_text"])
)
print(len(embedding_train_df))
embedding_train_df[["id", "full_text", "processed_text"]] # Note: No labels. Keep id for cross-referencing


In [None]:
# Train word2Vec model for LSTM embedding layer
EMBED_DIM = 100            # dimensionality of each word vector
WORD2VEC_WINDOW = 7        # context window passed to Word2Vec
WORD2VEC_EPOCH = 50        # training passes over the corpus
WORD2VEC_MIN_COUNT = 10    # words seen fewer times than this are left out of the vocabulary

# One whitespace-tokenized word list per tweet. The astype(str) cast mirrors the
# guard applied to the labelled data: any non-string cell in processed_text would
# otherwise raise AttributeError on .split().
doc = [txt.split() for txt in embedding_train_df["processed_text"].astype(str)]

word2vec_model = gensim.models.word2vec.Word2Vec(size=EMBED_DIM,
                                                 window=WORD2VEC_WINDOW,
                                                 min_count=WORD2VEC_MIN_COUNT,
                                                 workers=8)

# Build the vocabulary first so its size can be reported before the (slow) training run.
word2vec_model.build_vocab(doc)
words = word2vec_model.wv.vocab.keys()
print("Total num of words in vocab: ", len(words))

start = time.time()
word2vec_model.train(doc, total_examples=len(doc), epochs=WORD2VEC_EPOCH)
print("\nTotal time taken in minutes: {:.4f}".format((time.time()-start) / 60))

# Save word2vec model
# filename = "lstm_word2vec_model_from_tweets_" + str(EMBED_DIM) + "_" + str(WORD2VEC_WINDOW) + "_" + str(WORD2VEC_EPOCH) + "_" + str(WORD2VEC_MIN_COUNT) + ".w2v"
# word2vec_model.save(filename)

In [None]:
# print(word2vec_model.wv[''])
# Sanity check on the trained embeddings: print the 10 entries most_similar() returns for 'covid'.
print(word2vec_model.wv.most_similar('covid', topn=10))

# Tokenize text content 
- Tokenize the text content, then convert to a padded sequence to fit into LSTM model

In [None]:
# deal with the train data

# The tokenizer is fitted on the training split only. filters='' overrides the
# Tokenizer's default character filter list with an empty one.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts(train_df["processed_text"])
print(tokenizer)
VOCAB_SIZE = len(tokenizer.word_index) + 1 # Note: index starts from 1 instead of 0. Therefore increase size by 1
print("VOCAB_SIZE:", VOCAB_SIZE)

# save tokenizer
tokenizer_file_path = "remove_emojitag_weights/lstm_tokenizer.pkl"
# Context manager guarantees the pickle file is flushed and closed even if dump() fails.
with open(tokenizer_file_path, "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=0)
tokenizer.word_index.items()

In [None]:
SEQUENCE_LENGTH = 100
# Encode each split with the fitted tokenizer, then run pad_sequences with maxlen=SEQUENCE_LENGTH.
x_train, x_test = (
    pad_sequences(tokenizer.texts_to_sequences(split_df["processed_text"]), maxlen=SEQUENCE_LENGTH)
    for split_df in (train_df, test_df)
)
# len(x_train[1])

## Preprocess labels

In [None]:
# def convert_label(list):
#     labels = []
#     for label in list:
#        if label == 'positive': labels.append(1)
#        elif label == 'neutral': labels.append(0)
#        elif label == 'negative': labels.append(-1)
#     return labels

# Map the string labels to integer class ids; the encoder is fitted on the training labels only.
encoder = LabelEncoder()
encoder.fit(train_df["sentiment_Label_AVG"].tolist())
print(train_df["sentiment_Label_AVG"].head())

y_train = encoder.transform(train_df["sentiment_Label_AVG"].tolist())
y_test = encoder.transform(test_df["sentiment_Label_AVG"].tolist())
# y_train = np.array(convert_label(train_df["sentiment_TV"].tolist()))
# y_test = np.array(convert_label(test_df["sentiment_TV"].tolist()))

# Integer ids as column vectors; kept separately because the sklearn metrics further
# down compare against these rather than the one-hot arrays.
y_train_num = y_train.reshape(-1, 1)
y_test_num = y_test.reshape(-1, 1)
print(y_train_num[:5])

# Pin the one-hot width to the number of classes the encoder knows. Without
# num_classes, to_categorical infers the width from the largest id present, so a
# split that happens to lack the highest class would get a narrower array.
num_classes = len(encoder.classes_)
y_train = np_utils.to_categorical(y_train_num, num_classes=num_classes)
y_test = np_utils.to_categorical(y_test_num, num_classes=num_classes)
print(y_train[:5])  # Note: 'positive': 2->[0 0 1] 'neutral': 1->[0 1 0]      'negative': 0->[1 0 0]


print("x_train shape", x_train.shape)
print("x_test shape", x_test.shape)
print("y_train shape", y_train.shape)
print("y_test shape", y_test.shape)
print()




# Build and compile LSTM model

In [None]:
# Location of the .npy file the embedding matrix is saved to and loaded back from.
embedding_matrix_file_path = "remove_emojitag_weights/lstm_word2vec_embedding_matrix.npy"

In [None]:
# Create embedding matrix for the embedding layer
EMBEDDING_MATRIX = np.zeros((VOCAB_SIZE, EMBED_DIM))

# Row `index` receives the self-trained Word2Vec vector of the token that the
# tokenizer maps to `index`; tokens missing from the Word2Vec vocabulary keep
# their all-zero row.
known_words = word2vec_model.wv.vocab
for word, index in tokenizer.word_index.items():
    if word not in known_words:
        continue
    EMBEDDING_MATRIX[index] = word2vec_model.wv[word]

print(EMBEDDING_MATRIX.shape)

In [None]:
# Save EMBEDDING_MATRIX(numpy array) as npy file
from numpy import asarray, save

save(file=embedding_matrix_file_path, arr=EMBEDDING_MATRIX)



## Load embedding matrix and tokenizer

In [None]:
# Load EMBEDDING_MATRIX(numpy array) from npy file
from numpy import load

EMBEDDING_MATRIX = load(embedding_matrix_file_path)
print("EMBEDDING_MATRIX SHAPE:", EMBEDDING_MATRIX.shape)

# Re-derive both sizes from the loaded array: first axis -> VOCAB_SIZE, second axis -> EMBED_DIM.
VOCAB_SIZE, EMBED_DIM = EMBEDDING_MATRIX.shape[:2]

print("VOCAB_SIZE - Embedding Matrix:", VOCAB_SIZE)
print("EMBED_DIM: ", EMBED_DIM)


In [None]:

def get_LSTM_Model():
    """Build and compile the LSTM sentiment classifier.

    Reads the notebook-level globals SEQUENCE_LENGTH, VOCAB_SIZE, EMBED_DIM and
    EMBEDDING_MATRIX at call time, so they must be defined before this is called.

    Returns:
        A compiled keras ``Sequential`` model that takes int32 token-id sequences
        of length SEQUENCE_LENGTH and ends in a 3-unit softmax layer. It is
        compiled with categorical cross-entropy loss, the Adam optimizer and the
        accuracy metric.
    """
    layer_stack = [
        InputLayer(input_shape=(SEQUENCE_LENGTH,), dtype='int32'),
        # Embedding initialised from EMBEDDING_MATRIX and frozen (trainable=False).
        Embedding(
            input_dim=VOCAB_SIZE,
            output_dim=EMBED_DIM,
            input_length=SEQUENCE_LENGTH,
            weights=[EMBEDDING_MATRIX],
            trainable=False,
        ),
        # return_sequences=True keeps the per-timestep outputs that the pooling layer consumes.
        LSTM(128, return_sequences=True),
        GlobalMaxPool1D(),
        Dropout(0.1),
        Dense(64, activation='relu'),
        Dropout(0.1),
        Dense(32, activation='relu'),
        Dropout(0.1),
        Dense(3, activation='softmax'),
    ]
    model = Sequential(layer_stack)

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

# Train LSTM model

In [None]:
# Model params
BATCH_SIZE = 1024
EPOCHS = 30

# Compile and get model
model_lstm = get_LSTM_Model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would emit an extra "None" line after the table.
model_lstm.summary()
print("\n")

# Callbacks
# reduce_lr watches the validation loss; early_stop watches the validation accuracy.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0)
early_stop = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=5) # To reduce overfitting
callbacks = [reduce_lr, early_stop]

# Train model
# validation_split=0.1 sets aside 10% of x_train/y_train for the val_* metrics the callbacks monitor.
start = time.time()
history = model_lstm.fit(x_train,
                         y_train,
                         batch_size=BATCH_SIZE,
                         epochs=EPOCHS,
                         verbose=1,
                         validation_split=0.1,
                         callbacks=callbacks)

# Get test accuracy
print('\nTest')
loss, accuracy = model_lstm.evaluate(x_test, y_test, batch_size=BATCH_SIZE, verbose=1)
print("\nTest Accuracy = {}".format(accuracy))

# The reported time covers both training and the test-set evaluation above.
print("\nTotal time taken in mins: {:.4f}".format((time.time()-start) / 60))

In [None]:
# Plot Train vs Validation accuracy
sb.set(font_scale=1.0) # Determine fontsize
train_acc, val_acc = (history.history[key] for key in ('accuracy', 'val_accuracy'))
# One line per curve: train in colour C0 (blue), validation in colour C1 (orange).
for series, color, label in (
    (train_acc, 'C0', "Train Accuracy"),
    (val_acc, 'C1', "Validation Accuracy"),
):
    plt.plot(series, color, label=label)
plt.legend(loc="upper left")
plt.xlabel("No. of epochs")
plt.ylabel("Accuracy Pct")
plt.show()

# Evaluate test data

In [None]:
# represent by number pos: 2, neu: 1, neg:0
# Predicted class id = index of the largest softmax output. This replaces
# Sequential.predict_classes(), which newer TensorFlow/Keras releases no longer
# provide, and matches how the self-labelled predictions are computed later on.
y_pred = np.argmax(model_lstm.predict(x_test, verbose=0), axis=-1)

# Manual cross-check of accuracy_score: share of positions where prediction == label.
# y_test_num is a column vector, so flatten it before the element-wise comparison.
print(len(y_pred) == len(y_test_num))
same = int(np.sum(y_test_num.ravel() == y_pred))
ac_check = same / len(y_test_num)
print("Manual accuracy check = {}".format(ac_check))

accuracy = accuracy_score(y_test_num, y_pred)
print("Test Accuracy = {}".format(accuracy))

f1 = f1_score(y_test_num, y_pred, average="macro")
print("Test F1 macro score = {}".format(f1))
# print(classification_report(y_test, y_pred, target_names = ['negative', 'positive']))

# Perform Sample Predictions

In [None]:
# # Print confusion matrix
# cm = confusion_matrix(y_test, y_pred)
# cm = pd.DataFrame(cm, index=['negative','positive'], columns=['negative','positive'])

# sb.set(font_scale=2.0) # Increase fontsize
# plt.figure(figsize = (10,10))
# sb.heatmap(cm, cmap="Blues", linecolor='black', linewidth=1, annot=True, fmt='')
# plt.xlabel("Predicted")
# # plt.ylabel("Actual")

# Save models

In [None]:
# Save the trained model to a single .h5 file.
# NOTE: this rebinds `file_path`, which earlier in the notebook held the training CSV path.
file_path = "remove_emojitag_weights/lstm_main_model.h5"
model_lstm.save(file_path)

# # EXPORT
# LSTM_MODEL = "model_lstm.h5"
# WORD2VEC_MODEL = "model.w2v"
# TOKENIZER_MODEL = "tokenizer.pkl"
# ENCODER_MODEL = "encoder.pkl"

# model_lstm.save(LSTM_MODEL)
# word2vec_model.save(WORD2VEC_MODEL)
# pickle.dump(tokenizer, open(TOKENIZER_MODEL, "wb"), protocol=0)
# pickle.dump(encoder, open(ENCODER_MODEL, "wb"), protocol=0)

# Test on manually labeled data

In [None]:
# sentiment_SELF
self_file_path = "data/emojitag_remove_filtered_2278.csv"
self_df = pd.read_csv(self_file_path)
# The tokenizer needs str input: a missing (NaN) or otherwise non-string cell would
# raise inside texts_to_sequences. Clean a temporary Series instead of self_df so
# the frame itself is left exactly as it was read.
self_texts = self_df["processed_text"].fillna("").astype(str)
x_self_test = pad_sequences(tokenizer.texts_to_sequences(self_texts), maxlen=SEQUENCE_LENGTH)

# Reuse the encoder fitted on the training labels so class ids mean the same thing here.
y_self_test = encoder.transform(self_df["sentiment_SELF"].tolist())
y_self_test_num = y_self_test.reshape(-1, 1)
# Pin the one-hot width to the encoder's class count so it matches the model's
# output layer even if this file happens to lack the highest class id.
y_self_test = np_utils.to_categorical(y_self_test_num, num_classes=len(encoder.classes_))

loss, accuracy = model_lstm.evaluate(x_self_test, y_self_test, batch_size=BATCH_SIZE, verbose=1)
print("\nSelf-labeled Test Accuracy = {}".format(accuracy))
self_df.head(1)

In [None]:
def v2sentiment(list):
    """Translate numeric class ids into sentiment names.

    Args:
        list: iterable of class ids, where 2 -> "positive", 1 -> "neutral"
            and 0 -> "negative".

    Returns:
        A new list of sentiment names in input order. Values equal to none of
        2, 1, 0 are skipped, so the result can be shorter than the input.
    """
    # Checked in this order with ==, exactly like an if/elif chain over 2, 1, 0.
    id_to_name = ((2, "positive"), (1, "neutral"), (0, "negative"))
    sentiments = []
    for v in list:
        name = next((text for code, text in id_to_name if v == code), None)
        if name is not None:
            sentiments.append(name)
    return sentiments
    
# Predicted class id per row = index of the largest model output.
self_class_probs = model_lstm.predict(x_self_test)
y_self_pred = np.argmax(self_class_probs, axis=-1)
self_df["sentiment_tagged"] = v2sentiment(y_self_pred)
# Writes the frame, now including the predictions, back to the same CSV it was read from.
self_df.to_csv(self_file_path, index=False)

# Score the predictions against the integer-encoded manual labels.
accuracy = accuracy_score(y_self_test_num, y_self_pred)
f1 = f1_score(y_self_test_num, y_self_pred, average="macro")
print("Self-labeled Test Accuracy = {}".format(accuracy))
print("Self-labeled Test F1 macro score = {}".format(f1))