## Import library and set configuration

In [24]:
# for general data analysis and visualization
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# for text preprocessing 
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
import re

# for tokenization and train-test split
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# for word embedding and model building
import requests 
import shutil
from gensim.scripts.glove2word2vec import glove2word2vec  
from gensim.models import KeyedVectors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

# for evaluation on test data
from tensorflow.keras.models import load_model
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# for making prediction
!pip install snscrape
import snscrape.modules.twitter as sntwitter
import datetime as dt

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Marselo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Marselo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Marselo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Marselo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Marselo\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!




In [2]:
# Create a custom plotly theme and set it as default

pio.templates["custom"] = pio.templates["plotly_white"]
pio.templates["custom"].layout.margin = {'b': 25, 'l': 25, 'r': 25, 't': 75}
pio.templates["custom"].layout.width = 800
pio.templates["custom"].layout.height = 600
pio.templates["custom"].layout.autosize = False
pio.templates["custom"].layout.font.update({"family":"Arial", "size":14, "color":"#707070"})
pio.templates["custom"].layout.title.update({"xref":"container", "yref":"container","x":0.5, "yanchor":"top","font_size":20, "y":0.95, "font_color":"#353535"})
pio.templates["custom"].layout.xaxis.update({"showline":True, "linecolor":"lightgray","title_font_size":16})
pio.templates["custom"].layout.yaxis.update({"showline":True, "linecolor":"lightgray","title_font_size":16})
pio.templates["custom"].layout.colorway = ['#1F77B4', '#FF7F0E', '#54A24B', '#D62728', '#C355FA',
                                           '#8C564B', '#E377C2', '#7F7F7F',"#FFE323", '#17BECF']
pio.templates.default = "custom"

## Read dataset

In [4]:
df = pd.read_csv("../dataset/labeled_tweets.csv")
df = df.drop(columns=["username", "date"])
df.head()

Unnamed: 0,tweet,sentiment
0,@Roblox_RTC we want him back,Negative
1,@worldinpetals Can’t believe @tobyfox STOLE ph...,Negative
2,can't y'all just let us be happy and enjoy 2...,Negative
3,F4 Thailand is so promising waaah my heart ((...,Negative
4,@favstoubia I felt terrible for her If I was...,Negative


In [5]:
sentiment_ct = df.sentiment.value_counts()
fig = px.bar(sentiment_ct, text_auto=True, title="<b>Tweets Sentiment Distribution</b><br>"+
             "<span style='font-size:18'>The number of negative tweets are slightly more than the number of positive tweets</span>")
fig.update_layout(showlegend=False, xaxis_title=None, yaxis_title=None)
fig.update_traces(hovertemplate="Sentiment=%{x}<br>Count=%{y}<extra></extra>")
fig.show()

## Text preprocessing

In [6]:
stopwords = set()
with open('../static/en_stopwords.txt', 'r') as file:
    for word in file:
        stopwords.add(word.rstrip("\n"))
lemmatizer = WordNetLemmatizer()

In [7]:
def text_preprocessing(text):
    try:
        url_pattern = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
        user_pattern = r'@[^\s]+'
        entity_pattern = r"&.*;"
        neg_contraction = r"n't\W"
        non_alpha = "[^a-z]"
        cleaned_text = text.lower()
        cleaned_text = re.sub(url_pattern, "URL", cleaned_text)
        cleaned_text = re.sub(user_pattern, "USERNAME", cleaned_text)
        cleaned_text = re.sub(entity_pattern, " ", cleaned_text)
        cleaned_text = re.sub(neg_contraction, " not ", cleaned_text)
        cleaned_text = re.sub(non_alpha, " ", cleaned_text)
        tokens = nltk.word_tokenize(cleaned_text)
        # provide POS tag for lemmatization to yield better result
        word_tag_tuples = pos_tag(tokens, tagset='universal')
        tag_dict = {'NOUN':'n', 'VERB':'v', 'ADJ':'a', 'ADV':'r'}
        final_tokens = []
        for word, tag in word_tag_tuples:
            if len(word) > 1 and word not in stopwords:
                if tag in tag_dict:
                    final_tokens.append(lemmatizer.lemmatize(word, tag_dict[tag]))
                else:
                    final_tokens.append(lemmatizer.lemmatize(word))
        return " ".join(final_tokens)
    except:
        return np.nan

In [8]:
df['cleaned_tweet'] = df['tweet'].apply(text_preprocessing)

In [9]:
df.head()

Unnamed: 0,tweet,sentiment,cleaned_tweet
0,@Roblox_RTC we want him back,Negative,want back
1,@worldinpetals Can’t believe @tobyfox STOLE ph...,Negative,believe stole phone call fnaf
2,can't y'all just let us be happy and enjoy 2...,Negative,ca not all let u happy enjoy faves meeting kin...
3,F4 Thailand is so promising waaah my heart ((...,Negative,thailand promising waaah heart thyme domyouji ...
4,@favstoubia I felt terrible for her If I was...,Negative,felt terrible piss


In [10]:
df.to_csv("../dataset/cleaned_tweets.csv")

## Tokenization and train-val-test split

In [9]:
df = pd.read_csv('../dataset/cleaned_tweets.csv', index_col=0)

In [10]:
df.shape

(701651, 3)

In [11]:
df = df[df["cleaned_tweet"].notna() & (df["cleaned_tweet"] != "")]

# use 80% of the data for training, and set aside 20% for testing
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(index=train_df.index)

In [12]:
train_df.sentiment.value_counts()

Negative    282974
Positive    272622
Name: sentiment, dtype: int64

The amount between negative and positive tweets in train_df are relatively equal

In [13]:
# further split the training set into training and validation set
X = train_df.cleaned_tweet
y = train_df.sentiment
X_train, X_test, y_train, y_test=train_test_split(X, y, train_size=.8, random_state=42)

In [17]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(X_train)
nunique_words = len(tokenizer.word_index)+1
print('Number of Unique Words: {}'.format(nunique_words))

Number of Unique Words: 110883


In [18]:
# saving tokenizer
with open('../static/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
X_train = pad_sequences(tokenizer.texts_to_sequences(X_train))
X_test = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=X_train.shape[1])
y_train = y_train.replace({"Negative":0, "Positive":1})
y_test = y_test.replace({"Negative":0, "Positive":1})

## Word embedding and model building

In [20]:
# download pretrained GloVe (based on 2B tweets)
file_url = "https://nlp.stanford.edu/data/glove.twitter.27B.zip"    
!wget $file_url
!unzip /content/glove.twitter.27B.zip

--2023-01-12 06:10:05--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2023-01-12 06:10:05--  https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [application/zip]
Saving to: ‘glove.twitter.27B.zip’


2023-01-12 06:14:51 (5.09 MB/s) - ‘glove.twitter.27B.zip’ saved [1520408563/1520408563]

Archive:  /content/glove.twitter.27B.zip
  inflating: glove.twitter.27B.25d.txt  
  inflating: glove.twitter.27B.50d.txt  
  inflating: glove

In [21]:
# convert GloVe format into Word2Vec format
glove_file = '/content/glove.twitter.27B.200d.txt'
new_file = '/content/glove.twitter.27B.200d.w2v.txt'
glove2word2vec(glove_file, new_file)

(1193514, 200)

In [22]:
# use pretrained GloVe as embedding layer (transfer learning)
word2vec_model = KeyedVectors.load_word2vec_format('/content/glove.twitter.27B.200d.w2v.txt')
embedding_matrix = np.zeros((nunique_words, 200))
for word in tokenizer.word_index:
    if word in word2vec_model: 
        embedding_vector = word2vec_model[word]
        n = tokenizer.word_index[word]
        embedding_matrix[n] = embedding_vector

embedding_layer = Embedding(nunique_words,
                            200,
                            weights=[embedding_matrix],
                            input_length=X_train.shape[1],
                            trainable=False)

In [23]:
model = Sequential()
model.add(embedding_layer)
model.add(SpatialDropout1D(0.4))
model.add(LSTM(150, dropout=0.25, recurrent_dropout=0.25))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer="adam", metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 54, 200)           22176600  
                                                                 
 spatial_dropout1d (SpatialD  (None, 54, 200)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 150)               210600    
                                                                 
 dense (Dense)               (None, 1)                 151       
                                                                 
Total params: 22,387,351
Trainable params: 210,751
Non-trainable params: 22,176,600
_________________________________________________________________
None


In [24]:
es_callback = EarlyStopping(monitor='val_loss', patience=3)
mc_callback = ModelCheckpoint(
    filepath='../static/lstm_model-{epoch:02d}.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True)
history = model.fit(X_train, y_train, epochs = 10, validation_data = (X_test, y_test) ,callbacks=[es_callback, mc_callback], batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


Model stop training because val_loss didn't go below 0.4528 after 3 iterations

In [36]:
loss = history.history['loss']
acc = history.history['accuracy']
val_loss = history.history['val_loss']
val_acc = history.history['val_accuracy']

epochs = [n for n in range(1, len(loss) + 1)]

fig = px.line(x=epochs, y=[acc, val_acc], labels={"x":"Epoch", "value":"Accuracy"}, title="<b>Accuracy of Training and Validation Set</b>")
fig.data[0].name = "Training Set"
fig.data[1].name = "Validation Set"
fig.data[0].hovertemplate='Variable=Training Set<br>Epoch=%{x}<br>Accuracy=%{y}<extra></extra>'
fig.data[1].hovertemplate='Variable=Validation Set<br>Epoch=%{x}<br>Accuracy=%{y}<extra></extra>'
fig.show()

In [37]:
fig = px.line(x=epochs, y=[loss, val_loss], labels={"x":"Epoch", "value":"Loss"}, title="<b>Loss of Training and Validation Set</b>")
fig.data[0].name = "Training Set"
fig.data[1].name = "Validation Set"
fig.data[0].hovertemplate='Variable=Training Set<br>Epoch=%{x}<br>Loss=%{y}<extra></extra>'
fig.data[1].hovertemplate='Variable=Validation Set<br>Epoch=%{x}<br>Loss=%{y}<extra></extra>'
fig.show()

## Evaluation on test data

Choose model with lowest val_loss

In [18]:
model = load_model('../static/lstm_model-04.h5')

In [19]:
with open('../static/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [20]:
test_df.sentiment.value_counts()

Negative    71112
Positive    67787
Name: sentiment, dtype: int64

The amount of positive and negative tweets are relatively equal

In [21]:
sequences = pad_sequences(tokenizer.texts_to_sequences(test_df['cleaned_tweet']), maxlen=54)
score = model.predict(sequences)
test_df['score'] = score
test_df['pred_sentiment'] = test_df['score'].apply(lambda x: "Positive" if x >= 0.50 else "Negative")



In [22]:
accuracy_score(test_df['sentiment'], test_df['pred_sentiment'])

0.7836413509096538

The accuracy on testing data is similar to the accuracy of the validation data

In [23]:
tn, fp, fn, tp = confusion_matrix(test_df['sentiment'], test_df['pred_sentiment']).ravel()
tnr = tn / (tn + fp)
tpr = tp / (tp + fn)
print("True Negative Rate: {:.3f}".format(tnr))
print("True Positive Rate: {:.3f}".format(tpr))

True Negative Rate: 0.783
True Positive Rate: 0.784


The true negative rate is similar to the true positive rate