<a href="https://colab.research.google.com/github/sydney-machine-learning/COVID19-antivaccine_sentiment/blob/main/LSTM%20on%20preprocessed%20data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
# Fixed seed so stochastic torch operations are repeatable between runs.
SEED = 1024
torch.manual_seed(SEED)
# Seed every visible GPU, not only the current one (silently ignored without CUDA).
torch.cuda.manual_seed_all(SEED)
# Deterministic cuDNN kernels; the autotuner is disabled as well because it may
# select different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

from torch.utils.data import TensorDataset, DataLoader, Dataset
import torchtext
from torchtext import data

import warnings
warnings.filterwarnings('ignore')
import operator

from sklearn.metrics import hamming_loss, jaccard_score, label_ranking_average_precision_score, f1_score
from tqdm._tqdm_notebook import tqdm_notebook as tqdm

In [None]:
import spacy
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from wordcloud import WordCloud
import re
import unicodedata
import nltk
import csv
import string
from textblob import TextBlob
from nltk.corpus import stopwords
!pip install demoji
nltk.download('all')
import demoji

Collecting demoji
  Downloading https://files.pythonhosted.org/packages/88/6a/34379abe01c9c36fe9fddc4181dd935332e7d0159ec3fae76f712e49bcea/demoji-0.4.0-py2.py3-none-any.whl
Collecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Installing collected packages: colorama, demoji
Successfully installed colorama-0.4.4 demoji-0.4.0
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[

In [None]:
from google.colab import drive
# Mount Google Drive; every data/model path used below lives under /content/gdrive.
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
# Load the merged tweet CSV (records are separated by '\r'), remove the stray '\n'
# characters, drop incomplete rows and discard the `label` column.
tweet_df = (
    pd.read_csv(
        '/content/gdrive/My Drive/data with no geo tag/merged_mar_to_aug.csv',
        lineterminator='\r',
    )
    .replace('\n', '', regex=True)
    .dropna()
    .drop(columns=['label'])
)
print(tweet_df.head())

                                               tweet sentiment_score    date
0   all respected members  a team of different co...     0.053333333  Apr-17
1  ok here’s a question   i believe we r gonna ov...            0.25  Apr-17
2  well  everywhere is talking about coronapeople...               0  Apr-17
3  like combined global corona infection dashboar...               0  Apr-17
4   they are already talking about how a corona v...              -1  Apr-17


In [None]:
class CustomLSTM(nn.Module):
    """Embedding -> 2-layer LSTM -> Linear(64) -> Linear(output_dim) classifier.

    Args:
        embedding_dim: Size of each token embedding.
        vocab: Object supporting ``len()``; must also expose ``.vectors`` when
            ``use_glove`` is true.
        hidden_dim: Hidden size of the LSTM.
        output_dim: Number of output logits.
        drop_prob: Dropout probability applied to the embeddings; a falsy value
            skips the dropout call in ``forward``.
        bidirectional: Passed to ``nn.LSTM``; only the literal ``True`` doubles the
            input width of the first linear layer.
        use_glove: Copy ``vocab.vectors`` into the embedding table and freeze it.
    """

    def __init__(self, embedding_dim, vocab, hidden_dim, output_dim, drop_prob, bidirectional = False, use_glove = True):
        super().__init__()

        # Token embedding table, optionally initialised from pre-trained vectors.
        self.embeddings = nn.Embedding(len(vocab), embedding_dim)
        if use_glove:
            table = self.embeddings.weight
            table.data.copy_(vocab.vectors)
            table.requires_grad = False
        self.drop_prob = drop_prob

        self.lstm = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_dim,
            num_layers = 2,
            batch_first = True,
            bidirectional = bidirectional,
        )

        # A bidirectional LSTM emits forward and backward features side by side.
        lstm_features = 2 * hidden_dim if bidirectional is True else hidden_dim
        self.lin = nn.Linear(lstm_features, 64)
        self.fc = nn.Linear(64, output_dim)
        self.dropout = nn.Dropout(p = drop_prob)

    def forward(self, sentence):
        """Return the raw logits for a batch of token-id sequences.

        ``sentence`` is laid out as [max_len, batch_size] and is transposed before
        the embedding lookup because the LSTM is configured with ``batch_first``.
        """
        # token_ids = [batch_size, max_len]
        token_ids = sentence.transpose(0, 1)

        # embed = [batch_size, max_len, embedding_dim]
        embed = self.embeddings(token_ids)
        if self.drop_prob:
            embed = self.dropout(embed)

        # lstm_out = [batch_size, max_len, 2*hidden_dim if bidirectional else hidden_dim]
        lstm_out, _ = self.lstm(embed)

        # Features of the final time step. NOTE: squeeze() also removes the batch
        # axis when batch_size == 1, so a single example yields a 1-D result.
        last_step = lstm_out[:, -1, :].squeeze()

        # [.., 64] -> [.., output_dim]
        return self.fc(self.lin(last_step))

In [None]:
def evaluation_metrics(actual_labels, pred_labels, threshold):
    """Compute multi-label metrics for predicted scores against true labels.

    Args:
        actual_labels: Binary indicator matrix of shape [n_samples, n_labels].
        pred_labels: Predicted scores of the same shape. The input is not modified.
        threshold: Scores greater than or equal to this value are counted as 1.

    Returns:
        Tuple ``(hamming_loss, jaccard_score, lrap, f1_macro, f1_micro)``. LRAP is
        computed on the raw scores; the other metrics use the thresholded labels.
    """
    # Work on a copy: binarising in place would both mutate the caller's data and
    # feed already-thresholded values to the ranking metric below.
    scores = np.array(pred_labels, dtype = float)
    int_pred_labels = (scores >= threshold).astype(int)

    ham_loss = hamming_loss(actual_labels, int_pred_labels)
    jacc_score = jaccard_score(actual_labels, int_pred_labels, average = 'samples')
    lrap = label_ranking_average_precision_score(actual_labels, scores)
    f1_macro = f1_score(actual_labels, int_pred_labels, average = 'macro')
    f1_micro = f1_score(actual_labels, int_pred_labels, average = 'micro')

    return ham_loss, jacc_score, lrap, f1_macro, f1_micro

In [None]:
# SECURITY NOTE: this checkpoint stores the whole pickled CustomLSTM object, and
# unpickling can execute arbitrary code -- only load files from a trusted source.
model = torch.load('/content/gdrive/MyDrive//model7LSTM.pth',map_location ='cpu')
# Put the network into inference mode so its Dropout layer is disabled and the
# predictions are deterministic. eval() returns the module, so the cell still
# displays the model summary.
model.eval()

CustomLSTM(
  (embeddings): Embedding(13666, 300)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True)
  (lin): Linear(in_features=128, out_features=64, bias=True)
  (fc): Linear(in_features=64, out_features=11, bias=True)
  (dropout): Dropout(p=0.65, inplace=False)
)

In [None]:
import spacy
# Only `spacy_en.tokenizer` is used in this notebook, so a blank English pipeline is
# enough: it needs no model download and does not rely on the 'en' shortcut name,
# which is not available in spaCy 3.
spacy_en = spacy.blank('en')

def tokenizer(tweet):
    tweet = re.sub(r'[\n]', ' ', tweet)
    return [tok.text for tok in spacy_en.tokenizer(tweet)]

# Field describing how the raw tweet text is processed: tokenised with `tokenizer`
# and lower-cased.
tweet_field = data.Field(tokenize = tokenizer, lower = True, sequential = True)

# Column -> field mapping, in CSV column order. Columns mapped to None are
# presumably skipped by torchtext -- TODO confirm.
# NOTE(review): the same CSV also carries a `label` column (dropped from tweet_df
# above) that is not listed here; verify the column order matches this mapping.
dataFields = [
    ("tweet", tweet_field),
    ("sentiment_score", None),
    ("date", None),
]

train_dataset = data.TabularDataset(
    path = '/content/gdrive/My Drive/data with no geo tag/merged_mar_to_aug.csv',
    format = 'csv',
    skip_header = True,
    fields = dataFields,
)

In [None]:
# Sanity check that the dataset object exists (this prints its default repr only).
print(train_dataset) 

<torchtext.data.dataset.TabularDataset object at 0x7f308c49fb10>


In [None]:
# Number of examples in train_dataset.
print("length of dataset:", len(train_dataset))

length of dataset: 199755


In [None]:
def check_coverage(vocab, embeddings_index):
    """Report how much of a vocabulary is covered by an embedding index.

    Args:
        vocab: Mapping ``word -> occurrence count``.
        embeddings_index: Container of known words (e.g. ``word -> vector`` dict).

    Returns:
        List of ``(word, count)`` pairs for the words missing from
        ``embeddings_index``, ordered by descending count.

    Prints the share of distinct words and the share of all word occurrences that
    have an embedding. Both are reported as 0 when the vocabulary is empty.
    """
    oov = {}
    covered_words = 0        # distinct words that have an embedding
    covered_occurrences = 0  # occurrences of words that have an embedding
    oov_occurrences = 0      # occurrences of words without an embedding
    for word in tqdm(vocab):
        count = vocab[word]
        if word in embeddings_index:
            covered_words += 1
            covered_occurrences += count
        else:
            oov[word] = count
            oov_occurrences += count

    # Guard the ratios so an empty vocabulary does not raise ZeroDivisionError.
    total_occurrences = covered_occurrences + oov_occurrences
    vocab_share = covered_words / len(vocab) if len(vocab) else 0.0
    text_share = covered_occurrences / total_occurrences if total_occurrences else 0.0

    print('Found embeddings for {:.2%} of vocab'.format(vocab_share))
    print('Found embeddings for  {:.2%} of all text'.format(text_share))
    # Ascending stable sort, then reversed: the most frequent missing words first.
    sorted_x = sorted(oov.items(), key = operator.itemgetter(1))[::-1]

    return sorted_x

def build_vocab(sentences, verbose = True):
    """Count word occurrences over an iterable of tokenised sentences.

    Args:
        sentences: Iterable of sentences, each an iterable of hashable tokens.
        verbose: Show a progress bar while counting.

    Returns:
        Dict mapping each word to the number of times it occurs.
    """
    vocab = {}
    for sentence in tqdm(sentences, disable = (not verbose)):
        for word in sentence:
            # Explicit default instead of a bare `except`, which would also hide
            # unrelated errors.
            vocab[word] = vocab.get(word, 0) + 1
    return vocab
    
def get_coefs(word, *arr):
    """Return ``(word, vector)`` where the remaining arguments form a float32 array."""
    vector = np.asarray(arr, dtype='float32')
    return (word, vector)


def load_embeddings(path):
    """Read a text embedding file into a ``{word: float32 vector}`` dict.

    Each line is stripped and split on single spaces; the first item is the word
    and the remaining items are its vector components.
    """
    embeddings = {}
    with open(path, encoding = "utf-8") as handle:
        for raw_line in handle:
            word, *components = raw_line.strip().split(' ')
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings


def build_matrix(word_index, path):
    """Build a ``(len(word_index) + 1, 300)`` embedding matrix.

    Row ``i`` holds the vector of the word mapped to index ``i`` in ``word_index``;
    words missing from the embedding file at ``path`` keep an all-zero row.
    """
    known_vectors = load_embeddings(path)
    n_rows = len(word_index) + 1
    embedding_matrix = np.zeros((n_rows, 300))
    for word, row in word_index.items():
        if word in known_vectors:
            embedding_matrix[row] = known_vectors[word]
    return embedding_matrix

In [None]:
# Location of the GloVe text file on the mounted Drive.
GLOVE_EMBEDDING_FILE = '/content/gdrive/My Drive/glove.840B.300d.txt'
# Parse the whole file into an in-memory {word: vector} dict (see load_embeddings).
glove_embeddings = load_embeddings(GLOVE_EMBEDDING_FILE)
print(f'loaded {len(glove_embeddings)} word vectors ')

loaded 2196008 word vectors 


In [None]:
# list(DataFrame) yields the column names; the second print shows the frame itself.
print(list(tweet_df))
print(tweet_df)

['tweet', 'sentiment_score', 'date']
                                                    tweet  ...    date
0        all respected members  a team of different co...  ...  Apr-17
1       ok here’s a question   i believe we r gonna ov...  ...  Apr-17
2       well  everywhere is talking about coronapeople...  ...  Apr-17
3       like combined global corona infection dashboar...  ...  Apr-17
4        they are already talking about how a corona v...  ...  Apr-17
...                                                   ...  ...     ...
199750    you are responsible to reject this  vaccine ...  ...  Aug-21
199751    flu vaccine is mandatory in both aged care a...  ...  Aug-21
199752  i will gladly take a covid vaccine in like 20 ...  ...  Aug-21
199753     fools  the us has tested enough  we need a ...  ...  Aug-21
199754   he knows exactly what he is doing   he makes ...  ...  Aug-21

[199755 rows x 3 columns]


In [None]:
# Whitespace-tokenise every tweet, count the words, then measure how much of that
# vocabulary is covered by the loaded GloVe vectors.
vocab = build_vocab([tweet.split() for tweet in tweet_df['tweet']])
oov = check_coverage(vocab, glove_embeddings)
# Ten most frequent words without a GloVe vector.
oov[:10]

HBox(children=(FloatProgress(value=0.0, max=193124.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=205940.0), HTML(value='')))


Found embeddings for 24.27% of vocab
Found embeddings for  94.11% of all text


[('covid', 72009),
 ('covid19', 20579),
 ('fauci', 4371),
 ('covidー19', 1194),
 ('icmr', 1191),
 ('\u2066', 956),
 ('covaxin', 944),
 ('coronavaccine', 819),
 ('hcq', 816),
 ('novavax', 716)]

In [None]:
# Build the vocabulary on the Field that `train_dataset` was actually created with.
# Field.build_vocab only counts dataset columns bound to the very Field object it is
# called on; a freshly created Field is bound to no column, which produces a
# vocabulary of special tokens only and turns every token into index 0 (<unk>).
TWEET = tweet_field
LABEL = data.Field(sequential = False, use_vocab = False)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
TWEET.build_vocab(train_dataset, vectors = 'glove.840B.300d')

# The loaded model looks tokens up by index in its own embedding table, so indices
# produced here are only meaningful if this is the vocabulary used during training.
# A size mismatch proves it is not; fail loudly instead of producing predictions
# from wrong (or out-of-range) embedding rows.
expected_vocab_size = model.embeddings.num_embeddings
if len(TWEET.vocab) != expected_vocab_size:
    raise ValueError(
        "Vocabulary built from this dataset has {} entries but the model's embedding "
        "table has {} rows; restore the vocabulary saved at training time instead of "
        "rebuilding it from the inference data.".format(len(TWEET.vocab), expected_vocab_size)
    )

.vector_cache/glove.840B.300d.zip: 2.18GB [06:59, 5.19MB/s]                            
100%|█████████▉| 2195562/2196017 [04:50<00:00, 8076.06it/s]

In [None]:
# Make the dataset's own field use the vocabulary built above.
tweet_field.vocab = TWEET.vocab
BATCH_SIZE = 1
n_label = 11
# NOTE(review): the model was loaded with map_location='cpu'; if CUDA is available
# the batches created here live on the GPU -- confirm both end up on one device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Predictions are later written back to the DataFrame by row position, so the
# iterator must yield examples in dataset order: training-mode defaults, sorting
# and shuffling are all switched off explicitly rather than left to defaults.
train_iter = data.BucketIterator(train_dataset,
                                batch_size = BATCH_SIZE,
                                sort_key = lambda x : len(x.tweet),
                                train = False,
                                sort = False,
                                sort_within_batch = False,
                                repeat = False,
                                shuffle = False,
                                device = device)

In [None]:
class BatchWrapper():
    """Adapt an iterable of attribute-style batches into ``(x, y)`` tuples.

    ``x`` is the batch attribute named ``x_var``. ``y`` stacks the attributes named
    in ``y_vars`` along dimension 1 as floats, or is a one-element zero tensor when
    ``y_vars`` is None.
    """

    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars

    def __len__(self):
        return len(self.dl)

    def __iter__(self):
        for batch in self.dl:
            inputs = getattr(batch, self.x_var)
            if self.y_vars is None:
                # No targets requested: emit a placeholder.
                targets = torch.zeros((1))
            else:
                columns = [getattr(batch, name) for name in self.y_vars]
                targets = torch.stack(columns, dim = 1).float()
            yield (inputs, targets)

In [None]:
# Inference only: y_vars is None, so each item is (tweet tensor, placeholder zero tensor).
train_dl = BatchWrapper(train_iter, "tweet", None)

In [None]:
# Peek at the token ids of the first ten batches.
i = 0
for i, (x, y) in enumerate(train_dl, start = 1):
    print(x.squeeze())
    if i == 10:
        break

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
# Run the classifier over every batch and collect one thresholded label vector per
# batch. Rows that cannot be classified get a vector of -1 placeholders.
predicted_labels = []
# 1-based iteration numbers that are skipped. NOTE(review): presumably rows the
# model failed on in an earlier run -- confirm they still apply to this dataset.
sus = [33484, 62825]
# Inputs must live on the same device as the model's weights.
model_device = next(model.parameters()).device
# Inference mode: disables Dropout so predictions are deterministic.
model.eval()
i = 0
# No gradients are needed for inference; skipping them saves time and memory.
with torch.no_grad():
    for X, y in train_dl:
        i += 1
        # An input without tokens has no "last time step" for the model to read,
        # so it is treated like the explicitly skipped rows.
        if i in sus or X.numel() == 0:
            predicted_labels.append([-1] * n_label)
            continue

        preds = model(X.to(model_device))
        # Sigmoid turns logits into per-label probabilities; threshold at 0.5.
        out = (torch.sigmoid(preds) >= 0.5).float()
        predicted_labels.append(out.cpu().numpy().tolist())
        if i % 100 == 0:
            print("{}/{} iterations done".format(i, len(train_dl)))

100/5508 iterations done
200/5508 iterations done
300/5508 iterations done
400/5508 iterations done
500/5508 iterations done
600/5508 iterations done
700/5508 iterations done
800/5508 iterations done
900/5508 iterations done
1000/5508 iterations done
1100/5508 iterations done
1200/5508 iterations done
1300/5508 iterations done
1400/5508 iterations done
1500/5508 iterations done
1600/5508 iterations done
1700/5508 iterations done
1800/5508 iterations done
1900/5508 iterations done
2000/5508 iterations done
2100/5508 iterations done
2200/5508 iterations done
2300/5508 iterations done
2400/5508 iterations done
2500/5508 iterations done
2600/5508 iterations done
2700/5508 iterations done
2800/5508 iterations done
2900/5508 iterations done
3000/5508 iterations done
3100/5508 iterations done
3200/5508 iterations done
3300/5508 iterations done
3400/5508 iterations done
3500/5508 iterations done
3600/5508 iterations done
3700/5508 iterations done
3800/5508 iterations done
3900/5508 iterations 

In [None]:
# Number of label vectors collected so far.
print(len(predicted_labels))
# till_now is a second name for the same list object, not a copy.
till_now = predicted_labels

5508


In [None]:
# Start the results table from the cleaned tweet frame. The original cell first
# re-read the CSV and then immediately overwrote that result with a frame derived
# from `tweet_df`, so the second read was dead work; the resulting frame is unchanged.
# NOTE(review): predictions are later written by row position; if dropna() removed
# rows from tweet_df, positions no longer line up with train_dataset -- verify.
classified_df = tweet_df.copy()

In [None]:
# Add one column per emotion label, pre-filled with the string placeholder "None".
for emotion in (
    'Optimistic',
    'Thankful',
    'Empathetic',
    'Pessimistic',
    'Anxious',
    'Sad',
    'Annoyed',
    'Denial',
    'Official report',
    'Surprise',
    'Joking',
):
    classified_df[emotion] = "None"
print(classified_df.head(5))

                                               tweet  ...  Joking
0  hi  – thanks for your leadership on the  covid...  ...    None
1  a vaccine against  covid19 could save millions...  ...    None
2   recovery for now is not based on covid 19 vac...  ...    None
3  first people injected as uk starts human trial...  ...    None
4  fast tracking innovation with access  we propo...  ...    None

[5 rows x 13 columns]


In [None]:
# Copy the predicted label vectors into the emotion columns of the first
# len(till_now) rows. Position k of every prediction belongs to the k-th emotion.
emotion_columns = ['Optimistic', 'Thankful', 'Empathetic', 'Pessimistic', 'Anxious', 'Sad',
                   'Annoyed', 'Denial', 'Official report', 'Surprise', 'Joking']
n_classified = len(till_now)
if n_classified > len(classified_df):
    raise IndexError("{} predictions for only {} rows".format(n_classified, len(classified_df)))

# One positional assignment per column. The chained form df[col].iloc[i] = value
# may write to a temporary copy, and setting cells one by one is very slow for
# hundreds of thousands of rows.
for label_pos, emotion in enumerate(emotion_columns):
    column_pos = classified_df.columns.get_loc(emotion)
    classified_df.iloc[:n_classified, column_pos] = [labels[label_pos] for labels in till_now]

In [None]:
# Show the first rows with their predicted emotion labels.
classified_df.head()

Unnamed: 0,tweet,sentiment_score,Optimistic,Thankful,Empathetic,Pessimistic,Anxious,Sad,Annoyed,Denial,Official report,Surprise,Joking
0,hi – thanks for your leadership on the covid...,0.1875,0,0,0,0,0,0,0,0,1,0,0
1,a vaccine against covid19 could save millions...,0.136364,0,0,0,0,0,0,0,0,1,0,0
2,recovery for now is not based on covid 19 vac...,0.0,0,0,0,0,0,0,0,0,1,0,0
3,first people injected as uk starts human trial...,0.125,0,0,0,0,0,0,0,0,0,0,1
4,fast tracking innovation with access we propo...,0.118182,0,0,0,0,0,0,0,0,0,0,1


In [None]:
# Alias only: rand_text refers to the same DataFrame object as classified_df (no copy).
rand_text = classified_df

In [None]:
# Inspect one random classified tweet. The upper bound is the number of rows that
# actually received predictions rather than a hard-coded row count.
rand_idx = np.random.randint(0, len(till_now))
print(rand_idx)
print(classified_df.iloc[rand_idx]['tweet'])
print(classified_df.iloc[rand_idx])

403
  open  friendly  relaxed  down to earth  wine drinking and food loving people  interested in history  culture  letterpress printing  coronavirus vaccine  carnival and outstanding football coaches  who stand up against discrimination and know how to party need  mainz  others don’t 
tweet                open  friendly  relaxed  down to earth  wine...
sentiment_score                                                0.345
Optimistic                                                         0
Thankful                                                           0
Empathetic                                                         0
Pessimistic                                                        0
Anxious                                                            0
Sad                                                                0
Annoyed                                                            0
Denial                                                             0
Official report        

TODO: Find a set of example tweets and put them into a table.
TODO: Determine which emotions, and which combinations of emotions, are expressed most often.

In [None]:
# Plot how many tweets were assigned each emotion.
# The emotion columns are named explicitly: slicing columns[2:] would also take in
# non-emotion columns such as `date`.
emotion_columns = ['Optimistic', 'Thankful', 'Empathetic', 'Pessimistic', 'Anxious', 'Sad',
                   'Annoyed', 'Denial', 'Official report', 'Surprise', 'Joking']
# Count the rows where a label equals 1. Unclassified rows (string "None") and
# skipped rows (-1) therefore do not distort the totals.
emotion_counts = (
    classified_df[emotion_columns]
    .apply(pd.to_numeric, errors = 'coerce')
    .eq(1)
    .sum()
)
categories = list(emotion_counts.index)
sns.set(font_scale = 2)

fig, ax = plt.subplots(figsize = (15, 8))
sns.barplot(x = emotion_counts.values, y = categories, orient = 'h', ax = ax)
ax.set_title("Distribution of Emotions", fontsize = 24)
ax.set_xlabel('Number of Tweets', fontsize = 18)
ax.set_ylabel('Emotion type', fontsize = 18)

#adding the text labels
rects = ax.patches
labels = emotion_counts.values
for rect, label in zip(rects, labels):
    # Bars are horizontal: the value is the bar's width, and the label is placed
    # just right of the bar end, vertically centred on the bar.
    ax.text(rect.get_width(), rect.get_y() + rect.get_height() / 2, ' {}'.format(int(label)),
            ha = 'left', va = 'center', fontsize = 18)
plt.show()

KeyError: ignored

<Figure size 1080x576 with 0 Axes>