# Import Libraries

In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import torch
from torchtext import data
from torchtext import datasets
import random
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import time
import datetime
import spacy
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import nltk
from nltk.tokenize import word_tokenize
from cleaning_tweets import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# RNN Set Up

In [2]:
# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Define the RNN model
class RNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        self.rnn = nn.LSTM(input_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        text = text.unsqueeze(1)  # Add dimension for batch_first=True in LSTM
        packed_output, (hidden, cell) = self.rnn(text)
        hidden = self.dropout(hidden[-1,:,:])  # Take the last layer's hidden state
        return self.fc(hidden)

In [4]:
# define the accuracy function
def categorical_accuracy(preds, y):
    top_pred = preds.argmax(1, keepdim=True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    return correct.float() / y.shape[0]

# define the training function
def train(model, iterator, optimizer, criterion):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        text, labels = batch
        text, labels = text.to(device), labels.to(device)
        predictions = model(text)
        loss = criterion(predictions, labels)
        acc = categorical_accuracy(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

# define the evaluation function

def evaluate(model, iterator, criterion):

        epoch_loss = 0
        epoch_acc = 0

        all_predictions = []
        all_labels = []

        model.eval()

        with torch.no_grad():
            for batch in iterator:
                text, labels = batch
                text, labels = text.to(device), labels.to(device)
                predictions = model(text)
                loss = criterion(predictions, labels)
                acc = categorical_accuracy(predictions, labels)
                epoch_loss += loss.item()
                epoch_acc += acc.item()

        precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=True)
        recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=True)
        f1 = f1_score(all_labels, all_predictions, average='weighted')

        return epoch_loss / len(iterator), epoch_acc / len(iterator), precision, recall, f1
# define the function to calculate the time elapsed

def epoch_time(start_time, end_time):

        elapsed_time = end_time - start_time

        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

        return elapsed_mins, elapsed_secs

# **Coronavirus Dataset**

In [5]:
# import the tweets datasets
covid_train = pd.read_csv('tweets dataset/Coronavirus tweets NLP - Text Classification/tweets_train.csv', encoding="ISO-8859-1")
covid_test = pd.read_csv('tweets dataset/Coronavirus tweets NLP - Text Classification/tweets_test.csv', encoding="ISO-8859-1")

In [6]:
texts_new_train = []
for t in covid_train.OriginalTweet:
    texts_new_train.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(emoji.demojize(t))))))

# Add the cleaned tweets to the dataframe
covid_train['cleaned_tweets'] = texts_new_train

In [7]:
texts_new_test = []
for t in covid_test.OriginalTweet:
    texts_new_test.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(emoji.demojize(t))))))

# Add the cleaned tweets to the dataframe
covid_test['cleaned_tweets'] = texts_new_test

In [8]:
covid_train['Sentiment'] = covid_train['Sentiment'].map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})
covid_test['Sentiment'] = covid_test['Sentiment'].map({'Extremely Negative':0,'Negative':0,'Neutral':1,'Positive':2,'Extremely Positive':2})

In [9]:
# Apply TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=25000)
tfidf_vectors_train = tfidf_vectorizer.fit_transform(covid_train['cleaned_tweets']).toarray()
tfidf_vectors_test = tfidf_vectorizer.transform(covid_test['cleaned_tweets']).toarray()

In [10]:
# Encode the labels
label_encoder = LabelEncoder()
covid_train['Sentiment'] = label_encoder.fit_transform(covid_train['Sentiment'])
covid_test['Sentiment'] = label_encoder.transform(covid_test['Sentiment'])

In [11]:
# Split the data
covid_X_train, covid_X_valid, covid_y_train, covid_y_valid = train_test_split(tfidf_vectors_train, covid_train['Sentiment'].values, test_size=0.2, random_state=42)

covid_X_test = tfidf_vectors_test
covid_y_test = covid_test['Sentiment'].values

In [12]:
# Convert to PyTorch tensors
covid_X_train = torch.tensor(covid_X_train, dtype=torch.float32)
covid_X_valid = torch.tensor(covid_X_valid, dtype=torch.float32)
covid_X_test = torch.tensor(covid_X_test, dtype=torch.float32)
covid_y_train = torch.tensor(covid_y_train, dtype=torch.long)
covid_y_valid = torch.tensor(covid_y_valid, dtype=torch.long)
covid_y_test = torch.tensor(covid_y_test, dtype=torch.long)

# Move tensors to the GPU if available
covid_X_train = covid_X_train.to(device)
covid_X_valid = covid_X_valid.to(device)
covid_X_test = covid_X_test.to(device)
covid_y_train = covid_y_train.to(device)
covid_y_valid = covid_y_valid.to(device)
covid_y_test = covid_y_test.to(device)

In [13]:
# Create DataLoader
BATCH_SIZE = 32

covid_train_data = torch.utils.data.TensorDataset(covid_X_train, covid_y_train)
covid_valid_data = torch.utils.data.TensorDataset(covid_X_valid, covid_y_valid)
covid_test_data = torch.utils.data.TensorDataset(covid_X_test, covid_y_test)

covid_train_iterator = torch.utils.data.DataLoader(covid_train_data, batch_size=BATCH_SIZE, shuffle=True)
covid_valid_iterator = torch.utils.data.DataLoader(covid_valid_data, batch_size=BATCH_SIZE)
covid_test_iterator = torch.utils.data.DataLoader(covid_test_data, batch_size=BATCH_SIZE)

In [14]:
# Hyperparameters
INPUT_DIM = covid_X_train.shape[1]
HIDDEN_DIM = 256
OUTPUT_DIM = 3  # Six classes
N_LAYERS = 2
DROPOUT = 0.5

In [15]:
# Initialize the model
model = RNN(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT)

# Move model to GPU
model = model.to(device)

# Define optimizer and loss function
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Move criterion to GPU
criterion = criterion.to(device)

In [16]:
# train the model

N_EPOCHS = 3

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

        start_time = time.time()

        train_loss, train_acc = train(model, covid_train_iterator, optimizer, criterion)
        valid_loss, valid_acc, _, _, _ = evaluate(model, covid_valid_iterator, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 16m 25s
	Train Loss: 0.715 | Train Acc: 68.66%
	 Val. Loss: 0.538 |  Val. Acc: 79.38%
Epoch: 02 | Epoch Time: 15m 51s
	Train Loss: 0.359 | Train Acc: 87.36%
	 Val. Loss: 0.515 |  Val. Acc: 80.74%
Epoch: 03 | Epoch Time: 16m 29s
	Train Loss: 0.230 | Train Acc: 92.29%
	 Val. Loss: 0.569 |  Val. Acc: 80.26%


In [17]:
# evaluate the model on the test set

test_loss, test_acc, test_precision, test_recall, test_f1 = evaluate(model, covid_test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
print(f'Test Precision: {test_precision:.3f} | Test Recall: {test_recall:.3f} | Test F1: {test_f1:.3f}')

Test Loss: 0.592 | Test Acc: 78.97%
Test Precision: 1.000 | Test Recall: 1.000 | Test F1: 0.000
