In [1]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from torchtext import data, datasets
from torchtext.data import Field, LabelField, BucketIterator, TabularDataset
from torchtext.vocab import Vocab
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from scipy.stats import spearmanr
from nltk import word_tokenize

In [18]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/sunniva/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
class LSTM_model(torch.nn.Module):

    def __init__(self, vocab_size, embedding_dim=105, hidden_size=179, output_dim=1, dropout_rate=0.58,
                 **kwargs):

        super(LSTM_model, self).__init__()

        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)

        self.lstm = torch.nn.LSTM(embedding_dim, hidden_size, **kwargs)

        self.dropout = torch.nn.Dropout(dropout_rate)

        self.linear = torch.nn.Linear(hidden_size, output_dim)

    def forward(self, tensor_batch):

        embedding_tensor = self.embedding(tensor_batch)

        dropout_embedding = self.dropout(embedding_tensor)

        out, (hidden_state, _) = self.lstm(dropout_embedding)

        hidden_squeezed = hidden_state.squeeze(0)

        assert torch.equal(out[-1, :, :], hidden_squeezed)

        return self.linear(hidden_squeezed)

In [22]:
def train_model(model, train_iter, optimizer):

    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in train_iter:

        optimizer.zero_grad()

        predictions = model(batch.Tweet).squeeze(1) # removing the extra dimension ([batch_size,1])

        loss_function = torch.nn.BCEWithLogitsLoss()

        loss = loss_function(predictions, batch.overall_label)  # batch loss

        predicted_classes = torch.round(torch.sigmoid(predictions))

        correct_preds = (predicted_classes == batch.overall_label).float()

        accuracy = correct_preds.sum() / len(correct_preds)


        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()  # add the loss for this batch to calculate the loss for whole epoch
        epoch_acc += accuracy.item()  # .item() tend to give the exact number from the tensor of shape [1,]



    return epoch_loss / len(train_iter), epoch_acc / len(train_iter)


In [23]:
def evaluate_model(model, val_test_iter, optimizer):

    total_loss = 0
    total_acc = 0

    # Two lists are used to calculate AUC score
    y_true = []
    y_pred = []
    y_pred_round = []

    model.eval()

    with torch.no_grad():

        for batch in val_test_iter:
            predictions = model(batch.Tweet).squeeze(1)

            loss_function = torch.nn.BCEWithLogitsLoss()

            loss = loss_function(predictions, batch.overall_label)

            predicted_classes = torch.sigmoid(predictions)
            y_pred.append(predicted_classes)

            pred_classes = torch.round(torch.sigmoid(predictions))
            y_pred_round.append(pred_classes)

            correct_predictions = (pred_classes == batch.overall_label).float()

            accuracy = correct_predictions.sum() / len(correct_predictions)

            total_loss += loss.item()
            total_acc += accuracy.item()
            y_true.append(batch.overall_label)

        return total_loss / len(val_test_iter), total_acc / len(val_test_iter), y_pred, y_true, y_pred_round

In [5]:
def get_auroc(truth, pred):
    assert len(truth) == len(pred)
    auc = roc_auc_score(truth.numpy(), pred.numpy())
    return auc

def spearman(x,y):
    return spearmanr(x,y)

In [6]:
seed = 42
source_folder = '../data/twitter_data/'
num_epochs = 100

In [7]:
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.determinstic = True

In [8]:
os.environ['PYTHONHASHSEED'] = str(seed)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # use 'cuda' if available else 'cpu

In [9]:
tweet = Field(tokenize=word_tokenize)
# tokenize text using word_tokenize and convert to numerical form using default parameters

In [10]:
label = LabelField(dtype=torch.float)
# useful for label string to LabelEncoding. Not useful here but doesn't hurt either

In [11]:
fields = [('Tweet', tweet), ('overall_label', label)]
# (column name,field object to use on that column) pair for the dictonary

In [12]:
train, test = TabularDataset.splits(path=source_folder, train='overall_label_dataset_train.csv', test='overall_label_dataset_test.csv',
                                             format='csv', fields=fields)

In [13]:
tweet.build_vocab(train)
label.build_vocab(train)

In [20]:
train_iter, test_iter = BucketIterator.splits((train, test), batch_sizes=(64, 64),
                                                            sort_key=lambda x: len(x.tweet),
                                                            sort_within_batch=False,
                                                            device=device)  # use the cuda device if available

In [15]:
vocab_size = len(tweet.vocab)
lr = 3e-4  # learning rate = 0.0003
model = LSTM_model(vocab_size)
optimizer = torch.optim.RMSprop(model.parameters(), lr=lr)

In [16]:
train_loss_list = []
test_loss_list = []

In [24]:
for epoch in range(num_epochs):
        train_loss, train_acc = train_model(model, train_iter, optimizer)
        testation_loss, testation_acc, y_pred, y_true, test_y_pred = evaluate_model(model, test_iter, optimizer)
        train_loss_list.append(train_loss)
        test_loss_list.append(testation_loss)
        print(
            f'''End of Epoch: {epoch + 1}  |  Train Loss: {train_loss:.3f}  |  testation Loss: {testation_loss:.3f}  |  Train Acc: {train_acc * 100:.2f}%  |  testation Acc: {testation_acc * 100:.2f}% ''')

AttributeError: 'Example' object has no attribute 'tweet'