In [None]:
!pip install tokenizers



In [None]:
import numpy as np
import torch
import torch.nn as nn
import itertools
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Colab-only: mount Google Drive at /content/drive. The data, embedding and
# checkpoint paths used later in this notebook are given relative to
# 'drive/MyDrive/...', so they presumably rely on the working directory being
# /content -- TODO confirm when running outside the default Colab setup.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Parse the GloVe text file: every line is "<token> <v1> <v2> ... <vN>",
# separated by single spaces.
#   vocab      -> list of tokens, in file order
#   embeddings -> list of float vectors, aligned with `vocab`
# The file is streamed line by line (instead of read() + split('\n')) so the
# raw text and an extra list of lines are never held in memory, and each line
# is split only once.
vocab, embeddings = [], []
with open('drive/MyDrive/csci_567_project/glove.6B.100d.txt', 'rt', encoding='utf8') as fi:
    for line in fi:
        line = line.rstrip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        i_word, *i_values = line.split(' ')
        vocab.append(i_word)
        embeddings.append([float(val) for val in i_values])

In [None]:
# Convert the parsed Python lists to NumPy arrays:
# vocab_npa holds the tokens, embs_npa holds one embedding row per token.
vocab_npa, embs_npa = np.asarray(vocab), np.asarray(embeddings)

In [None]:
# Reserve index 0 for the padding token and index 1 for the unknown-word token.
# The guard makes the cell safe to re-run: without it, every extra execution
# would insert another '[PAD]'/'[UNK]' pair and grow both arrays.
if vocab_npa[0] != '[PAD]':
    pad_emb_npa = np.zeros((1, embs_npa.shape[1]))            # all-zero vector for '[PAD]'
    unk_emb_npa = np.mean(embs_npa, axis=0, keepdims=True)    # mean GloVe vector for '[UNK]'

    # Both tokens go in front of the GloVe vocabulary, '[PAD]' first.
    vocab_npa = np.insert(vocab_npa, 0, ['[PAD]', '[UNK]'])

    # Prepend the two new rows so the embedding rows stay aligned with vocab_npa.
    embs_npa = np.vstack((pad_emb_npa, unk_emb_npa, embs_npa))

assert len(vocab_npa) == embs_npa.shape[0], 'vocabulary and embedding matrix are misaligned'

In [None]:
# Wrap the embedding matrix in a frozen nn.Embedding; row 0 is the padding row.
glove_weights = torch.from_numpy(embs_npa).float()
my_embedding_layer = torch.nn.Embedding.from_pretrained(
    glove_weights,
    freeze=True,
    padding_idx=0,
)

In [None]:
# Token -> row index in the embedding matrix (position of the token in vocab_npa).
word2idx = dict(zip(vocab_npa, range(len(vocab_npa))))

In [None]:
# Number of distinct tokens in the lookup table.
print(len(word2idx))

400002


In [None]:
# Load the training split; the 'Utterance' and 'Sentiment' columns are used below.
df = pd.read_csv('drive/MyDrive/csci_567_project/train_text.csv')

# Pre-tokenizer applied to every utterance: the Whitespace step runs first,
# then the Digits step with individual_digits=True (per the tokenizers docs
# this splits numbers into single-digit tokens -- confirm against the
# installed version).
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Digits, Whitespace
pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])

def sentiment_to_tensor(s: str):
    """Map a sentiment name to its integer class id.

    'positive' -> 0, 'neutral' -> 1, anything else -> 2.
    Despite the name, the return value is a plain int, not a tensor.
    """
    for class_id, sentiment_name in enumerate(('positive', 'neutral')):
        if s == sentiment_name:
            return class_id
    return 2

def utterance_split(u: str):
    """Pre-tokenize an utterance and return its pieces as lower-cased strings.

    Uses the notebook-level `pre_tokenizer`; only the text of each piece
    (element 0 of every entry it returns) is kept.
    """
    lowered_tokens = []
    for entry in pre_tokenizer.pre_tokenize_str(u):
        lowered_tokens.append(entry[0].lower())
    return lowered_tokens


# Build a two-column frame: 'data' holds the token list of each utterance,
# 'labels' holds the integer sentiment class.
# axis=1 is required so that `keys` become column names; without it the two
# Series are stacked end-to-end into one MultiIndex Series and the later
# df['data'] / df['labels'] lookups only work through partial-key indexing.
df = pd.concat(
    [df['Utterance'].map(utterance_split), df['Sentiment'].map(sentiment_to_tensor)],
    axis=1,
    keys=['data', 'labels'],
)

# Replace every token by its vocabulary index; index 1 is the '[UNK]' row and
# is used for tokens that are not in the GloVe vocabulary.
df['data'] = df['data'].map(lambda tokens: [word2idx.get(word, 1) for word in tokens])

In [None]:
# Tunable hyperparameter
batch_size = 16

class TextDataset(Dataset):
    """In-memory dataset of index-encoded utterances and their labels.

    Args:
        data: iterable of integer sequences (one list of token indices per
            utterance). Sequences may have different lengths.
        labels: iterable of integer class ids, one per sequence.

    Attributes:
        data: LongTensor of shape (num_samples, max_len); shorter sequences
            are right-padded with 0.
        labels: LongTensor of shape (num_samples,).

    Raises:
        ValueError: if `data` and `labels` do not have the same length.
    """

    def __init__(self, data, labels):
        # Use the `data` argument itself (not a notebook-level frame) so the
        # dataset always reflects what the caller passed in.
        sequences = [torch.as_tensor(seq, dtype=torch.long) for seq in data]
        self.labels = torch.LongTensor(list(labels))

        if len(sequences) != self.labels.size(0):
            raise ValueError(
                'data and labels must have the same length, got {} and {}'.format(
                    len(sequences), self.labels.size(0)))

        if sequences:
            # batch_first=True yields (num_samples, max_len) directly, so no
            # transpose is needed afterwards.
            self.data = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=0)
        else:
            # pad_sequence rejects an empty list; represent "no samples" explicitly.
            self.data = torch.empty((0, 0), dtype=torch.long)

    def __len__(self):
        return self.data.size(0)

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]

# Training dataset and loader. `df` is the processed training frame built
# above ('data' = token indices, 'labels' = class ids). Batches are reshuffled
# every epoch.
text_dataset = TextDataset(df['data'], df['labels'])
text_dataloader = DataLoader(text_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Model architecture parameters
vocab_size = len(word2idx.keys())   # GloVe vocabulary plus '[PAD]' and '[UNK]'
embedding_size = 100                # width of one GloVe vector (glove.6B.100d file)
padding_index = 0                   # row 0 of the embedding matrix is '[PAD]'
num_classes = 3                     # sentiment_to_tensor produces the ids 0, 1 and 2

# Tunable hyperparameters
hidden_layers = 1                   # number of stacked LSTM layers
hidden_layer_size = 256             # LSTM hidden size (per direction)
dropout_probability = 0.33          # dropout probability for the model's dropout layer
linear_output_size = 128            # width of the layer between the LSTM and the classifier
num_directions = 2                  # 2 -> bidirectional LSTM, anything else -> unidirectional
elu_alpha = 1                       # alpha of the ELU activation
learning_rate = 0.001               # optimizer learning rate
scheduler_gamma = 0.9               # only referenced by the commented-out ExponentialLR scheduler

class GLoVeLSTM(nn.Module):
    """Utterance classifier: pretrained embeddings -> (Bi)LSTM -> dropout -> Linear -> ELU -> Linear.

    Every constructor argument is optional. An argument left as ``None`` falls
    back to the corresponding notebook-level setting (``my_embedding_layer``,
    ``hidden_layer_size``, ``hidden_layers``, ``num_directions``,
    ``dropout_probability``, ``linear_output_size``, ``elu_alpha``,
    ``num_classes``, ``padding_index``), so ``GLoVeLSTM()`` builds the same
    layers, with the same parameter names, as before.

    Args:
        embedding_layer: ``nn.Embedding`` used to look up token vectors.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability applied to the sequence summary.
        linear_size: width of the layer between the LSTM and the classifier.
        activation_alpha: alpha of the ELU activation.
        n_classes: number of output classes.
        pad_index: token index used for right-padding the input batches.
    """

    def __init__(self, embedding_layer=None, hidden_size=None, num_layers=None,
                 bidirectional=None, dropout=None, linear_size=None,
                 activation_alpha=None, n_classes=None, pad_index=None):
        super(GLoVeLSTM, self).__init__()

        # Resolve defaults lazily so the notebook globals are only touched
        # when the caller did not supply a value.
        embedding_layer = my_embedding_layer if embedding_layer is None else embedding_layer
        hidden_size = hidden_layer_size if hidden_size is None else hidden_size
        num_layers = hidden_layers if num_layers is None else num_layers
        bidirectional = (num_directions == 2) if bidirectional is None else bool(bidirectional)
        dropout = dropout_probability if dropout is None else dropout
        linear_size = linear_output_size if linear_size is None else linear_size
        activation_alpha = elu_alpha if activation_alpha is None else activation_alpha
        n_classes = num_classes if n_classes is None else n_classes

        self.pad_index = padding_index if pad_index is None else pad_index
        self.bidirectional = bidirectional

        self.embedding = embedding_layer
        # The LSTM input width is taken from the embedding layer itself so the
        # two can never disagree.
        self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim, num_layers=num_layers,
                            hidden_size=hidden_size, bidirectional=bidirectional, batch_first=True)
        self.dropout = nn.Dropout(p=dropout)
        self.linear = nn.Linear(in_features=hidden_size * (2 if bidirectional else 1),
                                out_features=linear_size)
        self.activation = nn.ELU(alpha=activation_alpha)
        self.classifier = nn.Linear(in_features=linear_size, out_features=n_classes)

    def forward(self, x):
        """Return class logits of shape (batch, n_classes).

        Args:
            x: LongTensor of token indices, shape (batch, seq_len), right-padded
               with ``pad_index``.

        The sequence summary is the LSTM's final hidden state computed over
        the real tokens only. Reading the output at position -1 of a padded
        batch would instead pick a padding step for every utterance shorter
        than the longest one.
        """
        # Number of real tokens per row. Clamp to 1 because
        # pack_padded_sequence rejects zero-length sequences (all-padding row).
        # The lengths tensor must live on the CPU.
        lengths = (x != self.pad_index).sum(dim=1).clamp(min=1).cpu()

        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(packed)

        # h_n is (num_layers * num_directions, batch, hidden). The last entries
        # belong to the top layer: [-2] forward and [-1] backward when
        # bidirectional, otherwise just [-1].
        if self.bidirectional:
            summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            summary = h_n[-1]

        summary = self.dropout(summary)
        summary = self.linear(summary)
        summary = self.activation(summary)
        return self.classifier(summary)

# Instantiate the model and move its parameters to the selected device.
glove_lstm = GLoVeLSTM().to(device)

# Tunable Optimizer, Scheduler, and Loss Function
optimizer = torch.optim.AdamW(params=glove_lstm.parameters(), lr=learning_rate)
# scheduler = torch.optim.lr_scheduler.ExponentialLR(gamma=scheduler_gamma, optimizer=optimizer)
# NOTE(review): ignore_index=num_classes (= 3) never matches a label produced
# by sentiment_to_tensor (0, 1 or 2), so no sample is excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=num_classes)

In [None]:
# Tunable hyperparameters
num_epochs = 25

# np.Inf was removed in NumPy 2.0; np.inf is the supported spelling.
# (Not read by this loop -- kept for a validation/early-stopping step.)
valid_loss_min = np.inf

for epoch in range(num_epochs):
    train_loss = 0.0

    glove_lstm.train()
    for data, target in text_dataloader:
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)
        output = glove_lstm(data)

        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure below is a true per-sample average.
        train_loss += loss.item() * data.size(0)

    train_loss /= max(len(text_dataloader.dataset), 1)
    print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch + 1, train_loss))

In [None]:
# Load the development split and run it through the same preprocessing as the
# training data. NOTE: this rebinds `df`, so the training frame is no longer
# reachable under that name after this cell.
df = pd.read_csv('drive/MyDrive/csci_567_project/dev_text.csv')

from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Digits, Whitespace
pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])


# axis=1 is required so that `keys` become column names; without it the two
# Series are stacked end-to-end into one MultiIndex Series and the later
# df['data'] / df['labels'] lookups only work through partial-key indexing.
df = pd.concat(
    [df['Utterance'].map(utterance_split), df['Sentiment'].map(sentiment_to_tensor)],
    axis=1,
    keys=['data', 'labels'],
)

# Replace every token by its vocabulary index; index 1 is the '[UNK]' row and
# is used for tokens that are not in the GloVe vocabulary.
df['data'] = df['data'].map(lambda tokens: [word2idx.get(word, 1) for word in tokens])

In [None]:
# Development dataset and loader. Evaluation does not benefit from shuffling,
# so keep the original order to make batches reproducible between runs.
dev_dataset = TextDataset(df['data'], df['labels'])
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Evaluate the trained model on the development set: per-sample average loss
# and plain accuracy.
test_loss = 0.0
correct, total = 0, 0

# Switch off dropout once, and skip autograd bookkeeping during evaluation.
glove_lstm.eval()
with torch.no_grad():
    for data, label in dev_dataloader:
        data = data.to(device)
        label = label.to(device)

        output = glove_lstm(data)

        # criterion returns the batch mean; weight by batch size so that the
        # final division gives a per-sample average.
        loss = criterion(output, label)
        test_loss += loss.item() * data.size(0)

        predictions = torch.argmax(output, dim=1)
        correct += (predictions == label).sum().item()
        total += label.size(0)

if total == 0:
    print('Dev set is empty; nothing to evaluate.')
else:
    print('Test Loss: ' + str(test_loss / len(dev_dataloader.dataset)))
    print('Correct Guesses: ' + str(correct) + '/' + str(total) + ' -> Accuracy: ' + str(correct / total))

Test Loss: 1.0741448338984154
Correct Guesses: 470/1109 -> Accuracy: 0.4238052299368801


In [None]:
# Persist the trained model to Drive.
# NOTE(review): this pickles the whole module object, so loading the file
# later requires the GLoVeLSTM class definition to be available; saving
# glove_lstm.state_dict() is the more portable option -- left unchanged here
# because it would alter the on-disk format that existing loaders expect.
torch.save(glove_lstm, 'drive/MyDrive/csci_567_project/glove_lstm.pt')