In [1]:
import torch
import torch.nn as nn

In [2]:
input_dim = 5
hidden_dim = 10
n_layers = 1

In [3]:
lstm_layer = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True)

In [4]:
batch_size = 1
seq_len = 1

inp = torch.randn(batch_size, seq_len, input_dim)
hidden_state = torch.randn(n_layers, batch_size, hidden_dim)
cell_state = torch.randn(n_layers, batch_size, hidden_dim)
hidden = (hidden_state, cell_state)

In [5]:
out, hidden = lstm_layer(inp, hidden)
print("Output shape: ", out.shape)
print("Hidden: ", hidden)

Output shape:  torch.Size([1, 1, 10])
Hidden:  (tensor([[[-0.2483, -0.0842, -0.2334,  0.0269, -0.0499,  0.1655, -0.3001,
           0.0145, -0.3391, -0.2217]]], grad_fn=<StackBackward>), tensor([[[-0.6272, -0.1336, -0.3057,  0.0446, -0.3471,  0.2164, -0.4301,
           0.0316, -1.2205, -0.2992]]], grad_fn=<StackBackward>))


In [6]:
seq_len = 3
inp = torch.randn(batch_size, seq_len, input_dim)
out, hidden = lstm_layer(inp, hidden)
print(out.shape)

torch.Size([1, 3, 10])


In [7]:
# Obtaining the last output
out = out.squeeze()[-1, :]
print(out.shape)

torch.Size([10])


In [8]:
# import data

import pandas as pd
import numpy as np
from collections import Counter
import nltk
nltk.download('punkt')

train = pd.read_csv('../data/processed/train.csv')
val = pd.read_csv('../data/processed/valid.csv')
test = pd.read_csv('../data/processed/test.csv')

[nltk_data] Downloading package punkt to /home/zqxh49/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Extracting text and labels from dataframe

train_labels = train.label.values
train_sentences = train.text.values

val_labels = val.label.values
val_sentences = val.text.values

test_labels = test.label.values
test_sentences = test.text.values

In [10]:
words = Counter()  # Dictionary that will map a word to the number of times it appeared in all the training sentences
for i, row in train.iterrows():
    # The sentences will be stored as a list of words/tokens
    train_sentences[i] = []
    sentence = row['text']
    for word in nltk.word_tokenize(sentence):  # Tokenizing the words
        words.update([word.lower()])  # Converting all the words to lowercase
        train_sentences[i].append(word)
    if i%20000 == 0:
        print(str((i*100)/len(train)) + "% done")
print("100% done")

0.0% done
36.08870603944496% done
72.17741207888992% done
100% done


In [11]:
len(words)

56641

In [12]:
for i, row in val.iterrows():
    # The sentences will be stored as a list of words/tokens
    val_sentences[i] = []
    sentence = row['text']
    for word in nltk.word_tokenize(sentence):  # Tokenizing the words
        words.update([word.lower()])  # Converting all the words to lowercase
        val_sentences[i].append(word)
    if i%20000 == 0:
        print(str((i*100)/len(train)) + "% done")
print("100% done")

0.0% done
100% done


In [13]:
# Removing the words that only appear once
words = {k:v for k,v in words.items() if v>1}
# Sorting the words according to the number of appearances, with the most common word being first
words = sorted(words, key=words.get, reverse=True)
# Adding padding and unknown to our vocabulary so that they will be assigned an index
words = ['_PAD','_UNK'] + words
# Dictionaries to store the word to index mappings and vice versa
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

In [14]:
for i, sentence in enumerate(train_sentences):
    # Looking up the mapping dictionary and assigning the index to the respective words
    train_sentences[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]
    
for i, sentence in enumerate(val_sentences):
    # Looking up the mapping dictionary and assigning the index to the respective words
    val_sentences[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]

for i, sentence in enumerate(test_sentences):
    # For test sentences, we have to tokenize the sentences as well
    test_sentences[i] = [word2idx[word.lower()] if word.lower() in word2idx else 0 for word in nltk.word_tokenize(sentence)]

In [15]:
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

seq_len = 100  # The length that the sentences will be padded/shortened to

train_sentences = pad_input(train_sentences, seq_len)
val_sentences = pad_input(val_sentences, seq_len)
test_sentences = pad_input(test_sentences, seq_len)

# Converting our labels into numpy arrays
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)
test_labels = np.array(test_labels)

In [16]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

train_data = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
val_data = TensorDataset(torch.from_numpy(val_sentences), torch.from_numpy(val_labels))
test_data = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))

BATCH_SIZE = 64

train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_data, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_data, shuffle=True, batch_size=BATCH_SIZE)

In [17]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [18]:
class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        out = self.fc(out)
        out = self.sigmoid(out)
        
        out = out.view(batch_size, -1)
        out = out[:,-1]
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        return hidden

In [19]:
vocab_size = len(word2idx) + 1
output_size = 1
embedding_dim = 300
hidden_dim = 512
n_layers = 2

model = SentimentNet(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
model.to(device)

lr=0.005
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [28]:
N_EPOCHS = 5
counter = 0
print_every = 100
clip = 5
valid_loss_min = np.Inf

best_valid_loss = float('inf')

def train_func(sub_train_):
    # Train the model
    train_losses = []
    num_correct = 0

    model.train()
    
    for inputs, labels in sub_train_:
        batch_size = inputs.shape[0]
        h = model.init_hidden(batch_size)
        
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        pred = torch.round(output.squeeze())  # Rounds the output to 0/1
        loss = criterion(output.squeeze(), labels.float())
        train_losses.append(loss.item())
        
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
    return np.mean(train_losses), num_correct/len(sub_train_.dataset)

def eval_func(sub_val_):
    val_losses = []
    num_correct = 0
    
    model.eval()
    
    for inputs, labels in sub_val_:
        batch_size = inputs.shape[0]
        h = model.init_hidden(batch_size)
        
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        pred = torch.round(output.squeeze())  # Rounds the output to 0/1
        val_loss = criterion(output.squeeze(), labels.float())
        val_losses.append(val_loss.item())
        pred = torch.round(output.squeeze())  # Rounds the output to 0/1
        correct_tensor = pred.eq(labels.float().view_as(pred))
        correct = np.squeeze(correct_tensor.cpu().numpy())
        num_correct += np.sum(correct)
        
    return np.mean(val_losses), num_correct/len(sub_val_.dataset)

In [29]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [30]:
for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train_func(train_loader)
    valid_loss, valid_acc = eval_func(val_loader)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 22s
	Train Loss: 0.0210 | Train Acc: 99.74%
	 Val. Loss: 0.018 |  Val. Acc: 99.75%
Epoch: 02 | Epoch Time: 0m 22s
	Train Loss: 0.0211 | Train Acc: 99.74%
	 Val. Loss: 0.018 |  Val. Acc: 99.75%
Epoch: 03 | Epoch Time: 0m 23s
	Train Loss: 0.0205 | Train Acc: 99.74%
	 Val. Loss: 0.018 |  Val. Acc: 99.75%
Epoch: 04 | Epoch Time: 0m 23s
	Train Loss: 0.0206 | Train Acc: 99.74%
	 Val. Loss: 0.020 |  Val. Acc: 99.75%
Epoch: 05 | Epoch Time: 0m 23s
	Train Loss: 0.0208 | Train Acc: 99.74%
	 Val. Loss: 0.018 |  Val. Acc: 99.75%


In [31]:
# Loading the best model
model.load_state_dict(torch.load('./tut4-model.pt'))

test_loss, test_acc = eval_func(test_loader)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 3.334 | Test Acc: 48.30%


####  Evaluation metrics for the “positive” class

In [35]:
print_every = 500

model.train()

for i in range(N_EPOCHS):
    for inputs, labels in train_loader:
        batch_size = inputs.shape[0]
        h = model.init_hidden(batch_size)
        
        counter += 1
        h = tuple([e.data for e in h])
        inputs, labels = inputs.to(device), labels.to(device)
        model.zero_grad()
        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        
        if counter%print_every == 0:
            val_losses = []
            model.eval()
            for inp, lab in val_loader:
                batch_size = inp.shape[0]
                val_h = model.init_hidden(batch_size)
                
                val_h = tuple([each.data for each in val_h])
                inp, lab = inp.to(device), lab.to(device)
                out, val_h = model(inp, val_h)
                val_loss = criterion(out.squeeze(), lab.float())
                val_losses.append(val_loss.item())
                
            model.train()
            print("Epoch: {}/{}...".format(i+1, N_EPOCHS),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))
            if np.mean(val_losses) <= valid_loss_min:
                torch.save(model.state_dict(), './state_dict.pt')
                print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
                valid_loss_min = np.mean(val_losses)

Epoch: 1/5... Step: 5500... Loss: 0.001968... Val Loss: 0.018549
Epoch: 1/5... Step: 6000... Loss: 0.004557... Val Loss: 0.017613
Validation loss decreased (0.018134 --> 0.017613).  Saving model ...
Epoch: 2/5... Step: 6500... Loss: 0.001969... Val Loss: 0.018123
Epoch: 2/5... Step: 7000... Loss: 0.002825... Val Loss: 0.017597
Validation loss decreased (0.017613 --> 0.017597).  Saving model ...
Epoch: 3/5... Step: 7500... Loss: 0.000987... Val Loss: 0.019129
Epoch: 4/5... Step: 8000... Loss: 0.099699... Val Loss: 0.018679
Epoch: 4/5... Step: 8500... Loss: 0.003545... Val Loss: 0.017617
Epoch: 5/5... Step: 9000... Loss: 0.001592... Val Loss: 0.018190
Epoch: 5/5... Step: 9500... Loss: 0.000787... Val Loss: 0.019168


In [146]:
# Loading the best model
model.load_state_dict(torch.load('./state_dict.pt'))

test_losses = []
num_correct = 0

model.eval()
for inputs, labels in test_loader:
    batch_size = inputs.shape[0]
    h = model.init_hidden(batch_size)
    h = tuple([each.data for each in h])
    inputs, labels = inputs.to(device), labels.to(device)
    output, h = model(inputs, h)
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    pred = torch.round(output.squeeze())  # Rounds the output to 0/1
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)

print("Test loss: {:.3f}".format(np.mean(test_losses)))
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}%".format(test_acc*100))

Test loss: 3.108
Test accuracy: 48.296%


In [172]:
class_correct = list(0. for i in range(2))
class_total = list(0. for i in range(2))
predictions, true_labels = [], []

model.eval()
for data in test_loader:
    texts, labels = data
    texts, labels = texts.to(device), labels.to(device)
    batch_size = texts.shape[0]
    h = model.init_hidden(batch_size)
    h = tuple([each.data for each in h])
    outputs, h = model(texts, h)
    predicted = torch.round(outputs.squeeze())
    
    # Store predictions and true labels
    predictions.append(predicted.detach().cpu().numpy())
    true_labels.append(labels.float().cpu().numpy())
    
    c = predicted.eq(labels.float().view_as(predicted))
    for i in range(batch_size):
        label = labels[i]
        class_correct[label] += c[i].item()
        class_total[label] += 1

In [173]:
classes = ('negative', 'positive')

In [174]:
for i in range(2):
    print('Accuracy of %5s : %2d %%' % (
        classes[i], 100 * class_correct[i] / class_total[i]))

Accuracy of negative : 100 %
Accuracy of positive :  0 %


In [175]:
# Combine the predictions for each batch into a single list of 0s and 1s.
predictions = [item for sublist in predictions for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

In [177]:
# Get positive labels
np_labels = np.array(true_labels)
pos_labels_idx = np.where(np_labels == 1)
pos_labels = np_labels[pos_labels_idx]

In [179]:
# Get prediction for corresponding label index
np_predictions = np.array(predictions)
pos_predictions = np_predictions[pos_labels_idx]

In [180]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score

# Calculate the F1 score for positive class 
f1_score(pos_predictions, pos_labels)

0.0