In [65]:
import torch
from torch.utils.data import DataLoader, random_split
import torchtext
import numpy as np

from Sentiment140 import SentimentDataset 

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Record the library versions (and the selected device) alongside the results.
print("torch:", torch.__version__, device)
print("torchtext", torchtext.__version__)

torch: 2.1.0+cu118 cuda
torchtext 0.16.1+cpu


In [66]:
# One-time setup: uncomment the line below to download spaCy's English model "en_core_web_sm".
#!python -m spacy download en_core_web_sm

In [67]:
# constants
SEED = 20
FILE_PATH = "Sentiment140/training.1600000.processed.noemoticon.csv"

# Seed every global RNG from the single SEED constant so that changing SEED
# really changes (and fixes) all sources of randomness used in this notebook.
torch.manual_seed(SEED)
np.random.seed(SEED)

The data (the Sentiment140 dataset) was retrieved from [Kaggle](https://www.kaggle.com/datasets/kazanova/sentiment140/).

In [68]:
# Load the dataset; count=100 presumably caps the number of rows read
# (the recorded run reports 100 samples) — confirm against SentimentDataset.
dataset = SentimentDataset(path=FILE_PATH, device=device, count=100, seed=SEED)
size = len(dataset)

# 70 % train, 15 % validation; the test split takes whatever is left after
# the two truncated fractions, so the three sizes always add up to `size`.
train_size = int(0.7 * size)
validation_size = int(0.15 * size)
test_size = size - (train_size + validation_size)

# random_split returns the subsets in the same order as the requested lengths,
# so the names on the left must follow the order train / test / validation.
train_dataset, test_dataset, validation_dataset = random_split(
    dataset,
    [train_size, test_size, validation_size],
)

print(f"Total Samples:    {size}")
print(f"Train (70%):      {train_size}")
print(f"Validation (15%): {validation_size}")
print(f"Test (15%):       {test_size}")

Total Samples:    100
Train (70%):      70
Validation (15%): 15
Test (15%):       15


In [69]:
# Mini-batch size shared by all three loaders.
BATCH_SIZE = 4

# Training: reshuffle every epoch and drop the trailing partial batch so every
# optimisation step sees a full batch.
train_dataloader = DataLoader(train_dataset, drop_last=True, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)

# Evaluation: keep a fixed order and keep the trailing partial batch so that
# metrics cover every held-out sample (with drop_last=True, 3 of the 15
# samples in each split would be silently ignored at this batch size).
# NOTE(review): if RNN.Classifier requires a fixed batch size, restore
# drop_last=True for these two loaders — confirm against the model code.
test_dataloader = DataLoader(test_dataset, drop_last=False, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
validation_dataloader = DataLoader(validation_dataset, drop_last=False, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [70]:
# Hyperparameters passed positionally to RNN.Classifier below.
INPUT_DIM = dataset.vocab_size  # vocabulary size as reported by the dataset object
EMBEDDING_DIM = 100  # presumably the width of each token embedding — confirm against RNN.Classifier
HIDDEN_DIM = 256  # presumably the recurrent hidden-state size — confirm against RNN.Classifier
OUTPUT_DIM = 1  # presumably a single output value per example — confirm against RNN.Classifier

# NOTE(review): project-local import placed mid-notebook; consider moving it to the import cell.
import RNN

# Instantiate the classifier from the hyperparameters above.
model = RNN.Classifier(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [71]:
def count_parameters(model):
    """Return the total number of scalar elements in the model's trainable parameters.

    Parameters whose ``requires_grad`` flag is off are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 111,805 trainable parameters


In [72]:
import torch.optim as optim
import torch.nn as nn

# Move the model and the loss module to the target device *before* building
# the optimizer, as recommended by the torch.optim documentation, so the
# optimizer is constructed over the parameters that will actually be trained.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [73]:
def binary_accuracy(preds, y):
    """
    Fraction of correct binary predictions in a batch (e.g. 8 of 10 right -> 0.8, not 8).

    preds: raw (pre-sigmoid) model outputs.
    y:     target labels, compared element-wise against the rounded sigmoid of preds.
    Returns a tensor holding the number of matches divided by len() of the comparison result.
    """
    # Squash the raw outputs with a sigmoid, then round to the nearest integer label.
    probabilities = torch.sigmoid(preds)
    predicted_labels = probabilities.round()

    # 1.0 where prediction and target agree, 0.0 elsewhere (float so it can be averaged).
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

In [77]:
def train(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader`` and return mean batch metrics.

    For every (text, label) batch the gradients are reset, the model output is
    squeezed along dimension 1, the loss and accuracy are computed, and one
    optimizer step is taken.

    Returns:
        (mean_loss, mean_accuracy): per-batch loss and accuracy, each summed
        over the batches and divided by ``len(dataloader)``.
    """
    model.train()  # switch the model to training mode

    loss_total = 0
    acc_total = 0

    for batch_text, batch_label in dataloader:
        optimizer.zero_grad()

        # Forward pass; squeeze(1) drops dimension 1 of the model output
        # (assumes the model emits shape (batch, 1) — TODO confirm).
        logits = model(batch_text).squeeze(1)
        batch_loss = criterion(logits, batch_label)
        batch_acc = binary_accuracy(logits, batch_label)

        # Backward pass and parameter update.
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(dataloader)
    return loss_total / n_batches, acc_total / n_batches

In [78]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and whole seconds.

    Returns a ``(minutes, seconds)`` tuple of ints; fractional seconds are truncated.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [79]:
# Number of full passes over the training data.
N_EPOCHS = 5

# NOTE(review): initialised but never updated in this cell; presumably meant for
# validation-based model selection — confirm it is used later, or remove it.
best_valid_loss = float('inf')

# Train for N_EPOCHS epochs, timing each one and printing its mean loss/accuracy.
for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # One optimisation pass over the training loader; returns mean batch loss and accuracy.
    train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # `epoch` is zero-based, so +1 gives a 1-based label; train_acc is a fraction, hence *100.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')


Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.683 | Train Acc: 58.82%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.686 | Train Acc: 57.35%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.681 | Train Acc: 58.82%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.678 | Train Acc: 60.29%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.678 | Train Acc: 60.29%
