# Installing Dependencies

In [128]:
!pip install datasets
!pip install huggingface_hub
!pip install ipywidgets
!pip install torch





# Logging into Huggingface

In [129]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Data Processing

In [130]:
import numpy as np
import os
import torch

from datasets import load_dataset

dataset = load_dataset('sst2')

train_dataset      = dataset['train']
test_dataset       = dataset['validation']

embeddings_per_word = {}

with open('glove.6B.100d.txt', 'r') as file: 
    for idx, line in enumerate(file):
        word, *embedding_str = line.split()
        embeddings_per_word[word] = torch.tensor(np.array(embedding_str, dtype='float32'))

Found cached dataset sst2 (/Users/thijsnulle/.cache/huggingface/datasets/sst2/default/2.0.0/9896208a8d85db057ac50c72282bcb8fe755accc671a57dd8059d4e130961ed5)
100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 586.97it/s]


# Summing the Embedding Vectors

In [133]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def sum_embedding_vectors_for(dataset):
    X = []
    Y = []
    
    for entry in dataset:
        words = [ w for w in entry['sentence'].split() if w in embeddings_per_word ]
        
        if len(words) == 0:
            continue
        
        embeddings = [ embeddings_per_word[w] for w in words ]
        embedding = torch.stack(embeddings).sum(dim=0)
        
        X.append(embedding)
        Y.append(int(entry['label']))
        
    return torch.stack(X).to(device), torch.tensor(Y).to(device)
        
X_train, y_train = sum_embedding_vectors_for(train_dataset)
X_test, y_test = sum_embedding_vectors_for(test_dataset)

# Sentiment Classifier

In [150]:
import torch
import torch.nn as nn
import torch.optim as optim

class SentimentClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(SentimentClassifier, self).__init__()

        self.fc = nn.Linear(embedding_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.fc(x)
        x = self.output(x)

        return x

# Training Loop

In [155]:
embedding_dim = 100
hidden_dim = 256
output_dim = 2

model = SentimentClassifier(embedding_dim, hidden_dim, output_dim).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

num_epochs = 20
batch_size = 128
accuracies = []

for epoch in range(num_epochs):
    model.train()

    num_batches = len(X_train) // batch_size
    for batch in range(num_batches):
        start = batch * batch_size
        end = start + batch_size
        inputs = X_train[start:end]
        labels = y_train[start:end]
        
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    model.eval()
    
    total = 0
    correct = 0
    
    with torch.no_grad():
        num_batches = len(X_test) // batch_size

        for batch in range(num_batches):
            start = batch * batch_size
            end = start + batch_size
            inputs = X_test[start:end]
            labels = y_test[start:end]

            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracies.append(correct / total * 100)
    accuracy = sum(accuracies) / len(accuracies)
    print(f'Epoch {epoch + 1}/{num_epochs}: {accuracy:.2f}% accuracy')

Epoch 1/20: 75.39% accuracy
Epoch 2/20: 74.35% accuracy
Epoch 3/20: 74.18% accuracy
Epoch 4/20: 74.15% accuracy
Epoch 5/20: 74.38% accuracy
Epoch 6/20: 74.41% accuracy
Epoch 7/20: 74.48% accuracy
Epoch 8/20: 74.63% accuracy
Epoch 9/20: 74.71% accuracy
Epoch 10/20: 74.62% accuracy
Epoch 11/20: 74.64% accuracy
Epoch 12/20: 74.64% accuracy
Epoch 13/20: 74.67% accuracy
Epoch 14/20: 74.58% accuracy
Epoch 15/20: 74.64% accuracy
Epoch 16/20: 74.56% accuracy
Epoch 17/20: 74.59% accuracy
Epoch 18/20: 74.65% accuracy
Epoch 19/20: 74.68% accuracy
Epoch 20/20: 74.59% accuracy


# Saving the Model

In [156]:
import torch

torch.save(model.state_dict(), 'models/embedding_model.pth')