# Installing Dependencies

In [1]:
!pip install datasets
!pip install huggingface_hub
!pip install ipywidgets
!pip install torch





# Logging in to Hugging Face

In [2]:
# The interactive Hugging Face Hub login below is currently commented out,
# so this cell does not authenticate. NOTE(review): presumably it is only
# needed for private or gated Hub resources -- confirm before re-enabling.
# from huggingface_hub import notebook_login

# notebook_login()
# Placeholder output so that running the cell still prints something.
print("hi there?")

hi there?


# Data Processing

In [None]:
import numpy as np
import os
import torch

from datasets import load_dataset

# Load the SST-2 dataset and keep its 'train' split for training and its
# 'validation' split as the held-out evaluation set.
dataset = load_dataset('sst2')

train_dataset = dataset['train']
test_dataset = dataset['validation']

# Maps each vocabulary word to its embedding vector (a float32 tensor).
embeddings_per_word = {}

# Each line of the embedding file is a word followed by its whitespace-
# separated vector components. The encoding is given explicitly so that the
# read does not depend on the platform's default text encoding.
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as file:
    for line in file:
        word, *embedding_str = line.split()
        embeddings_per_word[word] = torch.tensor(np.array(embedding_str, dtype='float32'))

# Summing the Embedding Vectors

In [None]:
import torch

# Run on the GPU when PyTorch reports CUDA as available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def sum_embedding_vectors_for(dataset, embeddings=None, target_device=None):
    """Turn every sentence of ``dataset`` into the sum of its word embeddings.

    Each entry's ``'sentence'`` is split on whitespace; words without an
    embedding are ignored, and entries with no known word at all are skipped
    (so the result may contain fewer rows than ``dataset``).

    Args:
        dataset: Iterable of mappings with a ``'sentence'`` string and a
            ``'label'`` convertible to ``int``.
        embeddings: Optional mapping from word to embedding tensor. Defaults
            to the notebook-level ``embeddings_per_word``.
        target_device: Optional device the returned tensors are moved to.
            Defaults to the notebook-level ``device``.

    Returns:
        A ``(X, Y)`` tuple: ``X`` stacks one summed embedding per kept entry,
        ``Y`` holds the matching integer labels. Both live on the target
        device.

    Raises:
        RuntimeError: If no entry contains a word with a known embedding.
    """
    # Fall back to the notebook globals only when the caller did not supply
    # explicit values, so existing one-argument calls behave as before.
    if embeddings is None:
        embeddings = embeddings_per_word
    if target_device is None:
        target_device = device

    X = []
    Y = []

    for entry in dataset:
        # Single pass: keep the vectors of the words that have an embedding.
        vectors = [embeddings[w] for w in entry['sentence'].split() if w in embeddings]

        if not vectors:
            continue

        X.append(torch.stack(vectors).sum(dim=0))
        Y.append(int(entry['label']))

    if not X:
        # torch.stack([]) would fail with an opaque message; fail clearly instead.
        raise RuntimeError('no sentence in the dataset contains a word with a known embedding')

    return torch.stack(X).to(target_device), torch.tensor(Y).to(target_device)
        
# Build the summed-embedding feature tensors and label tensors for the
# training split and for the held-out split.
X_train, y_train = sum_embedding_vectors_for(train_dataset)
X_test, y_test = sum_embedding_vectors_for(test_dataset)

# Sentiment Classifier

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class SentimentClassifier(nn.Module):
    """Feed-forward classifier with one hidden layer.

    Computes ``output(relu(fc(x)))``. The ReLU between the two linear layers
    is what lets the hidden layer add capacity: without it the two layers
    compose into a single affine map regardless of ``hidden_dim``.

    Args:
        embedding_dim: Size of each input feature vector.
        hidden_dim: Number of hidden units.
        output_dim: Number of output scores (one per class).
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.fc = nn.Linear(embedding_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return unnormalized class scores for a batch of inputs ``x``."""
        # Functional ReLU keeps the module's parameters and state_dict keys
        # exactly as they were (only 'fc.*' and 'output.*').
        hidden = torch.relu(self.fc(x))

        return self.output(hidden)

# Training Loop

In [None]:
# Model hyperparameters: embedding_dim is the input feature size passed to the
# classifier; output_dim is the number of class scores it produces.
embedding_dim = 100
hidden_dim = 256
output_dim = 2

model = SentimentClassifier(embedding_dim, hidden_dim, output_dim).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

num_epochs = 20
batch_size = 128
accuracies = []  # held-out accuracy (in percent) measured after each epoch

for epoch in range(num_epochs):
    model.train()

    # Stepping by batch_size up to len(X_train) also visits the final,
    # shorter batch, so no training sample is silently dropped.
    for start in range(0, len(X_train), batch_size):
        end = start + batch_size
        inputs = X_train[start:end]
        labels = y_train[start:end]

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()

    total = 0
    correct = 0

    with torch.no_grad():
        # Same stepping as above: every held-out sample is scored, including
        # those in the last partial batch.
        for start in range(0, len(X_test), batch_size):
            end = start + batch_size
            inputs = X_test[start:end]
            labels = y_test[start:end]

            outputs = model(inputs)
            predicted = outputs.argmax(dim=1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Report this epoch's own accuracy; the per-epoch history is kept in
    # `accuracies` for later inspection.
    accuracy = correct / total * 100
    accuracies.append(accuracy)
    print(f'Epoch {epoch + 1}/{num_epochs}: {accuracy:.2f}% accuracy')

# Saving the Model

In [None]:
import torch

# Persist the trained parameters. torch.save takes the object to serialize
# first and the destination second; saving the state_dict (rather than the
# whole module) keeps the checkpoint independent of the class's import path.
os.makedirs('models', exist_ok=True)  # the target directory may not exist yet
torch.save(model.state_dict(), "models/embedding_model_v2.pth")