In [1]:
%matplotlib widget
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import os, glob

device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [30]:
torch.randint(low=0, high=100, size=(1, )).item()

24

In [3]:
file = glob.glob(os.path.expanduser("~/Documents/projects/chatgpt-from-scratch/data/*.csv"))[0]
df = pd.read_csv(file, index_col=0).dropna(how="any", axis=0)

In [35]:
temp = set()
for item in df["statement"].apply(set):
    temp = temp | item

In [60]:
encoder = {s:i for i, s in enumerate(temp)}

In [4]:
statements = df["statement"].values
labels = df["status"].values

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [10]:
train_statements, val_statements, train_labels, val_labels = train_test_split(statements, encoded_labels, test_size=0.2, random_state=42)

In [7]:
class SentimentDataset(Dataset):
    def __init__(self, statements, labels, tokenizer, max_length=128):
        self.statements = statements
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.statements)

    def __getitem__(self, idx):
        statement = self.statements[idx]
        label = self.labels[idx]
        tokens = self.tokenizer(statement)
        tokens = tokens[:self.max_length]
        tokens = torch.tensor(tokens)

        return tokens, torch.tensor(label)

def collate_fn(batch):
    tokens, labels = zip(*batch)
    tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return tokens_padded, labels

In [8]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

class CustomTransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_classes):
        super(CustomTransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.fc = nn.Linear(d_model, num_classes)
    
    def forward(self, src):
        src = self.embedding(src) * math.sqrt(self.embedding.embedding_dim)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = output.mean(dim=1)  # Global average pooling
        output = self.fc(output)
        return output


In [63]:
# Assume you have a tokenizer function
def simple_tokenizer(text):
    return [encoder[c] for c in text]  # Simple example: convert each character to its ASCII value

# Create datasets
train_dataset = SentimentDataset(train_statements, train_labels, tokenizer=simple_tokenizer)
val_dataset = SentimentDataset(val_statements, val_labels, tokenizer=simple_tokenizer)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)

In [64]:
for inputs, labels in train_loader:
    print(inputs.shape)
    print(inputs)
    print(labels.shape)
    print(labels)
    break

torch.Size([32, 128])
tensor([[148, 378, 380,  ...,   0,   0,   0],
        [357, 276, 389,  ..., 196, 389, 148],
        [389, 231, 262,  ...,   0,   0,   0],
        ...,
        [322, 378, 215,  ...,   0,   0,   0],
        [357, 389, 322,  ..., 389,  21, 294],
        [150, 294, 389,  ..., 389,  41, 380]])
torch.Size([32])
tensor([2, 5, 6, 6, 6, 4, 5, 5, 3, 6, 2, 3, 6, 2, 2, 2, 2, 3, 6, 2, 2, 0, 0, 6,
        2, 2, 2, 0, 2, 3, 6, 2])


In [65]:
inputs.max(axis=0)

torch.return_types.max(
values=tensor([389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389,
        389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389,
        389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389,
        389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389,
        389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389,
        389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389,
        389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389,
        389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389,
        389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389, 389,
        389, 389]),
indices=tensor([ 2,  3,  1,  8,  0,  1,  5,  0, 12, 10,  5,  0,  1,  3,  1,  4,  0,  9,
         4,  0,  9, 10,  7,  0,  4,  0, 15,  1, 18,  6,  4, 15,  0,  5,  1,  3,
         1,  0,  9,  3,  0,  9, 19,  7,  3,  0,  1,  4,  7,  3,  0, 28,  3,  0,

In [67]:
nn.Embedding(394, 512)(inputs)

tensor([[[ 1.8472, -0.4284,  0.3114,  ...,  0.1930, -1.3374, -0.6331],
         [-1.1831,  0.3195, -2.4341,  ...,  0.0198, -1.4056,  1.3602],
         [ 0.0649,  0.1676,  0.5911,  ...,  0.0372, -0.4619,  0.1664],
         ...,
         [ 0.5315, -2.1471,  0.3773,  ..., -0.7073, -0.5457, -0.4716],
         [ 0.5315, -2.1471,  0.3773,  ..., -0.7073, -0.5457, -0.4716],
         [ 0.5315, -2.1471,  0.3773,  ..., -0.7073, -0.5457, -0.4716]],

        [[-1.5461,  0.8485, -0.5387,  ...,  0.2516,  0.4907,  0.2928],
         [-0.7916, -0.0676, -0.4590,  ..., -0.0384,  0.2466,  0.8198],
         [-1.6470,  0.0416, -0.8102,  ...,  0.2648,  0.1906, -2.0705],
         ...,
         [-1.0920, -1.7428, -0.3774,  ...,  0.2005,  0.7057,  1.2666],
         [-1.6470,  0.0416, -0.8102,  ...,  0.2648,  0.1906, -2.0705],
         [ 1.8472, -0.4284,  0.3114,  ...,  0.1930, -1.3374, -0.6331]],

        [[-1.6470,  0.0416, -0.8102,  ...,  0.2648,  0.1906, -2.0705],
         [ 0.4037, -0.2812,  0.2534,  ...,  0

In [None]:
# Instantiate the model
vocab_size = 32759  # Based on simple_tokenizer, you may need to adjust this based on your tokenizer
model = CustomTransformerModel(vocab_size=vocab_size, d_model=512, nhead=8, num_encoder_layers=6, num_classes=len(label_encoder.classes_)).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs.to(device))
        loss = criterion(outputs, labels.to(device))
        loss.backward()
        optimizer.step()
    
    # Optional: Evaluate on the validation set after each epoch
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs.to(device))
            loss = criterion(outputs, labels.to(device))
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    print(f'Epoch {epoch+1}, Loss: {val_loss/len(val_loader)}, Accuracy: {100 * correct / total}%')
    model.train()