In [1]:
%matplotlib widget
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import os, glob

# Pick the compute device once: use CUDA when PyTorch can see a GPU,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [3]:
# Locate the dataset CSV inside the project's data directory.
# glob returns matches in OS-dependent order, so sort them to make the
# chosen file deterministic when more than one CSV is present.
csv_pattern = os.path.expanduser("~/Documents/projects/chatgpt-from-scratch/data/*.csv")
csv_paths = sorted(glob.glob(csv_pattern))
if not csv_paths:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(f"No CSV files found matching {csv_pattern}")

file = csv_paths[0]
# The first CSV column is used as the row index.
df = pd.read_csv(file, index_col=0)

In [4]:
# Distinct class labels in the "status" column, in order of first appearance.
pd.unique(df["status"])

array(['Anxiety', 'Normal', 'Depression', 'Suicidal', 'Stress', 'Bipolar',
       'Personality disorder'], dtype=object)

In [52]:
# Character counts of the 100 longest statements (missing statements are
# skipped), sorted ascending so the longest one is shown last.
(
    df["statement"]
    .dropna()
    .map(len)
    .sort_values()
    .tail(100)
)

39827     6349
10347     6358
48657     6366
17977     6419
38280     6421
         ...  
9416     25302
46660    26163
10834    27390
51396    27588
13293    32759
Name: statement, Length: 100, dtype: int64

In [5]:
# Keep only rows that have both a statement and a label. Missing statements
# are read by pandas as float NaN, which the character-level tokenizer cannot
# iterate over, so they must be removed before building the datasets.
labeled_df = df.dropna(subset=["statement", "status"])

statements = labeled_df["statement"].values
labels = labeled_df["status"].values

from sklearn.preprocessing import LabelEncoder

# Map each class name to an integer id; `label_encoder.classes_` keeps the
# id -> name mapping for later decoding.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [54]:
# Hold out 20% of the samples for validation; the fixed random_state keeps
# the split identical across runs.
(
    train_statements,
    val_statements,
    train_labels,
    val_labels,
) = train_test_split(
    statements,
    encoded_labels,
    test_size=0.2,
    random_state=42,
)

In [7]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing each text statement with its class label.

    Args:
        statements: Indexable collection of raw text statements.
        labels: Indexable collection of integer class ids, aligned
            index-for-index with ``statements``.
        tokenizer: Callable that turns one statement into a list of
            integer token ids.
        max_length: Maximum number of tokens kept per statement; longer
            token lists are truncated from the end.
    """

    def __init__(self, statements, labels, tokenizer, max_length=128):
        self.statements = statements
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.statements)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for sample ``idx`` as int64 tensors.

        ``token_ids`` is 1-D with at most ``max_length`` entries; ``label``
        is a 0-D tensor.
        """
        tokens = self.tokenizer(self.statements[idx])[: self.max_length]

        # Force an integer dtype: torch.tensor([]) would otherwise default to
        # float32 for an empty statement, which is not a valid index tensor
        # for an embedding lookup and mixes dtypes within a padded batch.
        token_ids = torch.tensor(tokens, dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return token_ids, label

def collate_fn(batch):
    """Merge ``(tokens, label)`` samples into one right-padded batch.

    Token sequences are padded with 0 up to the longest sequence in the
    batch and returned batch-first; the labels are stacked into one tensor.
    """
    token_seqs, label_seq = map(list, zip(*batch))
    padded_tokens = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    return padded_tokens, torch.stack(label_seq)

In [8]:
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence of embeddings.

    The table is stored as a non-trainable buffer ``pe`` of shape
    ``(max_len, 1, d_model)``. ``forward`` indexes it with ``x.size(0)``,
    so the FIRST dimension of the input is treated as the position axis,
    i.e. inputs must be laid out as ``(seq_len, batch, d_model)``.

    Args:
        d_model: Embedding width. Both even and odd values are supported.
        max_len: Number of positions in the precomputed table.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Even channels carry the sine, odd channels the cosine.
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd channel than even channels,
        # so trim div_term to fit instead of failing with a shape mismatch.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over batch.
        pe = pe.unsqueeze(1)
        # A buffer follows .to(device) and is saved in state_dict but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions.

        ``x`` must be ``(seq_len, batch, d_model)`` with ``seq_len <= max_len``.
        """
        return x + self.pe[:x.size(0), :]

class CustomTransformerModel(nn.Module):
    """Transformer-encoder text classifier over batch-first token-id input.

    Pipeline: embedding -> sinusoidal positions -> encoder stack ->
    mean pool over real (non-padding) tokens -> linear classification head.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / hidden width.
        nhead: Number of attention heads per encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        num_classes: Size of the output logit vector.
        pad_token_id: Token id used for padding. It is masked out of
            attention and pooling. Defaults to 0, the padding value used by
            this notebook's ``collate_fn``.
    """

    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_classes, pad_token_id=0):
        super().__init__()
        self.pad_token_id = pad_token_id
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        # Default layout (batch_first=False): the encoder consumes
        # (seq_len, batch, d_model); forward() transposes accordingly.
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, src):
        """Compute class logits.

        Args:
            src: Integer tensor of token ids, shape ``(batch, seq_len)``.

        Returns:
            Float tensor of logits, shape ``(batch, num_classes)``.
        """
        # True where a position is padding: (batch, seq_len).
        pad_mask = src == self.pad_token_id
        # If a row is entirely padding, masking every key would make the
        # attention softmax produce NaN; leave such rows unmasked instead.
        pad_mask = pad_mask & ~pad_mask.all(dim=1, keepdim=True)

        hidden = self.embedding(src) * math.sqrt(self.embedding.embedding_dim)
        # The positional encoder and the encoder stack both treat dim 0 as
        # the sequence axis, so switch from (batch, seq, d) to (seq, batch, d).
        # Without this, attention would run across samples, not across tokens.
        hidden = hidden.transpose(0, 1)
        hidden = self.pos_encoder(hidden)
        hidden = self.transformer_encoder(hidden, src_key_padding_mask=pad_mask)
        hidden = hidden.transpose(0, 1)  # back to (batch, seq, d)

        # Mean pool over real tokens only, so the amount of padding in a batch
        # does not change a sample's representation.
        keep = (~pad_mask).unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)
        return self.fc(pooled)


In [None]:
# Character-level placeholder tokenizer; swap in a real tokenizer as needed.
def simple_tokenizer(text, vocab_size=128):
    """Convert ``text`` to a list of integer ids, one per character.

    Each character maps to its Unicode code point. Code points that do not
    fit in the vocabulary (accents, emoji, ...) map to a reserved "unknown"
    id, ``vocab_size - 1``, so every returned id is a valid index into an
    embedding table with ``vocab_size`` rows.

    Args:
        text: String to tokenize.
        vocab_size: Size of the id space; the default covers 7-bit ASCII.

    Returns:
        List of ints, each in ``[0, vocab_size - 1]``.
    """
    unk_id = vocab_size - 1
    return [code if code < unk_id else unk_id for code in map(ord, text)]

# Create datasets
train_dataset = SentimentDataset(train_statements, train_labels, tokenizer=simple_tokenizer)
val_dataset = SentimentDataset(val_statements, val_labels, tokenizer=simple_tokenizer)

# Data loaders (only the training set is shuffled)
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)

# Seed PyTorch so weight initialisation and batch shuffling are repeatable
torch.manual_seed(42)

# Instantiate the model and move it to the selected device
vocab_size = 128  # Based on simple_tokenizer, you may need to adjust this based on your tokenizer
model = CustomTransformerModel(
    vocab_size=vocab_size,
    d_model=512,
    nhead=8,
    num_encoder_layers=6,
    num_classes=len(label_encoder.classes_),
).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    # Re-enable dropout etc. at the start of every epoch (eval() is set below).
    model.train()
    # Loop variables are named batch_* so they do not overwrite the
    # notebook-level `labels` array used by earlier cells.
    for batch_inputs, batch_labels in train_loader:
        # Batches are created on the CPU; move them next to the model.
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

    # Evaluate on the validation set after each epoch
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_inputs, batch_labels in val_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_inputs)
            val_loss += criterion(outputs, batch_labels).item()
            # Predicted class = index of the largest logit.
            predicted = outputs.argmax(dim=1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()

    # "Loss" here is the mean validation loss per batch.
    print(f'Epoch {epoch+1}, Loss: {val_loss/len(val_loader)}, Accuracy: {100 * correct / total}%')
