In [28]:
%matplotlib widget
import csv
import datetime
import glob
import math
import os

import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn

# Run on the GPU when PyTorch reports one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [2]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

In [3]:
# Locate the dataset CSV. glob() returns matches in a filesystem-dependent order,
# so the matches are sorted to make "the first file" deterministic, and an empty
# match fails with a readable message instead of a bare IndexError.
csv_paths = sorted(glob.glob(os.path.expanduser("~/Documents/projects/chatgpt-from-scratch/data/*.csv")))
if not csv_paths:
    raise FileNotFoundError("No CSV file found in ~/Documents/projects/chatgpt-from-scratch/data/")
file = csv_paths[0]

# The first CSV column becomes the index; rows with any missing value are dropped.
df = pd.read_csv(file, index_col=0).dropna(how="any", axis=0)

### Build tokenizer

In [None]:
# Train a BPE tokenizer on every statement in the dataset.
text = df["statement"].tolist()

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# NOTE(review): special tokens presumably receive ids in the order listed here
# ("[UNK]" -> 0 ... "[PAD]" -> 3); confirm with tokenizer.token_to_id("[PAD]").
trainer = BpeTrainer(vocab_size=30000, min_frequency=3, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.pre_tokenizer = Whitespace()

tokenizer.train_from_iterator(text, trainer)

# Save to the same path that `Tokenizer.from_file(...)` reads further down in this
# notebook. The previous "../data/..." target did not match that load path, so a
# retrained tokenizer was never the one picked up on a fresh run.
tokenizer.save("data/tokenizer-mental-health.json")

### Model

In [4]:
# Hyperparameters shared by the model definition and the training loop.
DROPOUT = 0.2    # dropout probability passed to each TransformerEncoderLayer
BATCH_SIZE = 32  # batch size actually used by the DataLoaders; also written to the training-log header (was 64, which misreported the runs)
D_MODEL = 16     # embedding / model width reported in the training-log header
FF_SIZE = 8      # dim_feedforward of each TransformerEncoderLayer
EPOCH = 50       # number of training epochs

In [5]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing raw text statements with integer class labels.

    Args:
        statements: Indexable collection of text statements.
        labels: Indexable collection of integer class ids aligned with ``statements``.
        tokenizer: Callable mapping one statement to a sequence of token ids.
        max_length: Maximum number of token ids returned per item.
    """

    def __init__(self, statements, labels, tokenizer, max_length=1000):
        self.statements = statements
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of statements."""
        return len(self.statements)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for item ``idx`` as int64 tensors.

        Statements longer than ``max_length`` tokens are cropped to a randomly
        placed contiguous window, so repeated reads of a long item can differ.
        """
        token_ids = self.tokenizer(self.statements[idx])
        overflow = len(token_ids) - self.max_length
        if overflow > 0:
            start = torch.randint(low=0, high=overflow + 1, size=(1, )).item()
            token_ids = token_ids[start:start + self.max_length]

        # Explicit dtype: torch.tensor([]) defaults to float32, which an embedding
        # lookup rejects, so an empty tokenisation must still yield an int64 tensor.
        tokens = torch.tensor(token_ids, dtype=torch.long)
        # CrossEntropyLoss expects int64 class indices regardless of the label's
        # original integer width.
        label = torch.tensor(self.labels[idx], dtype=torch.long)

        return tokens, label

def collate_fn(batch, padding_value=0):
    """Collate ``(tokens, label)`` pairs into one right-padded batch.

    Args:
        batch: Sequence of ``(tokens, label)`` pairs, where ``tokens`` is a 1-D
            tensor of token ids and ``label`` is a scalar tensor.
        padding_value: Token id used to pad shorter sequences up to the longest
            one in the batch. Defaults to 0, the value this function always used.
            NOTE(review): with the special-token order given to the BPE trainer in
            this notebook, id 0 is presumably "[UNK]" rather than "[PAD]"; pass
            ``functools.partial(collate_fn, padding_value=tokenizer.token_to_id("[PAD]"))``
            to the DataLoader (and retrain) to pad with the real pad token.

    Returns:
        ``(tokens_padded, labels)``: a ``(batch, longest_sequence)`` tensor of
        token ids and a ``(batch,)`` tensor of labels.
    """
    tokens, labels = zip(*batch)
    tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=padding_value)
    labels = torch.stack(labels)
    return tokens_padded, labels


class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to a sequence of embeddings.

    Args:
        d_model: Width of the embeddings. Odd widths are supported.
        max_len: Number of positions for which the encoding is precomputed.
        batch_first: When False (the default, and the original behaviour) the
            input is laid out as ``(seq_len, batch, d_model)``; when True it is
            laid out as ``(batch, seq_len, d_model)``.
    """

    def __init__(self, d_model, max_len=5000, batch_first=False):
        super(PositionalEncoding, self).__init__()
        self.batch_first = batch_first
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd column than even columns, so the
        # cosine half uses only the first d_model // 2 frequencies.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Stored as (max_len, 1, d_model) so it broadcasts over the batch axis of
        # a sequence-first input.
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``x`` plus the encoding of each of its sequence positions."""
        if self.batch_first:
            # (seq_len, 1, d_model) -> (1, seq_len, d_model) to broadcast over batch.
            return x + self.pe[:x.size(1)].transpose(0, 1)
        return x + self.pe[:x.size(0)]


class CustomTransformerModel(nn.Module):
    """Transformer-encoder text classifier with learned positional embeddings.

    Token and position embeddings are summed, passed through a stack of
    batch-first ``nn.TransformerEncoderLayer`` blocks, averaged over the sequence
    axis and projected to ``num_classes`` logits.

    Args:
        vocab_size: Number of rows in the token embedding table.
        d_model: Embedding / encoder width.
        nhead: Number of attention heads per encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        num_classes: Number of output logits.
        max_len: Number of positions in the positional embedding table, i.e. the
            longest sequence the model accepts. Defaults to the notebook-level
            ``max_length`` so existing calls keep working.
        dropout: Encoder-layer dropout. Defaults to the notebook-level ``DROPOUT``.
        dim_feedforward: Encoder feed-forward width. Defaults to the
            notebook-level ``FF_SIZE``.
    """

    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_classes,
                 max_len=None, dropout=None, dim_feedforward=None):
        super(CustomTransformerModel, self).__init__()
        # Resolve notebook-level defaults only when the caller did not pass a
        # value, so the class can also be built without those globals.
        if max_len is None:
            max_len = max_length
        if dropout is None:
            dropout = DROPOUT
        if dim_feedforward is None:
            dim_feedforward = FF_SIZE

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_embedding = nn.Embedding(max_len, d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, batch_first=True, dropout=dropout, dim_feedforward=dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, src):
        """Return class logits of shape ``(batch, num_classes)``.

        Args:
            src: Integer token ids indexed as ``(batch, seq_len)``.

        Raises:
            ValueError: If ``seq_len`` exceeds the positional embedding table.

        Note:
            No padding mask is applied: every position, padding included, takes
            part in attention and in the mean pooling.
        """
        seq_len = src.size(1)
        max_positions = self.positional_embedding.num_embeddings
        if seq_len > max_positions:
            # Fail early with a readable message instead of an out-of-range
            # embedding lookup.
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {max_positions}"
            )
        src_positions = torch.arange(0, seq_len, device=src.device).unsqueeze(0).expand(src.size(0), -1)
        src = self.embedding(src) + self.positional_embedding(src_positions)
        output = self.transformer_encoder(src)
        output = output.mean(dim=1)  # Global average pooling
        output = self.fc(output)
        return output

### Eval tokenizer

In [6]:
# Load the tokenizer from its saved JSON file.
tokenizer = Tokenizer.from_file("data/tokenizer-mental-health.json")

In [7]:
# Sanity check: display the tokens produced for a short phrase.
tokenizer.encode("on my gosh").tokens

['on', 'my', 'gosh']

#### Augment data to balance the classes

In [27]:
# Count the rows in each `status` group.
df.groupby("status").count()

Unnamed: 0_level_0,statement
status,Unnamed: 1_level_1
Anxiety,3841
Bipolar,2777
Depression,15404
Normal,16343
Personality disorder,1077
Stress,2587
Suicidal,10652


In [None]:
# Oversample under-represented classes by appending whole copies of their rows.
# A class is under-represented when it has fewer rows than the 70th percentile of
# the per-class row counts; it receives `threshold // size` extra copies, so it
# ends up with (threshold // size + 1) * size rows.
#
# NOTE: this cell reassigns `df`; re-running it without reloading the CSV
# oversamples the already-oversampled frame again.
# NOTE(review): the copies are created before the train/validation split — make
# sure the split keeps all copies of a statement on the same side.
threshold = df.groupby("status").count().quantile(0.7)
threshold_count = threshold.values[0]

extra_copies = []
for status in df["status"].unique():
    _data = df[df["status"] == status]
    if len(_data) < threshold_count:
        print(f"{status}, {len(_data)}")
        n = threshold_count // len(_data)
        extra_copies.extend([_data] * int(n))

# A single concat instead of one per copy: same rows in the same order, without
# re-copying the ever-growing frame on every iteration.
df = pd.concat([df, *extra_copies])

Anxiety, 3841
Suicidal, 10652
Stress, 2587
Bipolar, 2777
Personality disorder, 1077


In [10]:
# Count the rows in each `status` group again to inspect the class balance.
df.groupby("status").count()

Unnamed: 0_level_0,statement
status,Unnamed: 1_level_1
Anxiety,15364
Bipolar,13885
Depression,15404
Normal,16343
Personality disorder,11847
Stress,12935
Suicidal,21304


#### Build datasets

In [None]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder

# Size of the model's positional-embedding table. It matches the
# Embedding(1562, 16) stored in the saved checkpoint, so it must stay 1562 for
# that checkpoint to load. (A quantile-based value used to be computed on the
# line above and was immediately overwritten by this constant; that dead
# computation has been removed.)
max_length = 1562

statements = df["statement"].values
labels = df["status"].values

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split by statement text rather than by row: every copy of a given statement
# (e.g. rows duplicated by oversampling) lands on the same side of the split, so
# the validation set never contains text the model was trained on. test_size
# applies to distinct statements, so the row ratio is only approximately 80/20.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(statements, encoded_labels, groups=statements))

train_statements, val_statements = statements[train_idx], statements[val_idx]
train_labels, val_labels = encoded_labels[train_idx], encoded_labels[val_idx]

In [25]:
def simple_tokenizer(text):
    """Encode ``text`` with the notebook-level ``tokenizer`` and return its token ids."""
    return tokenizer.encode(text).ids

# Create datasets
train_dataset = SentimentDataset(train_statements, train_labels, tokenizer=simple_tokenizer)
val_dataset = SentimentDataset(val_statements, val_labels, tokenizer=simple_tokenizer)
# Every row (training and validation together).
whole_dataset = SentimentDataset(statements, encoded_labels, tokenizer=simple_tokenizer)

# Data loaders. The batch size comes from the shared BATCH_SIZE constant so that
# the value written to the training-log header is the one actually used.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
whole_loader = DataLoader(whole_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [16]:
# Instantiate the model
# 30000 is the vocab_size requested from BpeTrainer and the embedding size stored
# in the saved checkpoint; keep both in sync if the tokenizer is retrained.
vocab_size = 30000
# d_model comes from D_MODEL so the value written to the training-log header is
# the width the model is really built with.
model = CustomTransformerModel(vocab_size=vocab_size, d_model=D_MODEL, nhead=2, num_encoder_layers=4, num_classes=len(label_encoder.classes_)).to(device)

### Model training loop

In [None]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
# Halve the learning rate once the validation loss has stopped improving for
# more than `patience` epochs, never going below min_lr.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-8)

# Training loop
num_epochs = EPOCH

# `with` closes the log however the loop ends; newline="" is what the csv module
# expects for files it writes to.
with open("epoches.csv", "w", newline="") as log_file:
    writer = csv.writer(log_file)
    writer.writerow([f"Dropout: {DROPOUT}, Batch_size: {BATCH_SIZE}, D_model: {D_MODEL}, Feed forward: {FF_SIZE}"])

    try:
        for epoch in range(num_epochs):
            print(f"{datetime.datetime.now().strftime('%H:%M:%S %p')}: start training epoch {epoch+1}...")

            # ---- training pass ----
            model.train()
            total_losses = 0
            correct = 0
            total = 0
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_losses += loss.item()

                predicted = outputs.argmax(dim=1)
                correct += (predicted == labels).sum().item()
                total += labels.size(0)
            # Mean of the per-batch mean losses.
            training_loss = total_losses/len(train_loader)
            train_accuracy = 100 * correct / total

            # ---- validation pass ----
            model.eval()
            val_loss = 0
            correct = 0
            total = 0
            with torch.no_grad():
                for inputs, labels in val_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item()
                    predicted = outputs.argmax(dim=1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()

            # The scheduler monitors the summed validation loss.
            scheduler.step(val_loss)

            # Build the epoch summary once and send it to both the console and the log.
            summary = f'Epoch {epoch+1}, Training loss: {training_loss:.6f}, Train Accuracy: {train_accuracy:.2f}%; Validation Loss: {val_loss/len(val_loader):.6f}, Accuracy: {100 * correct / total:.2f}%, Learning rate: {scheduler.get_last_lr()[0]}'
            print(summary)
            writer.writerow([summary])
            # Flush so completed epochs survive an abrupt kernel shutdown.
            log_file.flush()
    finally:
        # Always keep the latest weights, including after an interruption or an
        # error part-way through training.
        torch.save(model.state_dict(), "model_state.pth")

### Model Evaluation

In [17]:
# map_location="cpu" lets a checkpoint saved from a GPU run load on a machine
# without CUDA; load_state_dict then copies the values into the model's own
# parameters, whichever device they live on.
model_state_dict = torch.load("model_state.pth", map_location="cpu", weights_only=True)
model.load_state_dict(model_state_dict)

<All keys matched successfully>

In [20]:
# Switch the model to evaluation mode; the cell output is the module's repr.
model.eval()

CustomTransformerModel(
  (embedding): Embedding(30000, 16)
  (positional_embedding): Embedding(1562, 16)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=8, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=8, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=16, out_features=7, bias=True)
)

In [21]:
# Move the model's parameters and buffers to the CPU.
model.to("cpu")

CustomTransformerModel(
  (embedding): Embedding(30000, 16)
  (positional_embedding): Embedding(1562, 16)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=16, out_features=16, bias=True)
        )
        (linear1): Linear(in_features=16, out_features=8, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=8, out_features=16, bias=True)
        (norm1): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((16,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=16, out_features=7, bias=True)
)

In [22]:
input_statement = "I feel tired all the time"

tokens = tokenizer.encode(input_statement)
# Shape the ids as a batch of one and place them on the device that holds the
# model weights, so this cell does not depend on an earlier `.to(...)` cell.
input_ids = torch.tensor(tokens.ids, dtype=torch.long).view(1, -1).to(next(model.parameters()).device)

# Inference only: disable dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
    res = model(input_ids)
_, label = torch.max(res, 1)
# Hand scikit-learn a NumPy array rather than a torch tensor.
result = label_encoder.inverse_transform(label.cpu().numpy())[0]
print(result)

Depression


#### Evaluate model performance

In [26]:
from tqdm.auto import tqdm

# Make the cell self-sufficient: evaluation mode (dropout off) and batches moved
# to whichever device currently holds the model.
model.eval()
eval_device = next(model.parameters()).device

# NOTE: whole_loader iterates over every row, training rows included, so this
# figure is not a held-out estimate. Statements longer than the dataset's token
# limit are cropped at a random offset, so the number can vary slightly per run.
correct, total = 0, 0
with torch.no_grad():
    for item, label_ in tqdm(whole_loader):
        item, label_ = item.to(eval_device), label_.to(eval_device)
        outputs = model(item)
        predicted = outputs.argmax(dim=1)
        total += label_.size(0)
        correct += (predicted == label_).sum().item()

accuracy = correct / total * 100

print(f"Accuracy: {accuracy:.2f}%")

  0%|          | 0/1647 [00:00<?, ?it/s]

Accuracy: 89.35%
