In [1]:
%matplotlib widget
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import os, glob

device = "cuda" if torch.cuda.is_available() else "cpu"

In [1]:
import keras

In [2]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

In [3]:
file = glob.glob(os.path.expanduser("~/Documents/projects/chatgpt-from-scratch/data/*.csv"))[0]
df = pd.read_csv(file, index_col=0).dropna(how="any", axis=0)

In [5]:
encoder = {s:i+1 for i, s in enumerate(sorted(temp))}
decoder = {i:s for s, i in encoder.items()}

In [6]:
max_length = int(df["statement"].apply(len).quantile(0.9))

In [7]:
threshold = df.groupby("status").count().quantile(0.7)

In [8]:
threshold.values[0]

np.float64(11602.399999999996)

In [9]:
for status in df["status"].unique():
    _data = df[df["status"] == status]
    if len(_data) < threshold.values[0]:
        print(f"{status}, {len(_data)}")
        n = threshold.values[0] // len(_data)
        for _ in range(int(n)):
            df = pd.concat((df, _data))

Anxiety, 3841
Suicidal, 10652
Stress, 2587
Bipolar, 2777
Personality disorder, 1077


In [10]:
df.groupby("status").count()

Unnamed: 0_level_0,statement
status,Unnamed: 1_level_1
Anxiety,15364
Bipolar,13885
Depression,15404
Normal,16343
Personality disorder,11847
Stress,12935
Suicidal,21304


In [13]:
import torchtext

ModuleNotFoundError: No module named 'torchtext'

In [16]:
statements = df["statement"].values
labels = df["status"].values

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [17]:
train_statements, val_statements, train_labels, val_labels = train_test_split(statements, encoded_labels, test_size=0.2, random_state=42)

In [18]:
DROPOUT = 0.2
BATCH_SIZE = 32
D_MODEL = 128
EPOCH = 10

In [19]:
class SentimentDataset(Dataset):
    def __init__(self, statements, labels, tokenizer, max_length=1000):
        self.statements = statements
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.statements)

    def __getitem__(self, idx):
        statement = self.statements[idx]
        label = self.labels[idx]
        tokens = self.tokenizer(statement)
        if len(tokens) > self.max_length:
            i_start = torch.randint(low=0, high=len(tokens) - self.max_length + 1, size=(1, )).item()
            tokens = tokens[i_start:i_start+self.max_length]
        tokens = torch.tensor(tokens)

        return tokens, torch.tensor(label)

def collate_fn(batch):
    tokens, labels = zip(*batch)
    tokens_padded = pad_sequence(tokens, batch_first=True, padding_value=0)
    labels = torch.stack(labels)
    return tokens_padded, labels

In [20]:
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

class CustomTransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_classes):
        super(CustomTransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, batch_first=True, dropout=DROPOUT)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_encoder_layers)
        self.fc = nn.Linear(d_model, num_classes)
    
    def forward(self, src):
        src = self.embedding(src) * math.sqrt(self.embedding.embedding_dim)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = output.mean(dim=1)  # Global average pooling
        output = self.fc(output)
        return output


In [21]:
# Assume you have a tokenizer function
def simple_tokenizer(text):
    return [encoder[c] for c in text]  # Simple example: convert each character to its ASCII value

# Create datasets
train_dataset = SentimentDataset(train_statements, train_labels, tokenizer=simple_tokenizer)
val_dataset = SentimentDataset(val_statements, val_labels, tokenizer=simple_tokenizer)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, collate_fn=collate_fn)

In [22]:
# Instantiate the model
vocab_size = len(temp)  # Based on simple_tokenizer, you may need to adjust this based on your tokenizer
model = CustomTransformerModel(vocab_size=vocab_size, d_model=D_MODEL, nhead=8, num_encoder_layers=6, num_classes=len(label_encoder.classes_)).to(device)

In [27]:
model_state_dict = torch.load("../model_state.pth")
model.load_state_dict(model_state_dict)

  model_state_dict = torch.load("../model_state.pth")


<All keys matched successfully>

In [28]:
model.eval()

CustomTransformerModel(
  (embedding): Embedding(393, 128)
  (pos_encoder): PositionalEncoding()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=7, bias=True)
)

In [39]:
for inputs, labels in val_loader:
    print("".join([decoder[item.item()]for item in inputs[1]]))
    print(label_encoder.inverse_transform(labels[1].view(1, -1)))
    break

i think the wifi on my iphone is broken it will quot connect quot but when i actually have to use it that s another storyLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL

  y = column_or_1d(y, warn=True)


In [48]:
labels[0]

tensor(3)

In [57]:
model(inputs.to(device))

tensor([[-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.2657, -1.0721,  0.4873],
        [-0.5605, -1.1040,  0.9406,  0.9637, -2.265

In [56]:
labels

tensor([3, 3, 1, 2, 3, 6, 3, 3, 5, 2, 2, 4, 2, 2, 2, 2, 2, 6, 3, 0, 1, 2, 6, 2,
        0, 2, 3, 3, 2, 2, 4, 3])

In [44]:
label_encoder.inverse_transform([[3]])

  y = column_or_1d(y, warn=True)


array(['Normal'], dtype=object)