In [2]:
import os
import torch
from torch.optim import Adam
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import (BigBirdForSequenceClassification, BigBirdTokenizer
                          , Trainer, TrainingArguments, DataCollatorWithPadding
                         , logging)
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import sentencepiece
from datasets import load_metric
from tqdm import tqdm  # Import tqdm for the progress bar
from itertools import product
import glob
import numpy as np


In [7]:
# set paths
data_dir = '../../data/processed'
posts_equal_csv = os.path.join(data_dir,'aita_equal.csv')

# dump proportion of data (for testing, or if training would take too long with all data)
dump_data_prop = 0

model = 6
# param_search_dict = {
#     "classifier_dropout":[0,.5]
#     , "hidden_dropout":[0,.5]
#     , "weight_decay":[0,.5]
# }

# count = 0
# for i in product(param_search_dict["classifier_dropout"]
#                  ,param_search_dict["hidden_dropout"]
#                  ,param_search_dict["weight_decay"]):
#     if model == count:
#         classifier_dropout = i[0]
#         hidden_dropout = i[1]
#         weight_decay = i[2]
#     count += 1

# model_save_path=(
#     f'./saved_models/testModelB{model}_66000_cd{classifier_dropout}_hd{hidden_dropout}_wd{weight_decay}'
# )

model_dict = {
    "B1":[0,0,.5]
    , "A3":[.2,.2,.1]
    , "C1":[.2,.2,.5]
    , "3":[0,.2,.01]
    , "6":[.2,.2,0]
    , "7":[.2,.2,.01]
    , "A1":[0,.2,.1]
    , "B5":[.5,0,.5]
    , "2":[0,.2,0]
}

count = 0
for model_name, param_list in model_dict.items():

    if model == count:
        classifier_dropout = param_list[0]
        hidden_dropout = param_list[1]
        weight_decay = param_list[2]
        model_save_path=(
            f'../../data/saved_models/testModel{model_name}_132000_cd{classifier_dropout}_hd{hidden_dropout}_wd{weight_decay}'
    )
    count += 1

print(model_save_path)

../../data/saved_models/testModelA1_132000_cd0_hd0.2_wd0.1


In [8]:
# Load  from CSV
aita_df = pd.read_csv(posts_equal_csv)

In [9]:
if torch.cuda.is_available():
    device = torch.device("cuda")  # GPU
    print("GPU is available.")
else:
    device = torch.device("cpu")  # CPU
    print("GPU is not available. Using CPU.")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

GPU is available.


In [10]:
# Preprocessing

# recode flair
aita_df['label'] = [1 if x == 'YTA' else 0 for x in aita_df['link_flair_text']]

x = aita_df['title'] + ' ' + aita_df['selftext']
y = aita_df['label']

In [11]:
# split out some if not using whole dataset
if dump_data_prop > 0:
  X_use, X_dump, y_use, y_dump = train_test_split(
      x,y, stratify=y, test_size=dump_data_prop, random_state=42)
else:
  X_use = x
  y_use = y

# split out train from val+test
X_train, X_hold, y_train, y_hold = train_test_split(
    X_use,y_use, stratify=y_use, test_size=0.132865, random_state=42)

# split out val and test
X_val, X_test, y_val, y_test = train_test_split(
    X_hold,y_hold, stratify=y_hold, test_size=0.5, random_state=42)

# only train on part of the data for param testing
# X_train, X_dump2, y_train, y_dump2 = train_test_split(
#     X_train, y_train, stratify=y_train, test_size=0.5, random_state=42)

print(f"Train Size: {len(X_train)} Val Size: {len(X_val)} Test Size: {len(X_test)}")

Train Size: 132000 Val Size: 10113 Test Size: 10113


In [12]:
# load tokenizer and split data into train/validate/test
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
x_train_tokens = tokenizer(list(X_train), padding=True, truncation=True, return_tensors="pt")
x_val_tokens = tokenizer(list(X_val), padding=True, truncation=True, return_tensors="pt")
x_test_tokens = tokenizer(list(X_test), padding=True, truncation=True, return_tensors="pt")

y_train_tensor = torch.tensor(list(y_train), dtype=torch.float)
y_val_tensor = torch.tensor(list(y_val), dtype=torch.float)
y_test_tensor = torch.tensor(list(y_test), dtype=torch.float)


display(X_train[0:2])

7026     AITA for calling my grandma an alcoholic? Grow...
74236    WIBTA if I confronted my best friend about her...
dtype: object

In [13]:
# Calculate the maximum and minimum sequence lengths
max_sequence_length = x_train_tokens['input_ids'].shape[1]
min_sequence_length = x_train_tokens['input_ids'].shape[1]

print(f"Maximum Sequence Length: {max_sequence_length}")
print(f"Minimum Sequence Length: {min_sequence_length}")

Maximum Sequence Length: 1653
Minimum Sequence Length: 1653


In [14]:
# prep data
train_dataset = TensorDataset(x_train_tokens['input_ids'], x_train_tokens['attention_mask'], y_train_tensor)
val_dataset = TensorDataset(x_val_tokens['input_ids'], x_val_tokens['attention_mask'], y_val_tensor)
test_dataset = TensorDataset(x_test_tokens['input_ids'], x_test_tokens['attention_mask'], y_test_tensor)

In [15]:
# Create the model
model = BigBirdForSequenceClassification.from_pretrained(
    'google/bigbird-roberta-base'
    , num_labels=1
    , classifier_dropout = classifier_dropout
    , hidden_dropout_prob = hidden_dropout
)
model = model.to(device)
optimizer = Adam(model.parameters(), lr=1e-5)
loss_fn = nn.BCEWithLogitsLoss()

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at google/bigbird-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# create accuracy metric
metric = load_metric("accuracy")

def compute_metrics(p):
    return metric.compute(predictions=p.predictions > 0.5, references=p.label_ids)

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir=model_save_path,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=1000,
    weight_decay=weight_decay,
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=33000,
    save_steps=5500,
    logging_steps=33000,
    load_best_model_at_end=False,
    save_total_limit=1,
    learning_rate=1e-5,
    fp16=True,
    report_to='none',
    seed=42
)

# trainer.log_metrics("train", compute_metrics)

print(model_save_path)

  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Downloading builder script: 4.21kB [00:00, 14.9MB/s]                   

../../data/saved_models/testModelA1_132000_cd0_hd0.2_wd0.1





In [17]:
def data_collector(features):
    batch = {}
    batch['input_ids'] = torch.stack([f[0] for f in features])
    batch['attention_mask'] = torch.stack([f[1] for f in features])
    batch['labels'] = torch.stack([f[2] for f in features])

    return batch

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collector,
    compute_metrics=compute_metrics
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [18]:
if glob.glob(f"{model_save_path}/checkpoint*/pytorch_model.bin"):
    resume = True
else:
    resume = False

# train the model
logging.set_verbosity_error()
trainer.train(resume_from_checkpoint = resume)

  3%|▎         | 10667/330000 [1:47:26<54:34:41,  1.63it/s]

KeyboardInterrupt: 