In [1]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, AdamW, get_scheduler
from datasets import load_dataset, Dataset
import pandas as pd
from tqdm.auto import tqdm

2024-09-04 17:21:01.235613: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-04 17:21:01.243058: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1725495661.250965  126413 cuda_dnn.cc:8322] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1725495661.253391  126413 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-04 17:21:01.262252: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
# Load the combined dataset (first CSV column becomes the index) and discard
# every row that has at least one missing value.
df = (
    pd.read_csv("data/Combined Data.csv", index_col=0)
    .dropna(how="any")
)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Turn the string values of the "status" column into integer class ids.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(df["status"])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
train_text, test_text, train_labels, test_labels = train_test_split(
    df["statement"],
    labels,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Wrap each split in a Hugging Face Dataset with "text" and "label" columns.
# The text Series still carries the shuffled index of the original frame;
# preserve_index=False keeps that index from being stored as an extra
# "__index_level_0__" column in the resulting datasets.
train_dataset = Dataset.from_pandas(
    pd.DataFrame({"text": train_text, "label": train_labels}),
    preserve_index=False,
)
test_dataset = Dataset.from_pandas(
    pd.DataFrame({"text": test_text, "label": test_labels}),
    preserve_index=False,
)

In [5]:
# Checkpoint to fine-tune. Alternative checkpoint (disabled):
# model_path = "nvidia/Llama-3.1-Minitron-4B-Width-Base"
model_path = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# When the tokenizer defines no pad token, reuse its end-of-sequence token
# for padding.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token



In [6]:
def preprocess_data(examples):
    """Tokenize the "text" entries of ``examples`` and carry the labels along.

    Every text is truncated and padded to exactly 256 tokens using the
    notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a "text" entry (passed to the tokenizer) and a
            "label" entry (copied through unchanged).

    Returns:
        The tokenizer output with an additional "label" entry.
    """
    encoded_batch = tokenizer(
        examples["text"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    encoded_batch["label"] = examples["label"]
    return encoded_batch

In [7]:
# Tokenize both splits in batched mode (train first, then test).
train_dataset, test_dataset = [
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, test_dataset)
]

Map:   0%|          | 0/42144 [00:00<?, ? examples/s]

Map:   0%|          | 0/10537 [00:00<?, ? examples/s]

In [8]:
# Expose only the columns the training loop reads, returned as torch tensors.
model_columns = ["input_ids", "attention_mask", "label"]
for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=list(model_columns))

In [9]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Derive one "balanced" weight per class from the training labels only.
train_labels = np.array(train_dataset["label"])
observed_classes = np.unique(train_labels)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=observed_classes,
    y=train_labels,
)

In [10]:
from torch.utils.data import DataLoader

batch_size = 16

# Training batches are drawn in shuffled order; evaluation keeps dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [11]:
# The classification head is newly initialised rather than loaded from the
# checkpoint, so seed torch's global RNG first to make that initialisation
# repeatable from run to run (42 matches the seed used for the data split).
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(label_encoder.classes_),  # one logit per encoded status class
    ignore_mismatched_sizes=False,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Run on the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# as_tensor accepts both the NumPy array produced by compute_class_weight and
# an already-converted tensor, so re-running this cell does not trigger the
# "copy construct from a tensor" warning that torch.tensor() would.
class_weights = torch.as_tensor(class_weights, dtype=torch.float, device=device)

# Cross-entropy with per-class weights.
criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

# transformers.AdamW is deprecated (and absent from recent transformers
# releases); use the PyTorch implementation instead. eps and weight_decay are
# set explicitly to the values transformers.AdamW used by default, because
# torch.optim.AdamW defaults to eps=1e-8 and weight_decay=1e-2.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

# Linear learning-rate schedule without warmup, spanning every optimizer step
# of the whole run (one step per training batch).
num_epochs = 5
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)



In [13]:
# Fine-tune for `num_epochs` passes over the training loader. After each pass
# the model is scored on the test loader and the mean batch loss and accuracy
# are printed.
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in progress_bar:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        # Labels are not passed to the model; the loss comes from the
        # notebook-level criterion applied to the raw logits.
        loss = criterion(outputs.logits, batch["label"])
        loss.backward()

        optimizer.step()
        lr_scheduler.step()  # the schedule advances once per optimizer step

        progress_bar.set_postfix({"loss": loss.item()})

    # ---- evaluation pass ----
    model.eval()
    total_eval_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in test_loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )
            logits = outputs.logits

            loss = criterion(logits, batch["label"])
            total_eval_loss += loss.item()

            # Predicted class = index of the largest logit.
            preds = logits.argmax(dim=-1)
            correct_predictions += (preds == batch["label"]).sum().item()

    # Loss is averaged over batches, accuracy over individual examples.
    avg_eval_loss = total_eval_loss / len(test_loader)
    accuracy = correct_predictions / len(test_dataset)

    print(f"Epoch {epoch+1}: Eval Loss = {avg_eval_loss:.4f}, Accuracy = {100 * accuracy:.2f}%")

print("Training complete.")

Epoch 1:   0%|          | 0/2634 [00:00<?, ?it/s]

Epoch 1: Eval Loss = 0.5226, Accuracy = 0.7831


Epoch 2:   0%|          | 0/2634 [00:00<?, ?it/s]

Epoch 2: Eval Loss = 0.4856, Accuracy = 0.7926


Epoch 3:   0%|          | 0/2634 [00:00<?, ?it/s]

Epoch 3: Eval Loss = 0.4856, Accuracy = 0.8288


Epoch 4:   0%|          | 0/2634 [00:00<?, ?it/s]

Epoch 4: Eval Loss = 0.5360, Accuracy = 0.8243


Epoch 5:   0%|          | 0/2634 [00:00<?, ?it/s]

Epoch 5: Eval Loss = 0.6147, Accuracy = 0.8306
Training complete.
