In [None]:
#Imports
from datasets import load_dataset,DatasetDict
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertModel, DataCollatorWithPadding, AutoModelForSequenceClassification, AdamW, get_scheduler, BertForSequenceClassification, pipeline
import tensorflow as tf
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
#import evaluate
from sklearn.metrics import accuracy_score, f1_score


#Might not be able to import evaluate (might not work on mimi)

SEED = 42

In [None]:
# Retrieve the AG News data and carve a validation split out of the training set.

TRAIN_SUBSET_SIZE = 20_000      # number of training examples kept after shuffling
VALIDATION_SUBSET_SIZE = 4_000  # number of validation examples kept after shuffling

train_dataset = load_dataset('ag_news', split='train')
test_dataset = load_dataset('ag_news', split='test')

# Hold out 10% of the official training split for validation (seeded for reproducibility).
split = train_dataset.train_test_split(test_size=0.1, seed=SEED)

train_dataset = split['train']
validation_dataset = split['test']

# Work on a seeded random subset only, to keep fine-tuning time manageable.
# NOTE(review): earlier comments mentioned 10k / 5k examples, but the code has always
# selected 20,000 / 4,000 -- confirm which sizes are intended.
train_dataset = train_dataset.shuffle(seed=SEED).select(range(TRAIN_SUBSET_SIZE))
validation_dataset = validation_dataset.shuffle(seed=SEED).select(range(VALIDATION_SUBSET_SIZE))

In [None]:
# Report the shape of each split, in the order: train, test, validation.

for _split_dataset in (train_dataset, test_dataset, validation_dataset):
    print(_split_dataset.shape)

In [None]:
# Bundle the three splits into a single DatasetDict so they can be processed together.
datasets = DatasetDict(
    train=train_dataset,
    validation=validation_dataset,
    test=test_dataset,
)
datasets

In [None]:
# Inspect the label schema: number of classes and the id <-> name lookup tables.

label_feature = datasets['train'].features['label']
num_labels = label_feature.num_classes
id2label = dict(enumerate(label_feature.names))            # class id   -> class name
label2id = {name: idx for idx, name in id2label.items()}   # class name -> class id

print(f"NUM_LABELS: {num_labels}")
print(f"ID2LABEL: {id2label}")
print(f"LABEL2ID: {label2id}")


In [None]:
# Print a few example text blurbs together with their labels.

import random
random.seed(SEED)

# Draw random row indices in the range [0, number of training examples).
EXAMPLE_INDICES = [random.randrange(len(datasets['train'])) for _ in range(3)]

for i in EXAMPLE_INDICES:
    # Index the row first: datasets['train']['text'][i] reads the whole column
    # on every iteration just to pick out one element.
    example = datasets['train'][i]
    text = example['text']
    label_id = example['label']
    label = id2label[label_id]
    print(f" TEXT[{i}]: {text}")
    print(f"LABEL[{i}]: {label} ({label_id})")
    print()


Begin the training process

In [None]:
# Load the pre-trained tokenizer and BERT model with a sequence-classification head.
MODEL_CHECKPOINT = "google-bert/bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    # Take the class count from the dataset instead of hard-coding 4.
    num_labels=num_labels,
    # Store the class names in the model config so that saved models and
    # pipelines can report names rather than generic LABEL_<n> identifiers.
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Pick the best available accelerator: CUDA (e.g. on mimi), then the Apple GPU (MPS), then CPU.
# Previously `device` was unconditionally overwritten with "mps", which discarded the
# CUDA/CPU detection and failed on machines without an Apple GPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(device)

model.to(device)

In [None]:
# Tokenize every split.

def _tokenize_batch(batch):
    """Tokenize the 'text' field of a batch of examples with truncation enabled."""
    return tokenizer(batch['text'], truncation=True)

# Tokenize in batches and drop the raw text column, then rename 'label' to 'labels'
# for multiclass fine-tuning.
tokenized_datasets = (
    datasets
    .map(_tokenize_batch, batched=True, remove_columns=['text'])
    .rename_column('label', 'labels')
)

# Have the datasets return PyTorch tensors.
tokenized_datasets.set_format(type='torch')

tokenized_datasets
# labels:         target class labels for classification, integers (0 to 3)
# input_ids:      tokenized representation of the text input
# attention_mask: binary mask (0s and 1s); 1 marks actual tokens, 0 marks padding

In [None]:
# Set up the data collator. It pads each batch only up to the longest sequence in that
# batch (rather than one global maximum length), which improves efficiency.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Build one DataLoader per split.

BATCH_SIZE = 32  # lower this (e.g. to 8) if the device runs out of memory

dataloaders = {
    dataset_type: DataLoader(
        dataset=tokenized_datasets[dataset_type],
        batch_size=BATCH_SIZE,
        # Shuffle only the training data. Validation/test order does not need to be
        # random, and a fixed order keeps evaluation batches (and therefore the
        # averaged per-batch loss) identical from run to run.
        shuffle=(dataset_type == 'train'),
        collate_fn=data_collator,
    )
    for dataset_type in ['train', 'validation', 'test']
}

In [None]:
# Define the optimizer and the learning-rate scheduler.

NUM_EPOCHS = 3        # number of passes over the training data the schedule is sized for
LEARNING_RATE = 5e-5  # standard learning rate for fine-tuning BERT

# Use PyTorch's AdamW: the transformers.AdamW class is deprecated (the original call had to
# silence its deprecation warning). eps=1e-6 keeps the default the transformers version used;
# weight_decay=0 disables weight decay, as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0, eps=1e-6)

# Linear decay of the learning rate to zero over all training steps, with no warm-up.
num_training_steps = len(dataloaders['train']) * NUM_EPOCHS
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# model.parameters(): optimizes all trainable parameters of the model.
# A learning-rate scheduler adjusts the learning rate during training.
# Accuracy and F1 are computed with scikit-learn (accuracy_score / f1_score), because the
# Hugging Face `evaluate` package may not be importable on mimi.

In [None]:
#training function
#trains a BERT model for one epoch on a classification dataset using gradient descent
#updates model parameters.

def train(model, dataloader):
    """Train `model` for one epoch over `dataloader` and return the epoch metrics.

    Uses the notebook-level `optimizer`, `lr_scheduler` and `device`.

    Metrics are computed with scikit-learn. The previous version called
    `accuracy_metric` / `f1_metric`, which are never defined in this notebook
    (the `evaluate` import is commented out), so it raised NameError.

    Args:
        model: model whose forward output exposes `.loss` and `.logits`.
        dataloader: iterable of dict batches of tensors containing a 'labels' entry.

    Returns:
        dict with keys 'loss' (mean of the per-batch losses), 'accuracy' and
        'f1' (macro-averaged).
    """
    # Switch to training mode (enables dropout).
    model.train()
    loss = 0
    all_predictions = []
    all_labels = []
    for batch in tqdm(dataloader):  # iterate over mini-batches with a progress bar
        # Clear old gradients; PyTorch accumulates them otherwise.
        optimizer.zero_grad()

        # Move all tensors of the batch to the selected device and run the forward pass.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)

        # Gradient descent step, then advance the learning-rate schedule.
        outputs.loss.backward()
        optimizer.step()
        lr_scheduler.step()

        # Accumulate the loss plus the predicted / true class ids for the epoch metrics.
        loss += outputs.loss.item()
        all_predictions.extend(torch.argmax(outputs.logits, dim=-1).tolist())
        all_labels.extend(batch['labels'].tolist())

    # Average loss over all batches.
    loss /= len(dataloader)
    accuracy = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions, average='macro')
    return {'loss': loss, 'accuracy': accuracy, 'f1': f1}


def trainfaster(model, dataloader):
    """Run one training epoch and score it with scikit-learn metrics.

    Uses the notebook-level `optimizer`, `lr_scheduler` and `device`.

    Args:
        model: model whose forward output exposes `.loss` and `.logits`.
        dataloader: iterable of dict batches containing a 'labels' tensor.

    Returns:
        dict with keys 'loss' (mean of the per-batch losses), 'accuracy' and
        'f1' (macro-averaged).
    """
    model.train()  # training mode
    running_loss = 0
    epoch_preds, epoch_targets = [], []

    for raw_batch in tqdm(dataloader):  # progress bar over the batches
        optimizer.zero_grad()  # drop gradients from the previous step

        # Move tensors to the device; leave any non-tensor values untouched.
        batch = {
            key: (value.to(device) if isinstance(value, torch.Tensor) else value)
            for key, value in raw_batch.items()
        }

        # Forward pass, backward pass, parameter update, learning-rate update.
        outputs = model(**batch)
        batch_loss = outputs.loss
        batch_loss.backward()
        optimizer.step()
        lr_scheduler.step()

        running_loss += batch_loss.item()

        # Highest-scoring class per example, collected on the CPU for the metrics.
        epoch_preds.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
        epoch_targets.extend(batch['labels'].cpu().numpy())

    return {
        'loss': running_loss / len(dataloader),
        'accuracy': accuracy_score(epoch_targets, epoch_preds),
        'f1': f1_score(epoch_targets, epoch_preds, average='macro'),
    }

In [None]:
#evaluate
# only measures performance without modifying the model.

def evaluate(model, dataloader):
    """Evaluate `model` on `dataloader` without updating its parameters.

    Uses the notebook-level `device`. Metrics are computed with scikit-learn. The
    previous version called `accuracy_metric` / `f1_metric`, which are never defined
    in this notebook (the `evaluate` import is commented out), so it raised NameError.

    NOTE(review): this function shares its name with the Hugging Face `evaluate`
    package; if that import is ever re-enabled, this definition will shadow it.

    Args:
        model: model whose forward output exposes `.loss` and `.logits`.
        dataloader: iterable of dict batches of tensors containing a 'labels' entry.

    Returns:
        dict with keys 'loss' (mean of the per-batch losses), 'accuracy' and
        'f1' (macro-averaged).
    """
    # Switch to evaluation mode.
    model.eval()
    loss = 0
    all_predictions = []
    all_labels = []
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in tqdm(dataloader):
            # Move the batch to the selected device and run the forward pass.
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            # Accumulate the loss plus the predicted / true class ids.
            loss += outputs.loss.item()
            all_predictions.extend(torch.argmax(outputs.logits, dim=-1).tolist())
            all_labels.extend(batch['labels'].tolist())

    # Average loss over all batches.
    loss /= len(dataloader)
    accuracy = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions, average='macro')
    return {'loss': loss, 'accuracy': accuracy, 'f1': f1}

def evaluatefaster(model, dataloader):
    """Score `model` on `dataloader` with scikit-learn metrics, without training.

    Uses the notebook-level `device`.

    Args:
        model: model whose forward output exposes `.loss` and `.logits`.
        dataloader: iterable of dict batches of tensors containing a 'labels' entry.

    Returns:
        dict with keys 'loss' (mean of the per-batch losses), 'accuracy' and
        'f1' (macro-averaged).
    """
    model.eval()  # evaluation mode
    running_loss = 0
    eval_preds, eval_targets = [], []

    with torch.no_grad():  # no gradient tracking during evaluation
        for raw_batch in tqdm(dataloader):
            batch = {key: value.to(device) for key, value in raw_batch.items()}
            outputs = model(**batch)
            running_loss += outputs.loss.item()

            # Highest-scoring class per example, collected on the CPU for the metrics.
            eval_preds.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
            eval_targets.extend(batch['labels'].cpu().numpy())

    return {
        'loss': running_loss / len(dataloader),
        'accuracy': accuracy_score(eval_targets, eval_preds),
        'f1': f1_score(eval_targets, eval_preds, average='macro'),
    }

Test and fine-tune:

In [None]:
#THIS IS CHAT GPTS TRAIN FUNCTION. I AM NOT USING IT

def chat_train(epochs=3):
    """Alternative training loop with per-epoch validation and early stopping.

    Uses the notebook-level `model`, `dataloaders`, `optimizer`, `lr_scheduler`
    and `device`.

    Fixes over the previous version:
      * the inner loop is now actually invoked (it was defined but never called);
      * `best_val_accuracy` is declared `nonlocal` (assigning it inside the inner
        function previously raised UnboundLocalError);
      * the batch key is 'labels', matching the renamed dataset column
        ('label' raised KeyError);
      * the displayed running loss is averaged over batches, not over samples.

    Args:
        epochs: maximum number of epochs to train for.
            NOTE(review): the notebook-level scheduler is sized for a fixed number
            of training steps -- confirm it matches `epochs` before raising this.

    Returns:
        The validation accuracy of the last epoch that did not decrease.
    """
    best_val_accuracy = 0

    def train2(epochs):
        nonlocal best_val_accuracy
        for epoch in range(epochs):
            model.train()
            total_loss = 0
            correct = 0
            total = 0
            num_batches = 0

            loop = tqdm(dataloaders['train'], leave=True)
            for batch in loop:
                batch = {k: v.to(device) for k, v in batch.items()}
                optimizer.zero_grad()

                outputs = model(**batch)
                loss = outputs.loss
                logits = outputs.logits

                loss.backward()
                optimizer.step()
                lr_scheduler.step()

                total_loss += loss.item()
                num_batches += 1

                # Compute running training accuracy.
                preds = torch.argmax(logits, dim=1)
                correct += (preds == batch["labels"]).sum().item()
                total += batch["labels"].size(0)

                loop.set_description(f"Epoch {epoch+1}")
                # outputs.loss is accumulated once per batch, so average over batches.
                loop.set_postfix(loss=total_loss / num_batches, acc=correct / total)

            # Validation
            model.eval()
            val_preds = []
            val_labels = []
            with torch.no_grad():
                for batch in dataloaders['validation']:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    outputs = model(**batch)
                    logits = outputs.logits

                    val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
                    val_labels.extend(batch["labels"].cpu().numpy())

            val_accuracy = accuracy_score(val_labels, val_preds)
            print(f"Validation Accuracy: {val_accuracy:.4f}")

            # Early stopping: stop as soon as validation accuracy drops.
            if val_accuracy < best_val_accuracy:
                print("Validation accuracy decreased, stopping training early.")
                break
            best_val_accuracy = val_accuracy

        print("Training complete!")

    train2(epochs)
    return best_val_accuracy


In [None]:
# Run the test data through the model before fine-tuning to get a baseline.
# The model was loaded from a pre-trained BERT checkpoint, but it has not been trained on this
# 4-class task yet, so its predictions are expected to be close to random.
# For a 4-class classification task that means accuracy of roughly 25%.

test_metrics = evaluatefaster(model, dataloaders['test'])
print(f"TEST ACCURACY: {test_metrics['accuracy']:.5f}", end=" ; ")
print(f"F1 (MACRO): {test_metrics['f1']:.5f}")

In [None]:
# Try the model on a few hand-written examples before fine-tuning.

model.eval()  # evaluation mode for stable inference
sentances = [
    "Breaking news: AI is taking over!",
    "The HABS beat the senators 10-1 in a close game!",
    "Is Tesla coming out with a new car?",
    "I love cats",
    "Soccer World Cup Final Tomorrow!",
]

# No gradients are needed for inference, so the whole loop runs under no_grad.
with torch.no_grad():
    for sentance in sentances:
        inputs = tokenizer(sentance, return_tensors="pt").to(device)
        outputs = model(**inputs)
        predicted_class = outputs.logits.argmax(dim=-1)
        print(f"text: {sentance} \npredicted label: {id2label[predicted_class.item()]}\n")


In [None]:
# Fine-tune the model.
# This was run separately on mimi and the result is stored in "saved_model", so it is
# disabled by default. Set RUN_FINE_TUNING = True to train again.
# (The code used to be disabled by wrapping it in a string literal, which Jupyter echoes
# as the cell output and which contained a stray "*/" that would not have run.)

RUN_FINE_TUNING = False

if RUN_FINE_TUNING:
    # 3 epochs, matching the number of epochs the learning-rate scheduler was sized for.
    for epoch in range(3):
        train_metrics = trainfaster(model, dataloaders['train'])
        validation_metrics = evaluatefaster(model, dataloaders['validation'])

        print(f"EPOCH {epoch+1}", end=" | ")
        print(f"TRAIN LOSS: {train_metrics['loss']:.5f}", end=" | ")
        print(f"VALIDATION LOSS: {validation_metrics['loss']:.5f}", end=" ; ")
        print(f"ACCURACY: {validation_metrics['accuracy']:.5f}", end=" ; ")
        print(f"F1 (MACRO): {validation_metrics['f1']:.5f}")

In [None]:
# Save the trained model and tokenizer.
# This was run separately on mimi and the result is stored in "saved_model", so it is
# disabled by default. Set SAVE_FINE_TUNED_MODEL = True to overwrite the saved copy.
# (The code used to be disabled by wrapping it in a string literal, which Jupyter echoes
# as the cell output.)

SAVE_FINE_TUNED_MODEL = False

if SAVE_FINE_TUNED_MODEL:
    # Note: this moves `model` to the CPU in place before saving.
    model.cpu()
    model.save_pretrained("saved_model")
    tokenizer.save_pretrained("saved_model")

In [None]:
# Load the fine-tuned model from "saved_model" and move it to the best available device.

trainedmodel = BertForSequenceClassification.from_pretrained("saved_model")

# Previously the "falling back to CPU" message was printed but `device` was still forced
# to "mps", so moving the model failed on machines without an Apple GPU.
if torch.cuda.is_available():
    print("CUDA GPU is available!")
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    print("MPS (Apple GPU) is available!")
    device = torch.device("mps")
else:
    print("MPS is not available. Falling back to CPU.")
    device = torch.device("cpu")

trainedmodel.to(device)

In [None]:
# Evaluate the fine-tuned model on the test split and report accuracy and macro F1.

test_metrics = evaluatefaster(trainedmodel, dataloaders['test'])
print(f"TEST ACCURACY: {test_metrics['accuracy']:.5f}", end=" ; ")
print(f"F1 (MACRO): {test_metrics['f1']:.5f}")

In [None]:
# Wrap the fine-tuned model and the tokenizer in a single text-classification pipeline.
# Use the notebook-level `device` instead of a hard-coded "mps", which fails on machines
# without an Apple GPU.

news_topic_classifier = pipeline(task='text-classification', model=trainedmodel, tokenizer=tokenizer, device=device)