### Fine Tuning GPT-2

In [100]:
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from peft import LoraConfig, TaskType, PeftModel, get_peft_model
from pathlib import Path
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Checkpoint name handed to `from_pretrained` for both the tokenizer and the model
mdl_tok_name = "gpt2"

#### LoRA configuration for the GPT-2 model

In [101]:
# LoRA adapter settings used when wrapping the GPT-2 classifier.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # sequence classification
    r=16,  # low-rank dimension: start with 16, adjust based on model size
    lora_alpha=32,  # LoRA scaling factor; consider increasing for larger models
    lora_dropout=0.05,  # dropout on the adapter path; raise slightly if overfitting
    # "c_proj" is the name of both the attention and the MLP output projection in
    # each GPT-2 block, so adapters are attached to c_attn and to both c_proj layers.
    target_modules=["c_attn", "c_proj"],
    # The targeted GPT-2 layers are Conv1D modules, which store their weight as
    # (fan_in, fan_out); declare that explicitly instead of relying on PEFT to
    # detect it and correct the setting at wrap time.
    fan_in_fan_out=True,
)

#### Loading the filtered dataset

In [117]:
# Path to the filtered complaints CSV, relative to the notebook's working directory
file_path = Path("data/filtered_dataset.csv")

# Load the CSV with Hugging Face's `load_dataset`; the rows are read from the "train" key
raw_dataset = load_dataset("csv", data_files=str(file_path))

# Distinct values of the 'Product' column. Sorting makes the name -> integer id
# assignment deterministic instead of depending on the row order of the CSV.
product_classes = sorted(raw_dataset["train"].unique("Product"))

# ClassLabel feature used to encode 'Product' as integer class ids
product_label = ClassLabel(names=product_classes)

# Encode the label column, rename "Product" -> "labels" and
# "Consumer complaint narrative" -> "complaint", and keep only those two columns
labeled_dataset = (
    raw_dataset["train"]
    .cast_column("Product", product_label)
    .rename_column("Product", "labels")
    .rename_column("Consumer complaint narrative", "complaint")
    .select_columns(["complaint", "labels"])
)

# Stratified 80/20 split with a fixed seed so the split is reproducible
dataset = labeled_dataset.train_test_split(
    test_size=0.2,
    shuffle=True,
    seed=23,
    stratify_by_column="labels",
)

# Split names iterated over by the later preprocessing cell
splits = ["train", "test"]

# View the resulting dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['complaint', 'labels'],
        num_rows: 5312
    })
    test: Dataset({
        features: ['complaint', 'labels'],
        num_rows: 1328
    })
})


#### Inspecting the labels

"Credit card" is encoded as label 0 and "Mortgage" as label 1.

In [118]:
# Display the ClassLabel feature to check the order of the class names
product_label

ClassLabel(names=['Credit card', 'Mortgage'], id=None)

#### Preprocess dataset

Tokenize the complaint text (the `complaint` column, renamed from 'Consumer complaint narrative').

In [120]:
tokenizer = AutoTokenizer.from_pretrained(mdl_tok_name)

# If the tokenizer has no padding token, reuse the end-of-sequence token for padding.
# Alternative: register a dedicated token with
# `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`; the model's embeddings
# must then be resized to `len(tokenizer)`.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_batch(batch):
    """Tokenize a batch of complaint texts.

    Sequences are truncated to the tokenizer's maximum length but are NOT padded
    here: padding inside `map` would pad every example to the longest sequence of
    its map batch and store all of that padding in the dataset. Padding is left to
    the data collator, which pads each training batch to its own longest sequence.
    `return_tensors` is likewise omitted because `map` stores plain lists.
    """
    return tokenizer(batch["complaint"], truncation=True)


# Tokenize every split; the original columns are kept alongside the new ones
tokenized_dataset = {
    split: dataset[split].map(tokenize_batch, batched=True)
    for split in splits
}

# Inspect the available columns in the dataset
tokenized_dataset["train"]

Map: 100%|██████████| 5312/5312 [00:03<00:00, 1610.18 examples/s]
Map: 100%|██████████| 1328/1328 [00:00<00:00, 1633.45 examples/s]


Dataset({
    features: ['complaint', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 5312
})

#### Loading Model

In [121]:
# Derive the label maps from the dataset's ClassLabel feature so the names stored in
# the model config always match the integer ids used in the "labels" column.
id2label = dict(enumerate(product_label.names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    mdl_tok_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Tell the model which token id is padding. The pad token itself is chosen where the
# tokenizer is created; it is not reassigned here so that choice is respected.
model.config.pad_token_id = tokenizer.pad_token_id

print(model)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)


In [122]:
# Resize the model's embedding matrix to the tokenizer's vocabulary size.
# This only changes anything if new tokens were added to the tokenizer; when the
# pad token reuses the EOS token the vocabulary size is unchanged.
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [123]:
# Wrap the classifier with the LoRA adapters described by `lora_config`
peft_model = get_peft_model(model, lora_config)
# Print the number of trainable parameters next to the total parameter count
peft_model.print_trainable_parameters()

trainable params: 1,623,552 || all params: 126,064,896 || trainable%: 1.2879


#### Defining Evaluation Metrics as a function

In [124]:
def compute_metrics_v1(eval_pred):
    """Return the plain accuracy of the predictions.

    Args:
        eval_pred: pair ``(scores, labels)``; ``scores`` holds one row of
            per-class scores per example and ``labels`` the reference class ids.

    Returns:
        dict with a single key ``"accuracy"``: the fraction of examples whose
        highest-scoring class equals the reference label.
    """
    scores, labels = eval_pred
    # Index of the highest score along the class axis is the predicted class
    predicted_ids = np.argmax(scores, axis=1)
    hits = predicted_ids == labels
    return {"accuracy": hits.mean()}

In [125]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for a binary classifier.

    Args:
        eval_pred: pair ``(scores, labels)``; ``scores`` holds one row of
            per-class scores per example and ``labels`` the reference class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. The last three use scikit-learn's ``average="binary"`` mode.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row
    predicted = np.argmax(scores, axis=1)

    results = {"accuracy": accuracy_score(references, predicted)}
    binary_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in binary_scorers:
        results[metric_name] = scorer(references, predicted, average="binary")
    return results


#### Define the Trainer to fine-tune the foundation model

The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.

You can find more at this [link](https://huggingface.co/docs/transformers/main_classes/trainer).

In [126]:
# Training configuration, sized for a CPU-only run.
training_args = TrainingArguments(
    output_dir="./data/creditc_mortg",
    # Optimisation
    learning_rate=1e-5,  # lowered from 2e-5 to prevent instability on CPU
    weight_decay=0.01,  # standard weight decay
    num_train_epochs=1,  # start with 1 epoch, increase as needed
    # Batch size reduced from 16 to avoid memory crashes; with 4 accumulation
    # steps the effective train batch is 4 * 4 = 16 examples per optimizer step.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint at the end of every epoch, then restore the best one
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Name the label column explicitly so it is always forwarded to the model
    # (a loss is only returned when the model receives `labels`).
    label_names=["labels"],
    no_cuda=True,  # ensure no GPU usage
)

trainer = Trainer(
    # Train the PEFT wrapper rather than the bare base model, so the Trainer
    # operates on (and checkpoints) the LoRA-wrapped model.
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    # Pads each batch to its longest sequence at collation time
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


#### Start fine-tuning

In [113]:
# Run the fine-tuning loop with the arguments and datasets configured on `trainer`
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.

#### Validate fine-tuned model

In [None]:
# Evaluate on the dataset given to the Trainer as `eval_dataset`, reporting the
# metrics produced by its `compute_metrics` function
trainer.evaluate()