### Fine Tuning GPT-2

In [1]:
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from peft import LoraConfig, TaskType, PeftModel, get_peft_model
from pathlib import Path
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Checkpoint name passed to both AutoTokenizer and AutoModelForSequenceClassification below
mdl_tok_name = "gpt2"

  from .autonotebook import tqdm as notebook_tqdm


#### LoRA configuration for the GPT-2 model

In [2]:
# LoRA adapter settings for fine-tuning GPT-2 as a sequence classifier.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # Task type: sequence classification
    r=16,  # Low-rank dimension: start with 16, modify based on model size
    lora_alpha=32,  # Scaling factor, consider increasing for larger models
    lora_dropout=0.05,  # Dropout, increase slightly if facing overfitting
    # Module-name suffixes to adapt. "c_proj" appears in both the attention
    # block and the MLP block of GPT-2, so both projections receive adapters.
    target_modules=["c_attn", "c_proj"],
    # GPT-2 implements these projections as Conv1D layers, which store their
    # weight as (fan_in, fan_out). Declaring it explicitly avoids relying on
    # PEFT to detect the layer type and override the default at wrap time.
    fan_in_fan_out=True,
)

#### Loading the filtered dataset

In [3]:
# Define the file path to the dataset
file_path = Path("data/filtered_dataset.csv")

# Load the CSV with Hugging Face `load_dataset`; the rows are accessed through
# the "train" split below.
dataset = load_dataset("csv", data_files=str(file_path))

# Collect the distinct values of the 'Product' column. They are sorted so the
# integer id assigned to each class does not depend on the row order of the
# CSV file (ids follow the order of `names` in the ClassLabel).
product_classes = sorted(dataset["train"].unique("Product"))

# Convert the 'Product' column to a ClassLabel feature; this is also what
# makes `stratify_by_column` usable further down.
product_label = ClassLabel(names=product_classes)
dataset = dataset.cast_column("Product", product_label)

# Rename the columns: "Product" to "labels", and "Consumer complaint narrative" to "complaint"
dataset = dataset.rename_column("Product", "labels")
dataset = dataset.rename_column("Consumer complaint narrative", "complaint")

# Keep only the text and the label, then make a seeded, stratified 80/20 split
dataset = (
    dataset["train"]
    .select_columns(["complaint", "labels"])
    .train_test_split(
        test_size=0.2,
        shuffle=True,
        seed=23,
        stratify_by_column="labels",
    )
)

splits = ["train", "test"]

# View the resulting dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['complaint', 'labels'],
        num_rows: 5312
    })
    test: Dataset({
        features: ['complaint', 'labels'],
        num_rows: 1328
    })
})


#### Inspecting the labels

"Credit card" is encoded as label 0 and "Mortgage" as label 1.

In [4]:
# Display the ClassLabel feature; the position of each name is its integer label id
product_label

ClassLabel(names=['Credit card', 'Mortgage'], id=None)

#### Preprocess dataset

Tokenizing the `complaint` column (originally 'Consumer complaint narrative')

In [15]:
tokenizer = AutoTokenizer.from_pretrained(mdl_tok_name)

# Reuse the end-of-sequence token for padding when the tokenizer does not
# define a pad token of its own. No new token is added, so the vocabulary
# size stays unchanged.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_batch(batch):
    """Tokenize a batch of complaints, truncating to the model's max length.

    Padding is deliberately NOT applied here. Padding inside a batched `map`
    would pad every example to the longest sequence in its map batch, which
    inflates the stored dataset and the cost of every training step. The
    `DataCollatorWithPadding` given to the Trainer pads each mini-batch to its
    own longest sequence instead. `return_tensors` is omitted as well, because
    the mapped output is stored as plain lists in the dataset anyway.
    """
    return tokenizer(batch["complaint"], truncation=True)


# Tokenize every split; the original columns are kept alongside the new
# `input_ids` and `attention_mask` columns.
tokenized_dataset = {
    split: dataset[split].map(tokenize_batch, batched=True)
    for split in splits
}

# Inspect the available columns in the dataset
tokenized_dataset["train"]

Dataset({
    features: ['complaint', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 5312
})

#### Loading Model

In [16]:
# Derive the label maps from the ClassLabel feature used to encode the dataset,
# so the names stored in the model config cannot drift from the label ids
# actually present in the training data.
id2label = dict(enumerate(product_label.names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    mdl_tok_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# Make sure a pad token exists (EOS is reused as pad token for GPT2), then
# tell the model which id it is so padded positions can be identified.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

print(model)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)


In [17]:
# If you added new tokens, resize the model's embeddings accordingly.
# In this run the pad token reuses the existing EOS token, so the vocabulary
# size is unchanged and the embedding keeps its original shape (see output).
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [18]:
# Wrap the classifier with the LoRA adapters described by `lora_config`
peft_model = get_peft_model(model, lora_config)
# Report how many parameters remain trainable after wrapping
peft_model.print_trainable_parameters()

trainable params: 1,623,552 || all params: 126,064,896 || trainable%: 1.2879


#### Defining Evaluation Metrics as a function

In [19]:
def compute_metrics_v1(eval_pred):
    """Return plain accuracy for an evaluation pass.

    `eval_pred` unpacks into (class scores, reference labels); the predicted
    class for each row is the index of its highest score.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    hits = predicted == gold
    return {"accuracy": hits.mean()}

In [20]:
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for a binary classifier.

    Args:
        eval_pred: pair of (class scores, reference labels). The predicted
            class of each row is the index of its highest score.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use `average="binary"`, i.e. they are
        reported for the positive class (label 1).
    """
    # Unpack predictions and labels
    predictions, labels = eval_pred
    # Get the predicted class (argmax selects the class with the highest score)
    predictions = np.argmax(predictions, axis=1)
    # `zero_division=0` keeps the value scikit-learn already reports when the
    # positive class is never predicted (or never present), but without
    # emitting an UndefinedMetricWarning on every evaluation.
    binary_kwargs = {"average": "binary", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, **binary_kwargs),
        "recall": recall_score(labels, predictions, **binary_kwargs),
        "f1": f1_score(labels, predictions, **binary_kwargs),
    }


#### Define the Trainer to fine-tune the foundation model

The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.

You can find more at this [link](https://huggingface.co/docs/transformers/main_classes/trainer).

In [21]:
# Train the LoRA-wrapped model. Passing `peft_model` (rather than the bare
# `model`) lets the Trainer treat it as a PEFT model when it saves per-epoch
# checkpoints and reloads the best one at the end of training.
trainer = Trainer(
    model=peft_model,
    args=TrainingArguments(
        output_dir="./data/creditc_mortg",
        # Learning rate
        learning_rate=1e-5,  # 2e-5 # Lowered to prevent instability on CPU
        # Train/Validate batch size
        per_device_train_batch_size=4,  # 16 # Reduce batch size to avoid memory crashes
        per_device_eval_batch_size=4,  # 16 # Same for evaluation
        # Evaluate and save the model after each epoch; both strategies must
        # match for `load_best_model_at_end` to work.
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Epochs and weight decay
        num_train_epochs=1,  # Start with 1 epoch, increase as needed
        weight_decay=0.01,  # Standard weight decay
        # Resource management: 4 samples x 4 accumulation steps gives an
        # effective batch of 16 per optimizer update.
        gradient_accumulation_steps=4,
        load_best_model_at_end=True,
        no_cuda=True,  # Ensure no GPU usage
    ),
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    # Pads each mini-batch to the length of its longest example
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


#### Start fine-tuning

In [22]:
# Run the training loop configured above; the returned TrainOutput is shown as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.54878,0.753765,0.230769,0.029412,0.052174


TrainOutput(global_step=332, training_loss=2.1929725738893073, metrics={'train_runtime': 36451.3655, 'train_samples_per_second': 0.146, 'train_steps_per_second': 0.009, 'total_flos': 2829004428017664.0, 'train_loss': 2.1929725738893073, 'epoch': 1.0})

#### Validate fine-tuned model

In [23]:
# Score the model on the Trainer's eval_dataset (the "test" split) with compute_metrics
trainer.evaluate()

{'eval_loss': 1.5487797260284424,
 'eval_accuracy': 0.7537650602409639,
 'eval_precision': 0.23076923076923078,
 'eval_recall': 0.029411764705882353,
 'eval_f1': 0.05217391304347826,
 'eval_runtime': 1849.0893,
 'eval_samples_per_second': 0.718,
 'eval_steps_per_second': 0.18,
 'epoch': 1.0}

In [24]:
# Persist the PEFT-wrapped model to a local directory
peft_model.save_pretrained("./vtsoumpris/fnc-gpt2-lora")

In [25]:
# Pick a handful of test rows and line up their text, predicted class and
# reference label side by side for manual inspection.
items_for_manual_review = tokenized_dataset["test"].select(
    [0, 1, 22, 31, 43, 292, 448, 487]
)
results = trainer.predict(items_for_manual_review)

review_columns = {
    "complaint": [row["complaint"] for row in items_for_manual_review],
    "predictions": results.predictions.argmax(axis=1),
    "labels": results.label_ids,
}
df = pd.DataFrame(review_columns)

# Do not truncate long complaint texts when rendering the frame
pd.set_option("display.max_colwidth", None)
df

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,complaint,predictions,labels
0,XX/XX/2019/ XX/XX/XXXX XXXX XX/XX/XXXX-home equity line/ $ XXXX/ denied reason insufficient credit history/ on free and clear title deed home/ Citibank myhomeequity/ $ XXXX/XX/XX/2019/ action denied/ not approved/ free and clear title deed,0,1
1,I made a payment of {$100.00} on my credit card. I had a promo balance of {$4600.00} with 0 % interest. I made a purchase for {$25.00} on XXXX. I made XXXX payment that should have covered the balances of the the {$25.00} charge. Payment dates were XXXX for {$25.00} and XXXX for {$100.00}. I minimum payment was {$67.00} that month. I was charged a minimum interest fee of {$.00} on a balance of {$.00}. I think that charge is unfair and they refuse to refund me the {$.00}.,0,0
2,"On XX/XX/ 2018 a 10 ton 100 foot tall oak tree crushed my living room. \nIn the state of Virginia, that means that the lender CitiMortgage holds funds and releases them as progress is made no the insurance claim. \n\nAs of XX/XX/2018, the living room has been reconstructed, painted, etc. \n\nThe CitiMortgage system for managing inspections to get money released is broken. It has been 6 weeks since I started my request for final inspection and release of remaining funds. CitiMortgage would cancel inspections with no contact or notice. When the inspection was done, it was agreed that 100 % of structural work was complete and minor paint touch up was needed for walls and for floor. \n\nCitiMorgtage has decided that they will not honor the 100 % structural and are stating, without contacting me or reviewing with me. At first they claimed that only a 33 % complete inspection was done ( again their XXXX XXXX XXXX system is broken ) and then they have decided that since there is still some plywood in my living room protecting the floor while we finish any painting and getting money to pay for removal of the damaged baby grand piano ... .they have decided that we are only 95 % complete and as such will not release any funds. \n\nI am not allowed to get past any phone support at CitiMortgage to talk to an actual person making a decision. \nThis has been gong on for 6 weeks. The amount for the remaining funds is about {$22000.00}. \n\nI am truly dissappointed at CitiMortgage. They come off as a desperate bank that is not interested in letting money, that is not theirs, go.",0,1
3,XX/XX/2015 my daughter contacted XXXX to inquire about pricing about a flight to florida. Unknowingly when my daughter gave my credit card number XXXX considers that booking a flight. I disbuted the charge with Citi Bank they denied me stating I book the flight via internet and I read and understood the rules. Which was false. I called and stated This was a phone call and my daughter was on the phone and she is not an authorized user on my account and I should not be charged for an inquiry only. I contacted XXXX explained the situation they apologized for there agents mistake and explained it is company policy that they DO NOT give cash refunds they give flight credit vouchers. Which they did for the full amount of $ XXXX.on XX/XX/2015. I called Citibank and showed them the documentations and they still would not refund me. I then stated I would like to take this to court and let a XXXX make the final decision which they agreed to do. I also stated I am going make an advance payment of {$2900.00} which would leave only the {$120.00} balance in question. They agreed. Time went by I had forgotten about this until I applied for a car loan. To find out the Citi bank had destroyed my credit because i had not made any payments for 6 mo on the credit card. This is after making an agreement with me to let a court make the final decsssion and excepting a payment of {$2900.00} in advance so there would be no problem with a late payment. \nThis is outright wrong to do to me after making an agreement with me also i could have made monthly payments at no cost to me because i had interest free for 18 mo. \nXXXX XXXX XXXX is aware of this matter.,0,0
4,I applied for a credit card with Cit Bank a couple of month ago. I just realized that they are not displaying the minimum payment warning table. This is the table to inform consumers of the monetary cost and time it will take to pay off a credit card balance if they only make the minimum monthly payment.,0,0
5,"Dear Sir or Ma'am, We are experiencing a financial hardship related to having to care for our son who is XXXX, consequently we are not current with our Mortgage with CitiMortgage. We had been able to make payments on line through their website XXXX. \nThis morning to our surprise we found that the option to make payments is no longer available and our account locked. This was disappointing and creating more stress to not allowing us to have the opportunity to get our payment done. They advertise "" We are here to help you '', however being locked out of our online account prevents us from making our mortgage payment to have a digital fingerprint of our payment. "" Sorry you are not eligible to make payment online.",0,1
6,"I had a Home Depot credit card for years with a high creditlimit and never fell behind, always made more the minimumdue and all of a sudden without warning I got a letter froma collection agency saying my card was shut off accountclosed and I owed my balance asap or it was gon na ruin mycredit. So while I was talking to the people I asked them whythis happen what have I done ive been a perfect customerand could n't tell me. So right then they said if you pay thisamount they will wipe it clean so I did exactly that.. Thereceived what they asked for within 5 days. so I calledhome depot I honestly was very devasted and could n't understastand what happened. So I called and called finally reachedsomeone back at the collection agency because home depotwould n't talk to me that because a user on my card XXXX who has never used my card or had possession ofa card filed bankruptcy, which he did n't fall through with hehad to do it over a short sale because the mortgage companywas being a jerk even though we had a signed short salecontract for more then what was owed to XXXX theywere being jerks ... So I feel that it is n't right that this happenedI did nothing wrong and his bankruptcy which can be lookedup was never followed up with we had to do it cuz alawyer told him too ... So please look over and help me out..Ido n't see why it should be bad on my credit report. I wouldhave taken his name off hes never used my card ever! Theycan check theres XXXX different numbers on the cards ...",0,0
7,"There was a fraudulent activity on my Macy 's credit card in XXXX XXXX for XXXX charges : {$44.00} and {$47.00} on XX/XX/XXXX and XX/XX/XXXX respectively. I notified Macy 's of the fraudulent activity as soon as my statement arrived XX/XX/XXXX. I received their Statement of XXXX application XX/XX/XXXX and mailed the completed application on XXXX XXXX describing the fraud claim in writing. After numerous calls to Macy 's to check the status of the Fraud Investigation for two months, I was told that Fraud Department help desk had no access to the investigation details other than a status of "" pending ''. They could n't confirm whether the Statement of Fraud application was even received. In order to reassure myself they had received it, I mailed a copy of the initially filled out application with a certified letter XX/XX/XXXX. The certified letter was received on XXXX XXXX. I then called again to check the status and was reassured that the investigation will be completed in 30 to 90 days. The 90 days expire tomorrow ( XXXX/XXXX/XXXX ) and after my last phone call with them, the investigation is still not complete. Macy 's is reporting my account as delinquent and late because of these unpaid fraudulent charges and as a result my credit card score is plummeting. My life has been put on halt as I ca n't achieve my goals because of my poor credit card score. As a consumer, I have done everything in a timely manner and with care while Macy 's has just neglected the reported fraudulent activity on my account for 7 months.",0,0
