### Fine-Tuning GPT-2

In [29]:
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from pathlib import Path

# Checkpoint name passed to both AutoTokenizer.from_pretrained and
# AutoModelForSequenceClassification.from_pretrained further down, so the
# tokenizer and the model are always loaded from the same checkpoint.
mdl_tok_name = "gpt2"

#### Loading the pre-processed dataset

In [30]:
# Path to the pre-processed complaints CSV
file_path = Path("data/filtered_dataset.csv")

# Load the CSV with Hugging Face's `load_dataset`; the rows are read back
# below from the "train" split of the returned object.
dataset = load_dataset("csv", data_files=str(file_path))

# Collect the distinct 'Product' values. They are sorted because ClassLabel
# assigns integer ids by position in `names`; without sorting, the ids would
# depend on whatever order `unique` happens to return for this CSV.
product_classes = sorted(dataset["train"].unique("Product"))

# Convert the 'Product' column to a ClassLabel feature so it can be used
# with `stratify_by_column` below.
product_label = ClassLabel(names=product_classes)
dataset = dataset.cast_column("Product", product_label)

# Keep only the text and label columns, then make a stratified, seeded
# 80/20 train/test split.
dataset = (
    dataset["train"]
    .select_columns(["Consumer complaint narrative", "Product"])
    .train_test_split(
        test_size=0.2,
        shuffle=True,
        seed=23,
        stratify_by_column="Product",
    )
)

# Split names iterated over by the tokenization step
splits = ["train", "test"]

# View the resulting dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Consumer complaint narrative', 'Product'],
        num_rows: 5312
    })
    test: Dataset({
        features: ['Consumer complaint narrative', 'Product'],
        num_rows: 1328
    })
})


#### Inspecting the labels

`Credit card` is labeled as 0 and `Mortgage` is labeled as 1.

In [31]:
# Display the ClassLabel feature to check which integer id each product
# name received (ids follow the position of each name in `names`).
product_label

ClassLabel(names=['Credit card', 'Mortgage'], id=None)

#### Preprocess dataset

Tokenizing 'Consumer complaint narrative' feature values

In [32]:
tokenizer = AutoTokenizer.from_pretrained(mdl_tok_name)

# GPT-2 checkpoints ship without a padding token, so any later step that pads
# a batch (e.g. a padding data collator) would fail. Reuse the end-of-sequence
# token for padding when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_narratives(batch):
    """Tokenize a batch of complaint narratives.

    `batch` is the column-wise batch that `Dataset.map(..., batched=True)`
    passes in; only its 'Consumer complaint narrative' column is read.
    Sequences longer than the tokenizer's maximum length are truncated.
    No padding is applied here.
    """
    return tokenizer(batch["Consumer complaint narrative"], truncation=True)


# Tokenize every split, keeping the results in a plain dict keyed by split name
tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(tokenize_narratives, batched=True)

# Inspect the available columns in the dataset
tokenized_dataset["train"]

Map: 100%|██████████| 1328/1328 [00:00<00:00, 2342.32 examples/s]


Dataset({
    features: ['Consumer complaint narrative', 'Product', 'input_ids', 'attention_mask'],
    num_rows: 5312
})

#### Loading Model

In [36]:
# Derive the label maps from the ClassLabel feature instead of hard-coding
# them, so the classification head always agrees with the integer ids that
# are actually stored in the dataset's 'Product' column.
id2label = dict(enumerate(product_label.names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    mdl_tok_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# GPT-2 defines no padding token. The sequence-classification head needs
# `pad_token_id` to locate the last real token of each sequence and refuses
# batches larger than one without it, so fall back to the EOS token id.
if model.config.pad_token_id is None:
    model.config.pad_token_id = model.config.eos_token_id

print(model)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)
