# Mounting to Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install latest version of Huggingface and import necessary packages

In [2]:
!pip install -q git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from torchsummary import summary
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel
import json

Preparing the data for fine-tuning using OpenAI Playground

In [4]:
# Load training, test, and validation sets
PATH = '/content/drive/MyDrive/LLM_Assignment'
def load_data(PATH):
  train_data = pd.read_csv(PATH + '/training.csv')
  test_data = pd.read_csv(PATH + '/test.csv')
  val_data = pd.read_csv(PATH + '/validation.csv')
  return train_data, test_data, val_data

train_data, test_data, val_data = load_data(PATH)

In [None]:
train_data

Unnamed: 0,Domain,area,keywords,Abstract
0,Medical,Hepatitis C,Feasibility study; hepatitis C risk behaviour...,Aims: This study aimed to develop and test the...
1,CS,Distributed computing,Agent Architecture; Mobile Agent; Agent Cloni...,Mobile agent technology is becoming more popul...
2,ECE,Control engineering,educational software tool; multivariable cont...,This paper presents an educational software to...
3,Psychology,False memories,judgment; metamemory; accuracy; eyewitness me...,"Different researchers have reported positive, ..."
4,Psychology,Leadership,Implementation support; Co-occurring disorder...,Background: Incorporating evidence-based integ...
5,CS,Bioinformatics,secondary metabolism; bioinformatics; genome ...,The soil-borne gram-positive bacteria Aneurini...
6,Psychology,Depression,Adolescence; Expressed emotion; Psychopatholo...,Objective: To investigate the association betw...
7,ECE,Electrical generator,CO2 capture; Efficiency; Exergy; LNG (liquefi...,The LNG (liquefied natural gas) regasification...
8,CS,Distributed computing,Quantum computing; Approximation algorithms; ...,We make use of a kind of distributed semi-quan...
9,Medical,Ankylosing Spondylitis,Biologic agents; Registry; Rheumatoid arthrit...,Despite improved quality of care for rheumatoi...


# Formatting Datasets

In [None]:

def make_finetuning_data_gpt(dataset):
    abstracts = dataset['Abstract']
    domains = dataset['Domain']
    classified_data = []
    for abstract, domain in zip(abstracts, domains):
        message = {"messages": []}
        message["messages"].append({"role": "user", "content": abstract})
        message["messages"].append({"role": "assistant", "content": domain})
        classified_data.append(message)
    return classified_data

In [None]:
processed_train_data = make_finetuning_data_gpt(train_data)
processed_val_data = make_finetuning_data_gpt(val_data)

In [None]:
with open(PATH + '/training.jsonl', 'w') as f:
    for item in processed_train_data:
        json.dump(item, f)
        f.write('\n')  # Write a newline character to separate JSON objects

# Save validation data to JSONL
with open(PATH + '/validation.jsonl', 'w') as f:
    for item in processed_val_data:
        json.dump(item, f)
        f.write('\n')  # Write a newline character to separate JSON objects

Fine Tuning a Distilbert model on the training set


In [None]:
training_set, test_set, validation_set = load_data(PATH)

In [None]:
training_set['sample'] = 'training'
validation_set['sample'] = 'validation'
test_set['sample'] = 'test'

In [None]:
num_classes = training_set['Domain'].nunique()

In [None]:
from sklearn.preprocessing import LabelEncoder
all_sets = pd.concat([training_set, validation_set, test_set], axis=0)

# Encode Domain column
label_encoder = LabelEncoder()
all_sets['Encoded_Domain'] = label_encoder.fit_transform(all_sets['Domain'])

# Split back into individual sets
training_set = all_sets[all_sets['sample'] == 'training']
validation_set = all_sets[all_sets['sample'] == 'validation']
test_set = all_sets[all_sets['sample'] == 'test']

In [None]:
test_set

Unnamed: 0,Domain,area,keywords,Abstract,sample,Encoded_Domain
0,CS,Data structures,In-memory XPath processing; NESTOR; Set-based...,XML is a pervasive technology for representing...,test,0
1,Civil,Ambient Intelligence,Home energy management; persuasive interface;...,The integration of renewable energy sources in...,test,1
2,ECE,Electrical generator,non-standard electrical machine; generation o...,The original free-swinging piston engine with ...,test,2
3,Medical,Hepatitis C,complications; patient engagement; patient-ce...,Barriers to access and long-term complications...,test,4
4,ECE,Control engineering,force feedback haptic interface; virtual real...,This paper is to present a technological solut...,test,2
5,CS,Bioinformatics,Bioinformatics; genomics,Transposable elements (TEs) constitute the mos...,test,0
6,Medical,Weight Loss,Obesity; weight loss; moral work; body projec...,Cultural notions equating greater morality and...,test,4
7,Psychology,Leadership,Data-based decision making; school improvemen...,Although data-based decision making can lead t...,test,5
8,Psychology,Seasonal affective disorder,Ramelteon; sleep; agomelatine; depression; in...,Insomnia is common among elderly people and ne...,test,5
9,CS,Data structures,Succinct dynamic data structures; Succinct tr...,Cardinal trees (or tries of degree ) are a fun...,test,0


In [None]:
def make_finetuning_data_distilbert(dataset):

    labels = dataset['Encoded_Domain'].values

    encoded = tokenizer.batch_encode_plus(
        dataset['Abstract'],
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt',
        truncation=True
    )

    # Create a tensor dataset
    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': torch.tensor(labels)
    }

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")

training_set = make_finetuning_data_distilbert(training_set)
validation_set = make_finetuning_data_distilbert(validation_set)
test_set = make_finetuning_data_distilbert(test_set)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Preparing the dataloader

In [None]:
MAX_LENGTH = 128
BATCH_SIZE_TRAIN = 32
BATCH_SIZE_VAL = 16
BATCH_SIZE_TEST = 16

from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer

class FTDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, index):
        return {
            'input_ids': self.data['input_ids'][index],
            'attention_mask': self.data['attention_mask'][index],
            'labels': self.data['labels'][index]
        }

dataset_train = FTDataset(training_set)
dataloader_train = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE_TRAIN)
dataset_val = FTDataset(validation_set)
dataloader_val = DataLoader(dataset=dataset_val, batch_size=BATCH_SIZE_VAL)
dataset_test = FTDataset(test_set)
dataloader_test = DataLoader(dataset=dataset_test, batch_size=BATCH_SIZE_TEST)

In [None]:
test_set

{'input_ids': tensor([[  101, 20950,  2003,  ...,  1998,  7926,   102],
         [  101,  1996,  8346,  ...,  1010,  1999,   102],
         [  101,  1996,  2434,  ...,     0,     0,     0],
         ...,
         [  101,  7863,  1024,  ...,  2099,  4442,   102],
         [  101,  1996,  5800,  ..., 13589, 13827,   102],
         [  101,  1037,  4338,  ...,  1010,  4525,   102]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'labels': tensor([0, 1, 2, 4, 2, 0, 4, 5, 5, 0, 4, 0, 3, 2, 5, 4, 1, 6, 1, 1])}

Adding a trainable layer(s) on top of DistilBert

In [None]:
class FTModel(nn.Module):
    def __init__(self, bert_model):
        super(FTModel, self).__init__()
        self.bert_model = bert_model
        self.dropout = nn.Dropout(0.1)  # Dropout layer for regularization
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_classes)  # Add a linear layer for classification
        self.softmax = nn.Softmax(dim=1)  # Softmax activation for multi-class classification

    def forward(self, input_ids, attention_mask):
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Use the [CLS] token embedding as the pooled output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        probabilities = self.softmax(logits)
        return logits, probabilities

# Create an instance of the FTModel
model = FTModel(bert_model)

In [None]:
LEARNING_RATE = 2e-5
# Use cross-entropy loss
loss_fn = nn.CrossEntropyLoss()

# Initialize Optimizer
optimizer= optim.Adam(model.parameters(),lr= LEARNING_RATE)

In [None]:
# Freeze parameters of the pre-trained Distilbert model
for param in model.bert_model.parameters():
    param.requires_grad = False

Fine-tuning and evaluation functions

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [None]:
VAL_FREQUENCY = 10
PATH1 = '/content/drive/MyDrive/LLM_Assignment/Model.pt'


def evaluate(model, dataloader_val, loss_fn, device):
    model.eval()

    total_loss = 0
    accurate_predictions = 0
    total_samples = 0
    nb_eval_steps = 0

    for batch in dataloader_val:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
          logits = model(input_ids, attention_mask=attention_mask)[0]

        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        _, predictions = torch.max(logits, dim=1)
        accurate_predictions += (predictions == labels).sum().item()
        total_samples += labels.size(0)

        nb_eval_steps+=1

    avg_loss = total_loss / nb_eval_steps
    accuracy = accurate_predictions / total_samples
    print(f"Validation Loss: {avg_loss}, Accuracy: {accuracy}")
    return avg_loss

def finetune(epochs, model, loss_fn, optimizer, dataloader_train, dataloader_val, device):
    min_vloss = 10000
    patience_index = 0
    patience = 5

    for epoch in range(epochs):
        model.train()
        tr_loss = 0
        nb_tr_steps = 0
        loop = tqdm(enumerate(dataloader_train), leave=False, total=len(dataloader_train))

        for step, batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs[0], labels)
            loss.backward()
            optimizer.step()

            tr_loss += loss.item()
            nb_tr_steps += 1

        avg_train_loss = tr_loss / nb_tr_steps
        print("Train loss: {}".format(avg_train_loss))

        if epoch % VAL_FREQUENCY == 0:
            val_loss = evaluate(model, dataloader_val, loss_fn, device)
            if val_loss <= min_vloss:
                min_vloss = val_loss
                torch.save(model.state_dict(), PATH1)
                patience_index = 0
            else:
                patience_index += 1
            if patience_index == patience:
                break

    return model
model = finetune(200, model, loss_fn, optimizer, dataloader_train, dataloader_val, device)



Train loss: 2.130968928337097
Validation Loss: 2.268948554992676, Accuracy: 0.08




Train loss: 2.0794063210487366




Train loss: 2.1097753047943115




Train loss: 2.0849048495292664




Train loss: 2.093376874923706




Train loss: 2.11032497882843




Train loss: 2.1011794209480286




Train loss: 2.104034900665283




Train loss: 2.0867156982421875




Train loss: 2.0707375407218933




Train loss: 2.0601966977119446
Validation Loss: 2.211119055747986, Accuracy: 0.1




Train loss: 2.080949902534485




Train loss: 2.0540565252304077




Train loss: 2.0601508021354675




Train loss: 2.057281732559204




Train loss: 2.0723517537117004




Train loss: 2.0113415718078613




Train loss: 2.0427663326263428




Train loss: 2.042474389076233




Train loss: 2.0490405559539795




Train loss: 2.0548046827316284
Validation Loss: 2.156253695487976, Accuracy: 0.16




Train loss: 2.0472235679626465




Train loss: 2.005276679992676




Train loss: 2.0358265042304993




Train loss: 2.0334011912345886




Train loss: 2.0295369029045105




Train loss: 2.022941470146179




Train loss: 2.036886215209961




Train loss: 2.028301477432251




Train loss: 1.9864330291748047




Train loss: 1.9940831065177917
Validation Loss: 2.104207992553711, Accuracy: 0.3




Train loss: 2.015466570854187




Train loss: 2.008911430835724




Train loss: 2.0022271275520325




Train loss: 1.9873091578483582




Train loss: 1.9998385906219482




Train loss: 1.9884308576583862




Train loss: 1.9916867017745972




Train loss: 1.9727020263671875




Train loss: 1.9759047031402588




Train loss: 1.9875507354736328
Validation Loss: 2.055089980363846, Accuracy: 0.32




Train loss: 1.9597914218902588




Train loss: 1.948026955127716




Train loss: 1.9632987976074219




Train loss: 1.9788766503334045




Train loss: 1.9612272381782532




Train loss: 1.9548190236091614




Train loss: 1.9422082901000977




Train loss: 1.9484710693359375




Train loss: 1.955627977848053




Train loss: 1.9674879908561707
Validation Loss: 2.008765935897827, Accuracy: 0.32




Train loss: 1.9545987248420715




Train loss: 1.9452161192893982




Train loss: 1.946256399154663




Train loss: 1.9439070224761963




Train loss: 1.9163551330566406




Train loss: 1.9503085017204285




Train loss: 1.9087671041488647




Train loss: 1.942805528640747




Train loss: 1.9389869570732117




Train loss: 1.937722384929657
Validation Loss: 1.9649399816989899, Accuracy: 0.34




Train loss: 1.9311075806617737




Train loss: 1.9182838797569275




Train loss: 1.9168893098831177




Train loss: 1.9031649231910706




Train loss: 1.911853551864624




Train loss: 1.909682035446167




Train loss: 1.8964174389839172




Train loss: 1.8977017998695374




Train loss: 1.9016806483268738




Train loss: 1.8885713815689087
Validation Loss: 1.9236093759536743, Accuracy: 0.36




Train loss: 1.8884300589561462




Train loss: 1.872288703918457




Train loss: 1.8893781900405884




Train loss: 1.8960694074630737




Train loss: 1.8869677186012268




Train loss: 1.873008906841278




Train loss: 1.891036033630371




Train loss: 1.8675674200057983




Train loss: 1.8750654458999634




Train loss: 1.8817746043205261
Validation Loss: 1.8848686814308167, Accuracy: 0.36




Train loss: 1.8583375811576843




Train loss: 1.8827233910560608




Train loss: 1.852781593799591




Train loss: 1.863670527935028




Train loss: 1.86995267868042




Train loss: 1.8467293977737427




Train loss: 1.8456562161445618




Train loss: 1.8520023226737976




Train loss: 1.8329458832740784




Train loss: 1.8491775393486023
Validation Loss: 1.8483034074306488, Accuracy: 0.38




Train loss: 1.8380566239356995




Train loss: 1.8211448788642883




Train loss: 1.8416093587875366




Train loss: 1.833822250366211




Train loss: 1.817156970500946




Train loss: 1.8199020624160767




Train loss: 1.8389384746551514




Train loss: 1.818708062171936




Train loss: 1.8361421823501587




Train loss: 1.8140105605125427
Validation Loss: 1.8135414719581604, Accuracy: 0.38




Train loss: 1.8135610222816467




Train loss: 1.8331856727600098




Train loss: 1.835495114326477




Train loss: 1.8071231842041016




Train loss: 1.8256674408912659




Train loss: 1.8174881339073181




Train loss: 1.8045598268508911




Train loss: 1.8077808022499084




Train loss: 1.7830559611320496




Train loss: 1.8012158274650574
Validation Loss: 1.7809302806854248, Accuracy: 0.4




Train loss: 1.8013775944709778




Train loss: 1.7988121509552002




Train loss: 1.794582724571228




Train loss: 1.8017499446868896




Train loss: 1.7684805989265442




Train loss: 1.7945058345794678




Train loss: 1.7841486930847168




Train loss: 1.8034693002700806




Train loss: 1.8022058606147766




Train loss: 1.8026604056358337
Validation Loss: 1.7500783205032349, Accuracy: 0.42




Train loss: 1.7737277746200562




Train loss: 1.8036286234855652




Train loss: 1.7797017097473145




Train loss: 1.7883883714675903




Train loss: 1.7784526348114014




Train loss: 1.7825552821159363




Train loss: 1.7718709111213684




Train loss: 1.7791683077812195




Train loss: 1.7802099585533142




Train loss: 1.7804085612297058
Validation Loss: 1.721586138010025, Accuracy: 0.44




Train loss: 1.7427635788917542




Train loss: 1.772345781326294




Train loss: 1.7507036328315735




Train loss: 1.75075364112854




Train loss: 1.7621873617172241




Train loss: 1.74333918094635




Train loss: 1.758155643939972




Train loss: 1.7675806879997253




Train loss: 1.7546894550323486




Train loss: 1.7415390014648438
Validation Loss: 1.6951889395713806, Accuracy: 0.5




Train loss: 1.7531208395957947




Train loss: 1.723136842250824




Train loss: 1.728075385093689




Train loss: 1.756837248802185




Train loss: 1.732921302318573




Train loss: 1.7357593774795532




Train loss: 1.7303782105445862




Train loss: 1.7306267619132996




Train loss: 1.7406067848205566




Train loss: 1.7370315790176392
Validation Loss: 1.6704608798027039, Accuracy: 0.5




Train loss: 1.7293192148208618




Train loss: 1.7055700421333313




Train loss: 1.725757658481598




Train loss: 1.714715301990509




Train loss: 1.719088077545166




Train loss: 1.750079095363617




Train loss: 1.7050374150276184




Train loss: 1.7153763175010681




Train loss: 1.7167111039161682




Train loss: 1.7057337760925293
Validation Loss: 1.6470391750335693, Accuracy: 0.52




Train loss: 1.6991853713989258




Train loss: 1.7150521874427795




Train loss: 1.7246984839439392




Train loss: 1.716287612915039




Train loss: 1.6864622831344604




Train loss: 1.7247511148452759




Train loss: 1.702700436115265




Train loss: 1.7011532187461853




Train loss: 1.6992537379264832




Train loss: 1.7013089060783386
Validation Loss: 1.6251075565814972, Accuracy: 0.5




Train loss: 1.7206335067749023




Train loss: 1.688014268875122




Train loss: 1.6767684817314148




Train loss: 1.686655342578888




Train loss: 1.685020089149475




Train loss: 1.7064286470413208




Train loss: 1.6815001964569092




Train loss: 1.6957504749298096




Train loss: 1.6733915209770203




Train loss: 1.6910793781280518
Validation Loss: 1.6049956977367401, Accuracy: 0.48




Train loss: 1.6832889318466187




Train loss: 1.7135361433029175




Train loss: 1.6637777090072632




Train loss: 1.691058874130249




Train loss: 1.6599839329719543




Train loss: 1.6699337363243103




Train loss: 1.6888709664344788




Train loss: 1.6954888701438904




Train loss: 1.6719611287117004




Train loss: 1.646501898765564
Validation Loss: 1.5864416658878326, Accuracy: 0.5




Train loss: 1.6725288033485413




Train loss: 1.653636872768402




Train loss: 1.6515762209892273




Train loss: 1.670377790927887




Train loss: 1.667078673839569




Train loss: 1.684049665927887




Train loss: 1.6550106406211853




Train loss: 1.6790630221366882


                                             

Train loss: 1.6600894927978516




Test Set Evaluation

In [None]:
from sklearn.metrics import accuracy_score
def evaluate_on_test(model, test_set):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in test_set:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)[0]
            _, predicted = torch.max(outputs, dim=1)

            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    print(f"True Labels: {true_labels}")
    print(f"Predicted Labels: {predictions}")
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy


test_accuracy = evaluate_on_test(model, dataloader_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

True Labels: [0, 1, 2, 4, 2, 0, 4, 5, 5, 0, 4, 0, 3, 2, 5, 4, 1, 6, 1, 1]
Predicted Labels: [2, 6, 6, 6, 6, 6, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 6, 6]
Test Accuracy: 20.00%


Fine-tune all parameters


In [5]:
training_set, test_set, validation_set = load_data(PATH)

In [6]:
training_set['sample'] = 'training'
validation_set['sample'] = 'validation'
test_set['sample'] = 'test'

In [7]:
num_classes = training_set['Domain'].nunique()

In [8]:
num_classes

7

In [9]:
from sklearn.preprocessing import LabelEncoder
all_sets = pd.concat([training_set, validation_set, test_set], axis=0)

label_encoder = LabelEncoder()
all_sets['Encoded_Domain'] = label_encoder.fit_transform(all_sets['Domain'])

training_set = all_sets[all_sets['sample'] == 'training']
validation_set = all_sets[all_sets['sample'] == 'validation']
test_set = all_sets[all_sets['sample'] == 'test']

In [10]:
def make_finetuning_data_distilbert(dataset):

    labels = dataset['Encoded_Domain'].values

    encoded = tokenizer.batch_encode_plus(
        dataset['Abstract'],
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt',
        truncation=True
    )

    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': torch.tensor(labels)
    }

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased")

training_set = make_finetuning_data_distilbert(training_set)
validation_set = make_finetuning_data_distilbert(validation_set)
test_set = make_finetuning_data_distilbert(test_set)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [11]:
MAX_LENGTH = 128
BATCH_SIZE_TRAIN = 32
BATCH_SIZE_VAL = 16
BATCH_SIZE_TEST = 16

from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer

class FTDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, index):
        return {
            'input_ids': self.data['input_ids'][index],
            'attention_mask': self.data['attention_mask'][index],
            'labels': self.data['labels'][index]
        }

dataset_train = FTDataset(training_set)
dataloader_train = DataLoader(dataset=dataset_train, batch_size=BATCH_SIZE_TRAIN)
dataset_val = FTDataset(validation_set)
dataloader_val = DataLoader(dataset=dataset_val, batch_size=BATCH_SIZE_VAL)
dataset_test = FTDataset(test_set)
dataloader_test = DataLoader(dataset=dataset_test, batch_size=BATCH_SIZE_TEST)

In [12]:
bert_model_fullft = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [13]:
class FTModelFull(nn.Module):
    def __init__(self, bert_model, num_classes, second_num_classes):
        super(FTModelFull, self).__init__()
        self.bert_model = bert_model
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

        # Additional classifier
        self.second_classifier = nn.Linear(num_classes, second_num_classes)
        self.second_softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)

        logits = self.classifier(pooled_output)
        probabilities = self.softmax(logits)

        # Pass the output through the second classifier
        second_logits = self.second_classifier(probabilities)
        second_probabilities = self.second_softmax(second_logits)

        return second_logits, second_probabilities

In [14]:
# Initialize model but don't freeze Distilbert parameters
model_full_ft = FTModelFull(bert_model_fullft, num_classes, num_classes).to('cuda')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_full_ft = model_full_ft.to(device)

In [15]:
from transformers import AdamW

In [16]:
loss_fn = nn.CrossEntropyLoss()

# Choose parameters wisely!
learning_rate = 2e-5
adam_epsilon = 1e-8

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model_full_ft.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.2},
    {'params': [p for n, p in model_full_ft.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)



In [17]:
VAL_FREQUENCY = 10
PATH1 = '/content/drive/MyDrive/LLM_Assignment/Model2.pt'

# Function used to evaluate the model on
# the validation set
def evaluate(model, dataloader_val, loss_fn, device):
    model.eval()

    # Tracking variables
    total_loss = 0
    accurate_predictions = 0
    total_samples = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in dataloader_val:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.no_grad():
          logits = model(input_ids, attention_mask=attention_mask)[0]

        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        _, predictions = torch.max(logits, dim=1)
        accurate_predictions += (predictions == labels).sum().item()
        total_samples += labels.size(0)

        nb_eval_steps+=1

    avg_loss = total_loss / nb_eval_steps
    accuracy = accurate_predictions / total_samples
    print(f"Validation Loss: {avg_loss}, Accuracy: {accuracy}")
    return avg_loss

def finetune(epochs, model, loss_fn, optimizer, dataloader_train, dataloader_val, device):
    min_vloss = 10000
    patience_index = 0
    patience = 5

    for epoch in range(epochs):
        model.train()
        tr_loss = 0
        nb_tr_steps = 0
        loop = tqdm(enumerate(dataloader_train), leave=False, total=len(dataloader_train))

        for step, batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs[0], labels)
            loss.backward()
            optimizer.step()

            tr_loss += loss.item()
            nb_tr_steps += 1

        avg_train_loss = tr_loss / nb_tr_steps
        print("Train loss: {}".format(avg_train_loss))

        if epoch % VAL_FREQUENCY == 0:
            val_loss = evaluate(model, dataloader_val, loss_fn, device)
            if val_loss <= min_vloss:
                min_vloss = val_loss
                torch.save(model.state_dict(), PATH1)
                patience_index = 0
            else:
                patience_index += 1
            if patience_index == patience:
                break

    return model
model_full_ft = finetune(200, model_full_ft, loss_fn, optimizer, dataloader_train, dataloader_val, device)



Train loss: 1.9644247889518738
Validation Loss: 1.9453555047512054, Accuracy: 0.12




Train loss: 1.9509192109107971




Train loss: 1.9416365027427673




Train loss: 1.9280498027801514




Train loss: 1.917827069759369




Train loss: 1.9080519080162048




Train loss: 1.8980006575584412




Train loss: 1.8884796500205994




Train loss: 1.8803942799568176




Train loss: 1.8713381886482239




Train loss: 1.857170283794403
Validation Loss: 1.8254092931747437, Accuracy: 0.36




Train loss: 1.8486335277557373




Train loss: 1.8369040489196777




Train loss: 1.8264393210411072




Train loss: 1.8170443177223206




Train loss: 1.8091287016868591




Train loss: 1.8013059496879578




Train loss: 1.7939849495887756




Train loss: 1.7883821725845337




Train loss: 1.7835643887519836




Train loss: 1.7793784141540527
Validation Loss: 1.8141704201698303, Accuracy: 0.32




Train loss: 1.7750423550605774




Train loss: 1.7705166935920715




Train loss: 1.7661305665969849




Train loss: 1.761139452457428




Train loss: 1.7580763697624207




Train loss: 1.752539336681366




Train loss: 1.7469048500061035




Train loss: 1.7444011569023132




Train loss: 1.741435945034027




Train loss: 1.7377976179122925
Validation Loss: 1.8078336119651794, Accuracy: 0.48




Train loss: 1.7353477478027344




Train loss: 1.7330126762390137




Train loss: 1.7306084036827087




Train loss: 1.7284159064292908




Train loss: 1.7268155813217163




Train loss: 1.7250902652740479




Train loss: 1.7240005731582642




Train loss: 1.7222782969474792




Train loss: 1.7215207815170288




Train loss: 1.7196444272994995
Validation Loss: 1.8030438721179962, Accuracy: 0.5




Train loss: 1.7186006903648376




Train loss: 1.7178491353988647




Train loss: 1.7175626158714294




Train loss: 1.7171627283096313




Train loss: 1.7166049480438232




Train loss: 1.7163475155830383




Train loss: 1.7158896327018738




Train loss: 1.7154839634895325




Train loss: 1.7150752544403076




Train loss: 1.714928925037384
Validation Loss: 1.7988813519477844, Accuracy: 0.5




Train loss: 1.714699625968933




Train loss: 1.7145585417747498




Train loss: 1.7140427231788635




Train loss: 1.7138962745666504




Train loss: 1.713526725769043




Train loss: 1.7127352356910706




Train loss: 1.710715651512146




Train loss: 1.7099753618240356




Train loss: 1.708152174949646




Train loss: 1.7075458765029907
Validation Loss: 1.7974678874015808, Accuracy: 0.5




Train loss: 1.7074699401855469




Train loss: 1.7071110606193542




Train loss: 1.7068724632263184




Train loss: 1.7065925002098083




Train loss: 1.7065401077270508




Train loss: 1.7063642740249634




Train loss: 1.7061457633972168




Train loss: 1.705927312374115




Train loss: 1.7057717442512512




Train loss: 1.7056569457054138
Validation Loss: 1.7911417782306671, Accuracy: 0.52




Train loss: 1.7055429220199585




Train loss: 1.7055351734161377




Train loss: 1.7053186893463135




Train loss: 1.7051680088043213




Train loss: 1.7050792574882507




Train loss: 1.7049657702445984




Train loss: 1.7048330903053284




Train loss: 1.7047885060310364




Train loss: 1.7046520113945007




Train loss: 1.7045778036117554
Validation Loss: 1.7988111078739166, Accuracy: 0.5




Train loss: 1.70440673828125




Train loss: 1.7044952511787415




Train loss: 1.7043153643608093




Train loss: 1.7041712999343872




Train loss: 1.704345703125




Train loss: 1.7040979266166687




Train loss: 1.7039611339569092




Train loss: 1.7038320899009705




Train loss: 1.7038033604621887




Train loss: 1.703701138496399
Validation Loss: 1.7973912060260773, Accuracy: 0.52




Train loss: 1.70378839969635




Train loss: 1.7034704685211182




Train loss: 1.7034342885017395




Train loss: 1.7032396793365479




Train loss: 1.703186571598053




Train loss: 1.7030878067016602




Train loss: 1.7031378149986267




Train loss: 1.7029587030410767




Train loss: 1.702873945236206




Train loss: 1.703017771244049
Validation Loss: 1.7975557744503021, Accuracy: 0.54




Train loss: 1.7027294039726257




Train loss: 1.7026064991950989




Train loss: 1.7025581002235413




Train loss: 1.702422320842743




Train loss: 1.7023337483406067




Train loss: 1.702258825302124




Train loss: 1.702233076095581




Train loss: 1.7021033763885498




Train loss: 1.702088475227356




Train loss: 1.7019977569580078
Validation Loss: 1.798300862312317, Accuracy: 0.52




Train loss: 1.7019288539886475




Train loss: 1.7017910480499268




Train loss: 1.7017191648483276




Train loss: 1.7016353011131287




Train loss: 1.7015718817710876




Train loss: 1.7015182971954346




Train loss: 1.7015318274497986




Train loss: 1.701360046863556




Train loss: 1.7012701034545898


                                             

Train loss: 1.701372742652893
Validation Loss: 1.799689382314682, Accuracy: 0.54




Test Set Evaluation

In [18]:
from sklearn.metrics import accuracy_score
def evaluate_on_test(model, test_set):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in test_set:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)[0]
            _, predicted = torch.max(outputs, dim=1)

            predictions.extend(predicted.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    print(f"True Labels: {true_labels}")
    print(f"Predicted Labels: {predictions}")
    accuracy = accuracy_score(true_labels, predictions)
    return accuracy


test_accuracy = evaluate_on_test(model_full_ft, dataloader_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

True Labels: [0, 1, 2, 4, 2, 0, 4, 5, 5, 0, 4, 0, 3, 2, 5, 4, 1, 6, 1, 1]
Predicted Labels: [0, 0, 0, 4, 0, 6, 1, 1, 4, 0, 4, 0, 0, 0, 1, 4, 3, 6, 0, 3]
Test Accuracy: 35.00%
