In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from transformers import get_linear_schedule_with_warmup
import numpy as np
import random
import time
import datetime

In [2]:
dataset = load_dataset("nyu-mll/glue", "qnli")

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/877k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/104743 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5463 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5463 [00:00<?, ? examples/s]

In [3]:
# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
tokenizer.decode([101,102,103, 104], return_tensors='pt')

2024-06-18 05:17:40.217206: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-18 05:17:40.217325: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-18 05:17:40.340028: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


'[CLS] [SEP] [MASK] [unused100]'

In [5]:
tokenizer.encode("Hello, [SEP] my dog is cute")

[101, 8667, 117, 102, 1139, 3676, 1110, 10509, 102]

In [6]:
set(dataset['train']['label'])

{0, 1}

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
})

In [8]:
random_train = dataset['train'].select(range(2269,12269))
random_val = dataset['validation'].select(range(2269,3269))
random_test = dataset['validation'].select(range(3269, 4269))

In [9]:
sum(random_train['label'])

5013

In [10]:
random_train['question'][0:5]

["Which artist guested on a live version of Queen's The Show Must Go On?",
 'marshall jefferson got involved in house music after hearing whose music?',
 "What trend led to the decrease of Estonia's GDP?",
 'Which architects in the US and Britain still employ the Georgian style for private residences?',
 'Who was responsible for crafting a new look for all Apple products?']

In [11]:
# Assuming dataset is a list of dicts with 'question', 'sentence', and 'label' keys
def template(template_type, dataset):
	if template_type == 'normal':
		inputs= tokenizer([f"{q['question']}  [SEP] {q['sentence']}" for q in dataset],
						padding=True, truncation=True, return_tensors="pt", max_length=256)
	elif template_type == 'PCP':
		inputs= tokenizer([f"{q['question']}  [MASK] , {q['context']}" for q in dataset], 
					padding=True, truncation=True, return_tensors="pt", max_length=256)		
	train_labels = torch.tensor([q['label'] for q in dataset])
	return inputs, train_labels
	


In [12]:
train_inputs, train_outputs = template('normal',random_train)
valid_inputs, valid_outputs = template('normal',random_val)
test_inputs, test_outputs = template('normal',random_test)

In [13]:
train_data = TensorDataset(train_inputs['input_ids'],train_inputs['token_type_ids'],
                            train_inputs['attention_mask'], train_outputs)
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)

valid_data= TensorDataset(valid_inputs['input_ids'],valid_inputs['token_type_ids'],
							valid_inputs['attention_mask'], valid_outputs)
valid_loader = DataLoader(valid_data, batch_size=32 )

test_data = TensorDataset(test_inputs['input_ids'],test_inputs['token_type_ids'],
							test_inputs['attention_mask'], test_outputs)
test_loader = DataLoader(test_data, batch_size=32 )


In [14]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# model = torch.compile(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
train_data[0][1].shape

torch.Size([256])

In [108]:

# Freeze word_embeddings
def setting_model(model):
	for param in model.bert.parameters():
		param.requires_grad = False

	for param in model.classifier.parameters():
		param.requires_grad = True 

	for param in model.bert.embeddings.word_embeddings.parameters():
		param.requires_grad = True 

# # Freeze position_embeddings
# for param in model.bert.embeddings.position_embeddings.parameters():
#     param.requires_grad = False

# # Freeze token_type_embeddings
# for param in model.bert.embeddings.token_type_embeddings.parameters():
#     param.requires_grad = False

In [109]:
# setting model
setting_model(model)

In [23]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
	pred_flat = np.argmax(preds, axis=1).flatten()
	labels_flat = labels.flatten()
	return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [16]:
# model.add_adapter("qnli", adapter_type=AdapterType.text_task)

In [17]:
# model.add_adapter("qnli", adapter_type=AdapterType.text_task, config="pfeiffer")

In [16]:
from torch.optim import AdamW

In [17]:
# Setup the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, betas=(0.9,0.98), weight_decay =0.01)

# Number of training epochs
epochs = 2

# Total number of training steps is [number of batches] x [number of epochs]
total_steps = len(train_loader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
											num_warmup_steps=int(0.06*total_steps),
											num_training_steps=total_steps)

# Seed setting for reproducibility
seed_val = 42
random.seed(seed_val+222)
np.random.seed(seed_val+222)
torch.manual_seed(seed_val+222)
torch.cuda.manual_seed_all(seed_val+222)


# Function for formatting elapsed times
def format_time(elapsed):
	elapsed_rounded = int(round((elapsed)))
	return str(datetime.timedelta(seconds=elapsed_rounded))


In [18]:
total_steps

2500

In [19]:
# Training loop
def train(model, loader, optimizer, scheduler, epochs, device):
	# # Set the seed value all over the place to make this reproducible.
	# seed_val = 42
	# random.seed(seed_val)
	# np.random.seed(seed_val)
	# torch.manual_seed(seed_val)
	# torch.cuda.manual_seed_all(seed_val)

	# Store the average loss after each epoch so we can plot them.
	loss_values = []
	for epoch_i in range(0, epochs):
		# Perform one full pass over the training set.
		print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
		# Measure how long the training epoch takes.
		t0 = time.time()
		
		# Reset the total loss for this epoch.
		total_loss = 0
		
		# Put the model into training mode.
		model.train()

		for step, batch in enumerate(loader):
			# Progress update every 40 batches.
			if step % 200 == 0 and not step == 0:
				print('  Batch {:>1,}  of  {:>1,}.    Elapsed: , Loss {:}'.format(step, len(loader) , total_loss / (step+1)))
			# `batch` contains three pytorch tensors:
			#   [0]: input ids 
			# 	[1]: token_type_ids
			#   [2]: attention masks
			#   [3]: labels 
			b_input_ids = batch[0].to(device)
			b_token_type_ids = batch[1].to(device)
			b_input_mask = batch[2].to(device)
			b_labels = batch[3].to(device)
			
			# Always clear any previously calculated gradients before performing a backward pass.
			model.zero_grad()        
			
			# Perform a forward pass (evaluate the model on this training batch).
			# This will return the loss (rather than the model output) because we have provided the `labels`.
			outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask, labels=b_labels)
			loss = outputs.loss
	
			loss.backward()
			torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
			optimizer.step()
			scheduler.step()
			total_loss += loss.item()
		avg_train_loss = total_loss / len(loader)
		print(f"Average training loss: {avg_train_loss:.2f}")

In [20]:
train(model,train_loader, optimizer, scheduler, epochs, device)

  Batch 200  of  1,250.    Elapsed: , Loss 0.6794830278970709
  Batch 400  of  1,250.    Elapsed: , Loss 0.6444760851766403
  Batch 600  of  1,250.    Elapsed: , Loss 0.6127527945995728
  Batch 800  of  1,250.    Elapsed: , Loss 0.5927456688977657
  Batch 1,000  of  1,250.    Elapsed: , Loss 0.5802010090707185
  Batch 1,200  of  1,250.    Elapsed: , Loss 0.568572939589905
Average training loss: 0.57
  Batch 200  of  1,250.    Elapsed: , Loss 0.40790664258213777
  Batch 400  of  1,250.    Elapsed: , Loss 0.4046985257203294
  Batch 600  of  1,250.    Elapsed: , Loss 0.3956586726245388
  Batch 800  of  1,250.    Elapsed: , Loss 0.39912180267692954
  Batch 1,000  of  1,250.    Elapsed: , Loss 0.39573572378988925
  Batch 1,200  of  1,250.    Elapsed: , Loss 0.3931263323848467
Average training loss: 0.39


In [24]:
def evaluation(model, loader, device):
	model.eval()
	total_eval_accuracy = 0
	for batch in loader:
		b_input_ids,b_token_type_ids, b_input_mask, b_labels = batch
		b_input_ids = b_input_ids.to(device)
		b_token_type_ids = b_token_type_ids.to(device)
		b_input_mask = b_input_mask.to(device)
		b_labels = b_labels.to(device)
		
		with torch.no_grad():
			outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)
		
		logits = outputs.logits
		logits = logits.detach().cpu().numpy()
		label_ids = b_labels.to('cpu').numpy()
		
		# Calculate the accuracy for this batch of test sentences
		total_eval_accuracy += flat_accuracy(logits, label_ids)

	# Report the final accuracy for this validation run
	avg_val_accuracy = total_eval_accuracy / len(loader)
	print("Accuracy: {0:.2f}".format(avg_val_accuracy))

	return avg_val_accuracy



In [25]:
evaluation(model, train_loader, device)

Accuracy: 0.92


0.9247

In [26]:
evaluation(model, valid_loader, device)

Accuracy: 0.83


0.830078125

In [27]:
evaluation(model, test_loader, device)

Accuracy: 0.82


0.8154296875

In [105]:
del model
torch.cuda.empty_cache()

In [121]:
# Save the model's state dictionary
torch.save(model.state_dict(), 'bert_finetuned_qnli.bin')

# Optionally, save the entire model (not recommended due to potential issues when loading)
# torch.save(model, 'bert_finetuned_qnli_full_model.pth')

In [None]:
# Load the tokenizer and model architecture as before
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Load the model's state dictionary
model.load_state_dict(torch.load('bert_finetuned_qnli.bin'))

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)