In [1]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
import random
import time
import datetime

In [2]:
# Load the "qnli" configuration of the "nyu-mll/glue" dataset through `datasets.load_dataset`.
dataset = load_dataset("nyu-mll/glue", "qnli")

In [4]:
# Load the tokenizer and model.
# Both are created from the same 'bert-base-cased' checkpoint name; num_labels=2
# requests a two-class classification head. Per the warning recorded below, that
# head is newly initialised and has to be trained before it is useful.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Decode a handful of raw token ids back to text to see which tokens they map to.
# `decode` produces a plain string, so the tensor-format argument that used to be
# passed here had no meaning and has been dropped.
tokenizer.decode([101, 102, 103, 104])

In [6]:
# Encode a string that contains a literal "[SEP]" to inspect the ids it produces
# (in the recorded output id 102 appears both mid-sequence and as the final id).
tokenizer.encode("Hello, [SEP] my dog is cute")

[101, 8667, 117, 102, 1139, 3676, 1110, 10509, 102]

In [35]:
# Collect the distinct label values present in the training split.
# NOTE(review): the recorded output below (an IndexError about tensor dimensions)
# does not look like it was produced by this expression — re-run to refresh it.
set(dataset['train']['label'])

IndexError: too many indices for tensor of dimension 2

In [8]:
# Display the loaded DatasetDict: its splits, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
})

In [9]:
# Carve fixed, contiguous index ranges out of the dataset. Despite the `random_`
# prefix nothing is sampled here — rows are picked by position:
#   random_train -> 10,000 rows of the train split (indices 2269..12268)
#   random_val   -> 1,000 rows of the validation split (indices 2269..3268)
#   random_test  -> the following 1,000 rows of the *validation* split
#                   (indices 3269..4268), so it does not overlap random_val
random_train = dataset['train'].select(range(2269,12269))
random_val = dataset['validation'].select(range(2269,3269))
random_test = dataset['validation'].select(range(3269, 4269))

In [10]:
# Sum of the labels in the training subset; if labels are 0/1 this is the number
# of label-1 rows (recorded output: 5013 of the 10,000 selected rows).
sum(random_train['label'])

5013

In [12]:
# Peek at the first five questions of the training subset.
random_train['question'][0:5]

["Which artist guested on a live version of Queen's The Show Must Go On?",
 'marshall jefferson got involved in house music after hearing whose music?',
 "What trend led to the decrease of Estonia's GDP?",
 'Which architects in the US and Britain still employ the Georgian style for private residences?',
 'Who was responsible for crafting a new look for all Apple products?']

In [15]:
# Assuming dataset is a list of dicts with 'question', 'sentence', and 'label' keys
def template(template_type, dataset):
    """Tokenize question/sentence rows using one of the supported text templates.

    Args:
        template_type: 'normal' joins question and sentence with a literal
            "  [SEP] " marker; 'PCP' joins them with "  [MASK] , ".
        dataset: iterable of mapping-like rows exposing the keys 'question',
            'sentence' and 'label'. It is iterated twice.

    Returns:
        A pair ``(inputs, labels)``: the output of the notebook-level
        `tokenizer` (padded, truncated to at most 256 tokens, PyTorch tensors)
        and a tensor holding each row's 'label'.

    Raises:
        ValueError: if `template_type` is not one of the supported names.
    """
    # Validate first so an unknown template fails with a clear message instead
    # of an UnboundLocalError at the return statement.
    if template_type == 'normal':
        joiner = "  [SEP] "
    elif template_type == 'PCP':
        joiner = "  [MASK] , "
    else:
        raise ValueError(
            f"Unknown template_type {template_type!r}; expected 'normal' or 'PCP'"
        )

    # Both templates read the 'sentence' column: the rows have no 'context'
    # key, so the PCP branch previously failed with a KeyError.
    texts = [f"{row['question']}{joiner}{row['sentence']}" for row in dataset]
    inputs = tokenizer(texts, padding=True, truncation=True,
                       return_tensors="pt", max_length=256)
    train_labels = torch.tensor([row['label'] for row in dataset])
    return inputs, train_labels
	


In [26]:
# Tokenize the three subsets with the 'normal' template, in train / val / test order.
(
    (train_inputs, train_outputs),
    (valid_inputs, valid_outputs),
    (test_inputs, test_outputs),
) = (template('normal', subset) for subset in (random_train, random_val, random_test))

In [None]:
# Every dataset item is ordered as: input_ids, token_type_ids, attention_mask, label.
# Only the training loader shuffles; validation and test keep their original order.
train_data = TensorDataset(
    train_inputs['input_ids'],
    train_inputs['token_type_ids'],
    train_inputs['attention_mask'],
    train_outputs,
)
train_loader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)

valid_data = TensorDataset(
    valid_inputs['input_ids'],
    valid_inputs['token_type_ids'],
    valid_inputs['attention_mask'],
    valid_outputs,
)
valid_loader = DataLoader(dataset=valid_data, batch_size=32, shuffle=False)

test_data = TensorDataset(
    test_inputs['input_ids'],
    test_inputs['token_type_ids'],
    test_inputs['attention_mask'],
    test_outputs,
)
test_loader = DataLoader(dataset=test_data, batch_size=32, shuffle=False)


In [None]:
# Move model to GPU if available, otherwise keep it on the CPU.
# `model.to(device)` is the last expression of the cell, so its return value is
# also what the cell displays.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# model = torch.compile(model)

In [34]:
# Shape of element 1 (the token_type_ids, given how train_data is assembled) of
# the first training example.
train_data[0][1].shape

torch.Size([256])

In [24]:

# Freeze the BERT encoder, but keep the classifier head and the word embeddings trainable
def setting_model(model):
    """Choose which parameters of `model` receive gradients.

    All parameters under ``model.bert`` are frozen first. The classifier head
    and the word-embedding table are then switched back to trainable, so the
    order of the three calls matters.
    """
    model.bert.requires_grad_(False)
    model.classifier.requires_grad_(True)
    model.bert.embeddings.word_embeddings.requires_grad_(True)

# # Freeze position_embeddings
# for param in model.bert.embeddings.position_embeddings.parameters():
#     param.requires_grad = False

# # Freeze token_type_embeddings
# for param in model.bert.embeddings.token_type_embeddings.parameters():
#     param.requires_grad = False

In [25]:
# setting model: apply the requires_grad policy from setting_model to the loaded model
setting_model(model)

In [31]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals its label.

    `preds` holds per-class scores with the classes along axis 1; `labels`
    holds the expected class ids and is flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).reshape(-1)
    expected = labels.reshape(-1)
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)

In [40]:
# model.add_adapter("qnli", adapter_type=AdapterType.text_task)

In [None]:
# model.add_adapter("qnli", adapter_type=AdapterType.text_task, config="pfeiffer")

In [None]:
# Setup the optimizer.
# torch.optim.AdamW is used instead of the deprecated AdamW class shipped with
# transformers. weight_decay=0.0 is passed explicitly because that was the
# default of the class being replaced, whereas PyTorch's own default is non-zero.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

# Number of training epochs
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]
total_steps = len(train_loader) * epochs

# Create the learning rate scheduler (linear decay over total_steps, no warm-up)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

# Seed setting for reproducibility.
# NOTE(review): this runs after the model and the data loaders were created in
# earlier cells, so it can only affect randomness drawn from here on.
seed_val = 42
random.seed(seed_val+222)
np.random.seed(seed_val+222)
torch.manual_seed(seed_val+222)
torch.cuda.manual_seed_all(seed_val+222)


# Function for formatting elapsed times
def format_time(elapsed):
    """Format a duration given in seconds as the string form of a timedelta.

    The value is rounded to the nearest whole second first, e.g. 3661.4 -> '1:01:01'.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)


In [None]:
# Training loop
def train(model, loader, optimizer, scheduler, epochs, device):
    """Fine-tune `model` for `epochs` full passes over `loader`.

    Args:
        model: module called as ``model(input_ids, token_type_ids=...,
            attention_mask=..., labels=...)`` whose result exposes ``.loss``.
        loader: sized iterable of batches ordered as
            (input_ids, token_type_ids, attention_mask, labels).
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        epochs: number of passes over `loader`.
        device: device every batch tensor is moved to before the forward pass.

    Returns:
        list[float]: the average training loss of each epoch, in epoch order.
    """
    # Average loss per epoch, returned so the caller can plot or inspect it.
    loss_values = []
    for epoch_i in range(epochs):
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        # Start time of this epoch, used for the elapsed-time progress field.
        t0 = time.time()
        total_loss = 0

        # Put the model into training mode.
        model.train()

        for step, batch in enumerate(loader):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = str(datetime.timedelta(seconds=int(round(time.time() - t0))))
                # At this point `total_loss` holds exactly `step` losses
                # (batches 0..step-1), so that is the correct divisor.
                print('  Batch {:>1,}  of  {:>1,}.    Elapsed: {:}, Loss {:}'.format(
                    step, len(loader), elapsed, total_loss / step))

            # `batch` contains four tensors:
            #   [0]: input ids
            #   [1]: token_type_ids
            #   [2]: attention masks
            #   [3]: labels
            b_input_ids = batch[0].to(device)
            b_token_type_ids = batch[1].to(device)
            b_input_mask = batch[2].to(device)
            b_labels = batch[3].to(device)

            # Clear gradients left over from the previous step before the backward pass.
            model.zero_grad()

            # Supplying `labels` makes the model output carry the loss.
            outputs = model(b_input_ids, token_type_ids=b_token_type_ids,
                            attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss

            loss.backward()
            # Clip the global gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(loader)
        loss_values.append(avg_train_loss)
        print(f"Average training loss: {avg_train_loss:.2f}")

    return loss_values

In [None]:
def evaluation(model, loader, device):
    """Compute and print the classification accuracy of `model` over `loader`.

    Args:
        model: module called as ``model(input_ids, token_type_ids=...,
            attention_mask=...)`` whose result exposes ``.logits`` with the
            classes along dimension 1.
        loader: iterable of batches ordered as
            (input_ids, token_type_ids, attention_mask, labels). Batches of
            three tensors (input_ids, attention_mask, labels) are accepted as
            well, in which case no token_type_ids are passed to the model.
        device: device every batch tensor is moved to.

    Returns:
        float: correctly classified examples divided by all examples seen.

    Raises:
        ZeroDivisionError: if `loader` yields no examples.
    """
    model.eval()
    n_correct = 0
    n_examples = 0
    for batch in loader:
        # The loaders in this notebook yield four tensors per batch; unpacking
        # only three of them (as before) raised a ValueError.
        if len(batch) == 4:
            b_input_ids, b_token_type_ids, b_input_mask, b_labels = batch
            b_token_type_ids = b_token_type_ids.to(device)
        else:
            b_input_ids, b_input_mask, b_labels = batch
            b_token_type_ids = None
        b_input_ids = b_input_ids.to(device)
        b_input_mask = b_input_mask.to(device)
        b_labels = b_labels.to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=b_token_type_ids,
                            attention_mask=b_input_mask)

        predictions = outputs.logits.argmax(dim=1).reshape(-1)
        targets = b_labels.reshape(-1)

        # Count per example rather than averaging per-batch accuracies, so a
        # shorter final batch is not over-weighted.
        n_correct += (predictions == targets).sum().item()
        n_examples += targets.numel()

    # Report the final accuracy for this run
    avg_val_accuracy = n_correct / n_examples
    print("Accuracy: {0:.2f}".format(avg_val_accuracy))

    return avg_val_accuracy



In [None]:
# Save the model's state dictionary (weights only) to 'bert_finetuned_qnli.bin'
# in the current working directory. Loading it back requires building a model
# with the same architecture first and then calling load_state_dict on it.
torch.save(model.state_dict(), 'bert_finetuned_qnli.bin')

# Optionally, save the entire model (not recommended due to potential issues when loading)
# torch.save(model, 'bert_finetuned_qnli_full_model.pth')

In [None]:
# Load the tokenizer and model architecture as before.
# The saved weights come from the 'bert-base-cased' model created at the top of
# the notebook, so the same checkpoint name has to be used here: the uncased
# checkpoint has a different vocabulary, and its embedding matrix would not
# match the saved state dictionary.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)

# Pick the device first so the checkpoint can be mapped onto it while loading
# (this also lets weights saved on a GPU be restored on a CPU-only machine).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model's state dictionary
model.load_state_dict(torch.load('bert_finetuned_qnli.bin', map_location=device))

# Move model to GPU if available
model.to(device)

In [11]:
# Scratch check of how the '{:>2,}' and '{:}' format specs render in a progress line.
print('  Batch {:>2,}  of  {:>2,}.    Elapsed: {:}.'.format(3, 4 ,30))

  Batch  3  of   4.    Elapsed: 30.


In [None]:
# Generic Trainer boilerplate.
# NOTE(review): this cell cannot run as written — 'your_dataset' is a placeholder
# and the tokenization lambda expects a 'text' column. It also rebinds the
# notebook-level names `model`, `tokenizer` and `dataset`.
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset

# 1. Load the pre-trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# 2. Prepare your dataset
dataset = load_dataset('your_dataset')  # replace 'your_dataset' with your actual dataset
# Tokenize the 'text' column in batches, truncating and padding to the maximum length.
encoded_dataset = dataset.map(lambda examples: tokenizer(examples['text'], truncation=True, padding='max_length'), batched=True)

# 3. Define the training parameters
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

# 4. Train the model
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=encoded_dataset['train'],         # training dataset
    eval_dataset=encoded_dataset['validation'],             # evaluation dataset
)

trainer.train()

# 5. Evaluate the model
# No compute_metrics function is passed to the Trainer above.
trainer.evaluate()

In [None]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from datasets import load_dataset

# 1. Load the pre-trained model and tokenizer
# Note: this rebinds the notebook-level `model` and `tokenizer` to the *uncased*
# checkpoint, with a two-class classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# 2. Prepare your dataset
def encode_examples(example):
    """Tokenize the 'question'/'sentence' pair of `example` and attach its label.

    The pair is truncated and padded to exactly 512 tokens by the notebook-level
    `tokenizer`; the 'label' value is copied into the result under 'labels'.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    features = tokenizer(example['question'], example['sentence'], **tokenizer_options)
    features['labels'] = example['label']
    return features

# Assuming dataset is your dataset: it must provide the 'question', 'sentence'
# and 'label' columns read by encode_examples, and the 'train' / 'validation'
# splits used below. With batched=True, encode_examples is applied to groups of
# rows rather than one row at a time.
encoded_dataset = dataset.map(encode_examples, batched=True)

# 3. Define the training parameters
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

# 4. Train the model
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=encoded_dataset['train'],         # training dataset
    eval_dataset=encoded_dataset['validation'],             # evaluation dataset
)

trainer.train()

# 5. Evaluate the model
# No compute_metrics function is passed to the Trainer above.
trainer.evaluate()

In [None]:

def mask_tokens(inputs, tokenizer, mlm_probability=0.15):
    """Build masked-language-modelling inputs and labels from a batch of token ids.

    Each non-special token is selected with probability `mlm_probability`. Of
    the selected tokens, 80% are replaced by the mask token, 10% by a random
    token id and 10% are left unchanged.

    Args:
        inputs: 2-D integer tensor of token ids (one row per sequence). It is
            not modified; the function works on a copy.
        tokenizer: object providing ``get_special_tokens_mask``,
            ``convert_tokens_to_ids``, ``mask_token`` and ``len()``.
        mlm_probability: selection probability for each non-special token.

    Returns:
        ``(masked_inputs, labels)``: the corrupted copy of `inputs`, and a
        tensor holding the original id at every selected position and -100
        everywhere else.
    """
    # Work on a copy so the caller's tensor is not corrupted in place; reusing
    # the same encodings later would otherwise see already-masked ids.
    inputs = inputs.clone()
    labels = inputs.clone()
    # Create helper tensors on the same device as the ids so boolean indexing
    # also works for inputs that do not live on the CPU.
    device = labels.device

    # We sample a few tokens in each sequence for MLM training (with probability `mlm_probability`)
    probability_matrix = torch.full(labels.shape, mlm_probability, device=device)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    # Special tokens are never selected.
    probability_matrix.masked_fill_(
        torch.tensor(special_tokens_mask, dtype=torch.bool, device=device), value=0.0
    )
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # Replace 80% of masked tokens with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8, device=device)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # Replace 10% of masked tokens with random word: half of the 20% that were
    # selected but not replaced by [MASK].
    indices_random = (
        torch.bernoulli(torch.full(labels.shape, 0.5, device=device)).bool()
        & masked_indices
        & ~indices_replaced
    )
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long, device=device)
    inputs[indices_random] = random_words[indices_random]

    # The rest 10% of masked tokens are kept as their original token
    return inputs, labels

# Apply MLM masking to a batch of token ids.
# NOTE(review): `encodings` is not defined anywhere in the visible notebook — it
# needs to hold tokenizer output with an "input_ids" tensor before this cell can
# run; confirm where it is meant to come from.
inputs, labels = mask_tokens(encodings["input_ids"], tokenizer)
