In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
FOLDER_PATH = '/content/drive/My Drive/Colab Notebooks/nlc/project'

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math


USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

### Loading Datasets

In [4]:
import pandas as pd

# read from csv
df = pd.read_csv(os.path.join(FOLDER_PATH, 'combined_data.csv'))
df

Unnamed: 0,Question,Answer
0,how does randomised algorithm work,the algorithm typically uses uniformly random ...
1,what do you mean by bestfirst search,bestfirst search is a search algorithm which e...
2,how do you explain a daemon,daemon disk and execution monitor is a process...
3,what is phonetic algorithm,a phonetic algorithm is an algorithm for index...
4,what do you mean by uniform costsearch,a tree search that finds the lowestcost route ...
...,...,...
3770,explain biasvariance tradeoff,biasvariance tradeoff is a concept in machine ...
3771,what is stochastic gradient descent sgd in mac...,stochastic gradient descent sgd is an optimiza...
3772,explain stochastic gradient descent,stochastic gradient descent sgd is an optimiza...
3773,what is the backpropagation algorithm in machi...,the backpropagation algorithm is a widely used...


In [5]:
# convert the df to 2d list
qa_pairs = df.values.tolist()
len(qa_pairs), qa_pairs[0]

(3775,
 ['how does randomised algorithm work',
  'the algorithm typically uses uniformly random bits as an auxiliary input to guide its behavior in the hope of achieving good performance in the average case over all possible choices of random bits'])

### Functions

In [6]:
# Default word tokens
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token
UNK_token = 3  # Unknown word token

class Voc:
    def __init__(self, name):
        self.name = name
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS", UNK_token: "UNK"}
        self.num_words = 4  # Count SOS, EOS, PAD, UNK

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS", UNK_token: "UNK"}
        self.num_words = 4 # Count default tokens

        for word in keep_words:
            self.addWord(word)

In [7]:
# Turn a Unicode string to plain ASCII, thanks to
# http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

# Returns True iff both sentences in a pair 'p' are under the MAX_LENGTH threshold
def filterPair(p, max_length):
    # Input sequences need to preserve the last word for EOS token
    return len(p[0].split(' ')) < max_length and len(p[1].split(' ')) < max_length

# Filter pairs using filterPair condition
def filterPairs(pairs, max_length):
    return [pair for pair in pairs if filterPair(pair, max_length)]


In [8]:
voc = Voc("qa")
MAX_LENGTH = 50  # Maximum sentence length to consider

# filter pairs based on MAX_LENGTH
print(f'before filtering, no. of qa pairs: {len(qa_pairs)}')
qa_pairs = filterPairs(qa_pairs, MAX_LENGTH)
print(f'after filtering, no. of qa pairs: {len(qa_pairs)}')

before filtering, no. of qa pairs: 3775
after filtering, no. of qa pairs: 3145


In [9]:
# normalise each qa pair and add word from question and answer into vocab
for i in range(len(qa_pairs)):
    question, answer = qa_pairs[i]
    question = normalizeString(question)
    answer = normalizeString(answer)
    voc.addSentence(question)
    voc.addSentence(answer)
    qa_pairs[i] = [question, answer]

In [10]:
vocab_size = voc.num_words # words + 3 tokens (PAD, SOS, EOS)
vocab_size

4980

In [11]:
qa_pairs[:5]

[['how does randomised algorithm work',
  'the algorithm typically uses uniformly random bits as an auxiliary input to guide its behavior in the hope of achieving good performance in the average case over all possible choices of random bits'],
 ['what do you mean by bestfirst search',
  'bestfirst search is a search algorithm which explores a graph by expanding the most promising node chosen according to a specified rule'],
 ['how do you explain a daemon',
  'daemon disk and execution monitor is a process that runs in the background without users interaction they usually start at the booting time and terminate when the system is shut down'],
 ['what is phonetic algorithm',
  'a phonetic algorithm is an algorithm for indexing of words by their pronunciation'],
 ['what do you mean by uniform costsearch',
  'a tree search that finds the lowestcost route where costs vary']]

In [12]:
questions = [pair[0] for pair in qa_pairs]
answers = [pair[1] for pair in qa_pairs]
len(questions), len(answers)

(3145, 3145)

## Transformer Model Architecture

In [13]:
!pip install transformers



In [14]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch

class GPT2QA:
    """GPT-2 Model for Question Answering."""

    def __init__(self, model_name="gpt2"):
        self.tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def generate_response(model, input_text):
        """Generate a response using the GPT-2 model with appropriate configurations."""
        inputs = model.tokenizer.encode(input_text, return_tensors="pt").to(model.device)

        # Generate response with the appropriate configurations
        outputs = model.model.generate(
            inputs,
            max_length=150,
            do_sample=True,  # Enable sampling for using top_p
            top_k=50,
            top_p=0.95,
            num_beams=2,  # Use beam search to leverage early stopping
            early_stopping=True,
            pad_token_id=model.tokenizer.eos_token_id  # Set padding token id
        )

        response = model.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response.strip()

### Training

In [15]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import time

class QADataset(Dataset):
    """Dataset class for question-answer pairs."""

    def __init__(self, qa_pairs, tokenizer, max_length=50):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        question, answer = self.qa_pairs[idx]
        inputs = self.tokenizer.encode_plus(
            question,
            answer,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': inputs['input_ids'].squeeze()  # For supervised training, labels are the same as input_ids
        }

def train_model(qa_pairs, model, batch_size=8, num_epochs=3):
    """Train the GPT-2 model on the question-answer pairs."""

    tokenizer = model.tokenizer
    dataset = QADataset(qa_pairs, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    optimizer = AdamW(model.model.parameters(), lr=5e-5)

    model.model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            labels = batch['labels'].to(model.device)

            # Start timing for total training time
            start_time = time.time()

            # Forward pass
            outputs = model.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # End timing for current batch
            end_time = time.time()
            print(f"Time taken for batch: {end_time - start_time:.4f} seconds")

        # Average loss for the epoch
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    # Save the fine-tuned model
    model.model.save_pretrained("fine_tuned_gpt2")
    model.tokenizer.save_pretrained("fine_tuned_gpt2")


### Run model

In [16]:
import pandas as pd
from torch.utils.data import DataLoader
from torch.optim import AdamW
import time

def load_data(file_path):
    """Load and preprocess the QA dataset."""
    df = pd.read_csv(file_path)
    qa_pairs = df.values.tolist()
    # Normalize each pair
    qa_pairs = [(q.lower().strip(), a.lower().strip()) for q, a in qa_pairs]
    return qa_pairs

def main():
    # Load the QA data
    file_path = '/content/drive/My Drive/Colab Notebooks/nlc/project/combined_data.csv'
    qa_pairs = load_data(file_path)

    # Initialize the model
    gpt2_qa_model = GPT2QA()

    # Add a padding token if not already set
    if gpt2_qa_model.tokenizer.pad_token is None:
        gpt2_qa_model.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        gpt2_qa_model.model.resize_token_embeddings(len(gpt2_qa_model.tokenizer))

    # Create DataLoader
    dataset = QADataset(qa_pairs, gpt2_qa_model.tokenizer)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

    # Set up the optimizer
    optimizer = AdamW(gpt2_qa_model.model.parameters(), lr=5e-5)

    # Training loop
    gpt2_qa_model.model.train()  # Ensure model is in training mode
    num_epochs = 3

    for epoch in range(num_epochs):
        total_loss = 0
        start_epoch_time = time.time()  # Track epoch time

        for batch in dataloader:
            optimizer.zero_grad()  # Reset gradients

            input_ids = batch['input_ids'].to(gpt2_qa_model.device)
            attention_mask = batch['attention_mask'].to(gpt2_qa_model.device)
            labels = batch['labels'].to(gpt2_qa_model.device)

            # Start timing for total training time
            start_time = time.time()

            # Forward pass
            outputs = gpt2_qa_model.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # End timing for current batch
            end_time = time.time()
            print(f"Time taken for batch: {end_time - start_time:.4f} seconds")

        # Average loss for the epoch
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    # Save the fine-tuned model
    gpt2_qa_model.model.save_pretrained("fine_tuned_gpt2")
    gpt2_qa_model.tokenizer.save_pretrained("fine_tuned_gpt2")

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Time taken for batch: 1.8050 seconds
Time taken for batch: 0.1908 seconds
Time taken for batch: 0.1767 seconds
Time taken for batch: 0.1424 seconds
Time taken for batch: 0.1436 seconds
Time taken for batch: 0.1425 seconds
Time taken for batch: 0.1417 seconds
Time taken for batch: 0.1369 seconds
Time taken for batch: 0.1411 seconds
Time taken for batch: 0.1443 seconds
Time taken for batch: 0.1421 seconds
Time taken for batch: 0.1426 seconds
Time taken for batch: 0.1451 seconds
Time taken for batch: 0.1430 seconds
Time taken for batch: 0.1562 seconds
Time taken for batch: 0.1446 seconds
Time taken for batch: 0.1595 seconds
Time taken for batch: 0.1533 seconds
Time taken for batch: 0.1483 seconds
Time taken for batch: 0.1466 seconds
Time taken for batch: 0.1444 seconds
Time taken for batch: 0.1455 seconds
Time taken for batch: 0.1434 seconds
Time taken for batch: 0.1405 seconds
Time taken for batch: 0.1418 seconds
Time taken for batch: 0.1407 seconds
Time taken for batch: 0.1642 seconds
T

### Fine-tuning

In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, AdamW

class CustomQADataset(Dataset):
    def __init__(self, qa_pairs, tokenizer, max_length=100):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        question, answer = self.qa_pairs[idx]
        text = f"Question: {question} Answer: {answer}"
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': inputs['input_ids'].squeeze()
        }

def fine_tune_gpt2(qa_pairs, model_name="gpt2", num_epochs=3, batch_size=8, learning_rate=5e-5):
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Set the padding token
    tokenizer.pad_token = tokenizer.eos_token

    # Prepare dataset and dataloader
    dataset = CustomQADataset(qa_pairs, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Set up the optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    # Save the fine-tuned model
    model.save_pretrained("fine_tuned_gpt2")
    tokenizer.save_pretrained("fine_tuned_gpt2")

if __name__ == "__main__":
    fine_tune_gpt2(qa_pairs)



Epoch 1/3, Average Loss: 1.2268
Epoch 2/3, Average Loss: 0.8775
Epoch 3/3, Average Loss: 0.6685


In [19]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

def load_fine_tuned_model(model_dir="fine_tuned_gpt2"):
    """Load the fine-tuned model and tokenizer."""
    tokenizer = GPT2TokenizerFast.from_pretrained(model_dir)
    model = GPT2LMHeadModel.from_pretrained(model_dir)
    model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    return model, tokenizer

def generate_response(model, tokenizer, input_text):
    """Generate a response using the fine-tuned GPT-2 model."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Alter the input to guide the model toward a more appropriate response
    prompt = f"Q: {input_text}\nA:"
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

    outputs = model.generate(
        inputs,
        max_length=150,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Process the response to remove any leading prompts, if necessary
    response = response.split("A:", 1)[-1].strip()
    return response

### Begin Interaction

In [30]:
def chat():
    model, tokenizer = load_fine_tuned_model()

    print("What can I answer for you today? Type 'q' or 'quit' to exit.")
    while True:
        user_input = input("You: ")

        if user_input.lower() in ['q', 'quit']:
            print("Exiting chatbot! All the best for your job search :)")
            break

        user_input = user_input.strip()  # Simple text normalization
        response = generate_response(model, tokenizer, user_input)
        print("Bot:", response)

if __name__ == "__main__":
    chat()

What can I answer for you today? Type 'q' or 'quit' to exit.
You: define recursion
Bot: recursive algorithm is one that invokes some or all of its outer nodes in a recursive program
You: please explain recursion
Bot: recursion is a recursive algorithm used to solve a class of computational problems by enclosing an accumulator with an upper bound on the number of possible solutions that may be made by taking the solution of a recursion as a whole and dividing it into parts to solve the bigger problem
You: what is a decision tree?
Bot: a decision tree is a tree which represents a decision in terms of its children
You: explain binary search
Bot: binary search is a search algorithm that finds a shortest path between two vertices or edges using binary search algorithms often used in computer graphics and cryptography
You: which furniture would you describe yourself as?
Bot: a computer programmer
You: q
Exiting chatbot! All the best for your job search :)


#### BERT Score evaluation

In [None]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert_score
Successfully installed bert_score-0.3.13


In [32]:
import torch
from bert_score import score as bert_score
import random
from transformers import logging

# Set logging level to suppress informational logs
logging.set_verbosity_error()  # Only errors will be shown

model, tokenizer = load_fine_tuned_model()
generated_responses = []
all_results = []  # For storing results

for question, answer in zip(questions, answers):
    generated_response = generate_response(model, tokenizer, question)
    generated_responses.append(generated_response)

    # Prepare candidates and references for BERTScore calculation
    candidates = [generated_response]  # Generated response
    references = [answer]              # Ground truth answer

    # Calculate BERTScore
    P, R, F1 = bert_score(candidates, references, lang='en', device='cuda' if torch.cuda.is_available() else 'cpu')

    # Append results to the list
    all_results.append({
        "Question": question,
        "Ground Truth": answer,
        "Generated Response": generated_response,
        "Precision": P.mean().item(),
        "Recall": R.mean().item(),
        "F1": F1.mean().item()
    })

# Create a DataFrame for results
results_df = pd.DataFrame(all_results)

# Save the DataFrame to a CSV file
results_df.to_csv('/content/drive/My Drive/Colab Notebooks/nlc/project/gpt2_bertscores.csv', index=False)

# Calculate average BERTScores
avg_precision = results_df["Precision"].mean()
avg_recall = results_df["Recall"].mean()
avg_f1 = results_df["F1"].mean()

# Print average scores
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]



Average Precision: 0.8790
Average Recall: 0.8724
Average F1 Score: 0.8755
