In [1]:
!pip install transformers datasets rouge_score sacrebleu nltk

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)

In [3]:
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForQuestionAnswering, pipeline
from rouge_score import rouge_scorer
import sacrebleu
import nltk
from nltk.translate.meteor_score import meteor_score

# Ensure NLTK resources are downloaded
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:

# Load CSV files
train_df = pd.read_csv('https://raw.githubusercontent.com/viswa3024/aiml-capstone-project-email/main/question-answering/Dataset-2/train.csv')
dev_df = pd.read_csv('https://raw.githubusercontent.com/viswa3024/aiml-capstone-project-email/main/question-answering/Dataset-2/dev.csv')
test_df = pd.read_csv('https://raw.githubusercontent.com/viswa3024/aiml-capstone-project-email/main/question-answering/Dataset-2/test.csv')

# Convert DataFrames to Datasets
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)

In [5]:
print(train_dataset[0])
print(dev_dataset[0])
print(test_dataset[0])

{'question': 'What is the role of Siamese networks in domain adaptation?', 'answer': 'Siamese networks can be used to align the representations of source and target domains by minimizing the discrepancy between them, making the model more robust to domain shifts.'}
{'question': 'How do U-Nets skip connections contribute to better segmentation performance?', 'answer': 'Skip connections in U-Net preserve fine-grained spatial information and improving localization and detail in segmentation tasks.'}
{'question': 'What is feature selection?', 'answer': 'The process of selecting a subset of relevant features to improve model performance and reduce complexity.'}


In [18]:
def preprocess_function(examples):
    # Tokenize input
    inputs = tokenizer(
        examples['question'],
        examples['answer'],
        truncation=True,
        padding='max_length'
    )

    # Dummy values for start and end positions (change as needed)
    inputs['start_positions'] = [0] * len(examples['question'])
    inputs['end_positions'] = [min(len(examples['answer']) - 1, 0)] * len(examples['question'])

    return inputs


In [19]:
from transformers import RobertaTokenizer

# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')



In [20]:


# Apply tokenization
train_dataset = train_dataset.map(preprocess_function, batched=True)
dev_dataset = dev_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/1985 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

In [21]:
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(dev_dataset)}")

Train dataset size: 1985
Validation dataset size: 248


In [22]:
from transformers import RobertaForQuestionAnswering, Trainer, TrainingArguments

In [25]:
from transformers import Trainer

class CustomTrainer(Trainer):
    def log(self, logs: dict) -> None:
        super().log(logs)
        if 'loss' in logs:
            print(f"Training Loss: {logs['loss']}")

# Initialize model
model = RobertaForQuestionAnswering.from_pretrained('roberta-base')

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',              # Directory for logs
    logging_steps=10,                  # Log every 10 steps
    evaluation_strategy="epoch",
    save_strategy="epoch",
    report_to='tensorboard',           # Optional: Log to TensorBoard
)

# Initialize Trainer with custom logging
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0001,3.7e-05
2,0.0001,2.1e-05
3,0.0001,1.9e-05


TrainOutput(global_step=747, training_loss=0.09240546192938943, metrics={'train_runtime': 676.6603, 'train_samples_per_second': 8.801, 'train_steps_per_second': 1.104, 'total_flos': 1556022186362880.0, 'train_loss': 0.09240546192938943, 'epoch': 3.0})

In [40]:
# Define the question-answering pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Generate predictions
predictions = []
references = []
for example in test_dataset:
    result = qa_pipeline(question=example['question'], context=example['answer'])
    predictions.append(result['answer'])
    references.append(example['answer'])  # Use the actual answer as reference

# Compute ROUGE scores
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
num_examples = len(predictions)

for idx in range(num_examples):
    reference = references[idx]
    prediction = predictions[idx]
    score = rouge.score(reference, prediction)
    for key in rouge_scores:
        rouge_scores[key] += score[key].fmeasure

# Average ROUGE scores
for key in rouge_scores:
    rouge_scores[key] /= num_examples

print("ROUGE Scores:", rouge_scores)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


ROUGE Scores: {'rouge1': 0.33395054544809144, 'rouge2': 0.2638204572310244, 'rougeL': 0.33395054544809144}


In [41]:
from transformers import pipeline
import pandas as pd
import sacrebleu
from rouge_score import rouge_scorer

# Initialize the question-answering pipeline with your trained model
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)


# Generate predictions
def generate_answers(question, context):
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Apply the function to the test dataset
test_df["Generated"] = test_df.apply(lambda row: generate_answers(row['question'], row['answer']), axis=1)

# Save the updated DataFrame to a CSV file
test_df.to_csv('robertabase_qna_dataset_2_v1.csv', index=False)

# Prepare data for BLEU score calculation
predictions = test_df["Generated"].tolist()
references = test_df["answer"].tolist()

# Tokenize text for BLEU calculation (split-based tokenizer)
def tokenize(text):
    return text.split()

# Convert texts to tokenized lists
# Note that sacrebleu expects references to be a list of lists of lists
tokenized_references = [[tokenize(ref)] for ref in references]  # Each reference is a list containing one list of tokens
tokenized_predictions = [tokenize(pred) for pred in predictions]  # Each prediction is a list of tokens



# Calculate ROUGE scores
# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE scores
def calculate_rouge_scores(predictions, references):
    scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    for pred, ref in zip(predictions, references):
        score = scorer.score(ref, pred)
        scores['rouge1'] += score['rouge1'].fmeasure
        scores['rouge2'] += score['rouge2'].fmeasure
        scores['rougeL'] += score['rougeL'].fmeasure
    num_samples = len(predictions)
    scores = {k: v / num_samples for k, v in scores.items()}
    return scores

rouge_scores = calculate_rouge_scores(predictions, references)
print("ROUGE Scores:", rouge_scores)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


ROUGE Scores: {'rouge1': 0.33395054544809144, 'rouge2': 0.2638204572310244, 'rougeL': 0.33395054544809144}


In [42]:
test_df.to_csv('robertabase_qna_dataset_2_v1.csv', index=False)

In [43]:
df_generated = pd.read_csv('/content/robertabase_qna_dataset_2_v1.csv')

In [44]:
df_generated.head(10)

Unnamed: 0,question,answer,Generated
0,What is feature selection?,The process of selecting a subset of relevant ...,process of selecting a subset of relevant feat...
1,"What are Markov decision processes (MDP), and ...",MDPs are mathematical frameworks for modeling ...,"decision-maker,"
2,What is batch normalization and how does it work?,Normalizes the layer's input over a mini-batch,layer's input
3,How does Word2Vec create word embeddings?,Word2Vec creates word embeddings by training o...,embeddings
4,What is the role of hidden layers in MLP?,Hidden layers in MLP help capture complex patt...,output
5,What is a Decision Tree?,A Decision Tree is a predictive learning algor...,decision-making by splitting data into branches
6,What is variance in machine learning?,Variance refers to the error introduced by the...,Variance
7,What are common methods to diagnose bias and v...,Common methods include analyzing learning curv...,cross-validation
8,What is the difference between L1 and L2 regul...,L1 regularization adds the absolute values of ...,regularization
9,What is feature engineering for unstructured d...,Feature engineering for unstructured data invo...,or
