In [1]:
!pip install transformers datasets evaluate rouge_score pandas scikit-learn

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 k

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForQuestionAnswering, pipeline
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from rouge_score import rouge_scorer
import nltk
from nltk.translate.meteor_score import meteor_score

# Ensure NLTK resources are downloaded
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:

# Load CSV files
train_df = pd.read_csv('https://raw.githubusercontent.com/viswa3024/aiml-capstone-project-email/main/question-answering/Dataset-2/train.csv')
dev_df = pd.read_csv('https://raw.githubusercontent.com/viswa3024/aiml-capstone-project-email/main/question-answering/Dataset-2/dev.csv')
test_df = pd.read_csv('https://raw.githubusercontent.com/viswa3024/aiml-capstone-project-email/main/question-answering/Dataset-2/test.csv')

# Convert DataFrames to Datasets
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)

In [4]:
print(train_dataset[0])
print(dev_dataset[0])
print(test_dataset[0])

{'question': 'What is the role of Siamese networks in domain adaptation?', 'answer': 'Siamese networks can be used to align the representations of source and target domains by minimizing the discrepancy between them, making the model more robust to domain shifts.'}
{'question': 'How do U-Nets skip connections contribute to better segmentation performance?', 'answer': 'Skip connections in U-Net preserve fine-grained spatial information and improving localization and detail in segmentation tasks.'}
{'question': 'What is feature selection?', 'answer': 'The process of selecting a subset of relevant features to improve model performance and reduce complexity.'}


In [14]:
import pandas as pd
from datasets import Dataset
from transformers import BartConfig

In [21]:
model_name = "facebook/bart-large"

config = BartConfig.from_pretrained(model_name)
# Ensure generation-specific parameters are not set in the training configuration
config.update({
    'early_stopping': True,
    'num_beams': 5,
    'no_repeat_ngram_size': 3,
    'forced_bos_token_id': 0,
    'forced_eos_token_id': 2
})


tokenizer = BartTokenizer.from_pretrained(model_name)
#model = BartForConditionalGeneration.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name, config=config)



In [22]:
# Tokenize the dataset
def preprocess_function(examples):
    inputs = examples["question"]
    targets = examples["answer"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [23]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
dev_dataset = dev_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1985 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

In [10]:
import evaluate

In [11]:
rouge_metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [19]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

In [29]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Set to "tensorboard" if using TensorBoard
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    #compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1572,0.738001
2,0.1884,0.641267
3,0.2789,0.58907
4,0.2219,0.625619
5,0.1592,0.647413
6,0.1262,0.685919
7,0.1075,0.715444
8,0.0786,0.751628
9,0.0713,0.760206
10,0.048,0.777197


Non-default generation parameters: {'early_stopping': True, 'num_beams': 5, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 5, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 5, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 5, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 5, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 5, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

TrainOutput(global_step=3735, training_loss=0.1066516766586457, metrics={'train_runtime': 5040.1123, 'train_samples_per_second': 5.908, 'train_steps_per_second': 0.741, 'total_flos': 8065692440985600.0, 'train_loss': 0.1066516766586457, 'epoch': 15.0})

In [30]:
import numpy as np

In [32]:
import json
import pandas as pd

def save_predictions_and_labels(test_dataset, trainer, file_prefix=''):
    predictions = []
    labels = []

    for batch in trainer.get_test_dataloader(test_dataset):
        outputs = trainer.model(**batch)
        logits = outputs.logits
        pred_ids = np.argmax(logits.detach().cpu().numpy(), axis=-1)

        batch_preds = pred_ids.tolist()
        batch_labels = batch['labels'].tolist()

        predictions.extend(batch_preds)
        labels.extend(batch_labels)

        # Save periodically to manage memory
        if len(predictions) % 1000 == 0:
            pd.DataFrame({'predictions': [json.dumps(p) for p in predictions], 'labels': [json.dumps(l) for l in labels]}).to_csv(f'{file_prefix}_predictions_labels.csv', index=False)

    # Final save
    pd.DataFrame({'predictions': [json.dumps(p) for p in predictions], 'labels': [json.dumps(l) for l in labels]}).to_csv(f'{file_prefix}_predictions_labels.csv', index=False)

save_predictions_and_labels(test_dataset, trainer, 'test')

In [33]:
import pandas as pd
import json
import evaluate

# Load ROUGE metric
rouge_metric = evaluate.load("rouge")

def compute_rouge_from_file(file_path):
    df = pd.read_csv(file_path)
    # Convert JSON strings back to lists
    decoded_preds = [json.loads(p) for p in df['predictions']]
    decoded_labels = [json.loads(l) for l in df['labels']]

    # Decode text
    decoded_preds_text = tokenizer.batch_decode(decoded_preds, skip_special_tokens=True)
    decoded_labels_text = tokenizer.batch_decode(decoded_labels, skip_special_tokens=True)

    return rouge_metric.compute(predictions=decoded_preds_text, references=decoded_labels_text)

# Compute ROUGE scores
rouge_scores = compute_rouge_from_file('test_predictions_labels.csv')

print("ROUGE Scores:")
print(rouge_scores)


ROUGE Scores:
{'rouge1': 0.5213163118398241, 'rouge2': 0.2832376853454525, 'rougeL': 0.48916521974584165, 'rougeLsum': 0.4894790327733146}


In [34]:
from transformers import GenerationConfig

# Define generation-specific parameters separately
generation_config = GenerationConfig(
    early_stopping=True,
    num_beams=5,
    no_repeat_ngram_size=3,
    forced_bos_token_id=0,
    forced_eos_token_id=2
)

In [35]:
def generate_predictions(test_dataset, trainer):
    predictions = []
    for batch in trainer.get_test_dataloader(test_dataset):
        outputs = trainer.model.generate(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],  generation_config=generation_config)
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(decoded_preds)
    return predictions

# Get final output
final_output = generate_predictions(test_dataset, trainer)



In [36]:
test_df["Generated"] = final_output
test_df.to_csv('bart_large_qna_dataset_2_v1.csv')

In [37]:
df_generated = pd.read_csv('bart_large_qna_dataset_2_v1.csv')

In [38]:
df_generated.head(10)

Unnamed: 0.1,Unnamed: 0,question,answer,Generated
0,0,What is feature selection?,The process of selecting a subset of relevant ...,The process of selecting a subset of relevant ...
1,1,"What are Markov decision processes (MDP), and ...",MDPs are mathematical frameworks for modeling ...,MDPs are a type of neural network architecture...
2,2,What is batch normalization and how does it work?,Normalizes the layer's input over a mini-batch,Batch normalization normalizes the inputs to e...
3,3,How does Word2Vec create word embeddings?,Word2Vec creates word embeddings by training o...,Word2Vec uses neural networks to learn vector ...
4,4,What is the role of hidden layers in MLP?,Hidden layers in MLP help capture complex patt...,Hidden layers transform inputs into higher-lev...
5,5,What is a Decision Tree?,A Decision Tree is a predictive learning algor...,A Decision Tree is a supervised learning algor...
6,6,What is variance in machine learning?,Variance refers to the error introduced by the...,The variability of a model's predictions acros...
7,7,What are common methods to diagnose bias and v...,Common methods include analyzing learning curv...,Common methods to diagnose bias and variance i...
8,8,What is the difference between L1 and L2 regul...,L1 regularization adds the absolute values of ...,L1 regularization penalizes the sum of squared...
9,9,What is feature engineering for unstructured d...,Feature engineering for unstructured data invo...,Feature engineering for unstructured data invo...


In [39]:
model.save_pretrained("bart_large_qna_dataset_2_v1")
tokenizer.save_pretrained("bart_large_qna_dataset_2_v1")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 5, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('bart_large_qna_dataset_2_v1/tokenizer_config.json',
 'bart_large_qna_dataset_2_v1/special_tokens_map.json',
 'bart_large_qna_dataset_2_v1/vocab.json',
 'bart_large_qna_dataset_2_v1/merges.txt',
 'bart_large_qna_dataset_2_v1/added_tokens.json')

In [40]:
from huggingface_hub import notebook_login

In [41]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [42]:
repo_name="bart_large_qna_dataset_2_v1"

model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 5, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kkasiviswanath/bart_large_qna_dataset_2_v1/commit/b624294944d5e9f850aef0ad999f1b8314656e0d', commit_message='Upload tokenizer', commit_description='', oid='b624294944d5e9f850aef0ad999f1b8314656e0d', pr_url=None, pr_revision=None, pr_num=None)

In [43]:
from transformers import pipeline

In [46]:

model_name = 'kkasiviswanath/bart_large_qna_dataset_2_v1'

# Load the model and tokenizer using the pipeline
qa_pipeline = pipeline('text2text-generation', model=model_name, tokenizer=model_name)

# Define a question
question = "Can we utilize an autoencoder to perform dimensionality reduction on numerical datasets?"

# Define your custom generation parameters
generation_kwargs = {
    'num_beams': 5,
    'no_repeat_ngram_size': 3,
    'early_stopping': True,
    'forced_bos_token_id': 0,
    'forced_eos_token_id': 2,
    'temperature':1.0,
    'top_p':0.9,
    'max_length': 100  # You can also set other parameters like max_length here
}

# Generate a prediction
result = qa_pipeline(question, **generation_kwargs)

# Print the result
print(result)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'generated_text': 'Yes, we can use an autoencoder to perform dimensionality reduction on numerical datasets by encoding the data into a lower-dimensional code layer.'}]


In [47]:
# Define a question
question = "How does Word2Vec create word embeddings?"

# Generate a prediction
result = qa_pipeline(question)

# Print the result
print(result)

[{'generated_text': 'Word2Vec uses neural networks to learn vector representations of words based on their context'}]


In [49]:
# Define your question
question = "How does Word2Vec create word embeddings"

# Generate the answer with adjusted parameters
result = qa_pipeline(question, max_length=100, num_beams=5, temperature=1.0, top_p=0.9)

# Output the result
print(f"Question: {question}")
print(f"Answer: {result[0]['generated_text']}")

Question: How does Word2Vec create word embeddings
Answer: Word2Vec creates word embeddings using vector representations of the words.


In [50]:
# Define your question
question = "Can we utilize an autoencoder to perform dimensionality reduction on numerical datasets?"

# Generate the answer with adjusted parameters
result = qa_pipeline(question, max_length=100, num_beams=5, temperature=1.0, top_p=0.9)

# Output the result
print(f"Question: {question}")
print(f"Answer: {result[0]['generated_text']}")

Question: Can we utilize an autoencoder to perform dimensionality reduction on numerical datasets?
Answer: Yes, we can use an autoencoder to perform dimensionality reduction on numerical datasets by encoding the data into a lower-dimensional code layer.


In [51]:
# Define your question
question = "What is the role of hidden layers in MLP?"

# Generate the answer with adjusted parameters
result = qa_pipeline(question, max_length=100, num_beams=5, temperature=1.0, top_p=0.9)

# Output the result
print(f"Question: {question}")
print(f"Answer: {result[0]['generated_text']}")

Question: What is the role of hidden layers in MLP?
Answer: Hidden layers transform inputs into higher-level representations, allowing MLPs to learn complex relationships in the data.
