In [1]:
!pip install transformers datasets evaluate rouge_score pandas scikit-learn

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForQuestionAnswering, pipeline
from rouge_score import rouge_scorer
import nltk
from nltk.translate.meteor_score import meteor_score

# Ensure NLTK resources are downloaded
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:

# Load CSV files
train_df = pd.read_csv('https://raw.githubusercontent.com/viswa3024/aiml-capstone-project-email/main/question-answering/Dataset-2/train.csv')
dev_df = pd.read_csv('https://raw.githubusercontent.com/viswa3024/aiml-capstone-project-email/main/question-answering/Dataset-2/dev.csv')
test_df = pd.read_csv('https://raw.githubusercontent.com/viswa3024/aiml-capstone-project-email/main/question-answering/Dataset-2/test.csv')

# Convert DataFrames to Datasets
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)

In [4]:
print(train_dataset[0])
print(dev_dataset[0])
print(test_dataset[0])

{'question': 'What is the role of Siamese networks in domain adaptation?', 'answer': 'Siamese networks can be used to align the representations of source and target domains by minimizing the discrepancy between them, making the model more robust to domain shifts.'}
{'question': 'How do U-Nets skip connections contribute to better segmentation performance?', 'answer': 'Skip connections in U-Net preserve fine-grained spatial information and improving localization and detail in segmentation tasks.'}
{'question': 'What is feature selection?', 'answer': 'The process of selecting a subset of relevant features to improve model performance and reduce complexity.'}


In [5]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

In [6]:
# Load tokenizer and model
model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:01<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
# Tokenize the dataset
def preprocess_function(examples):
    inputs = examples["question"]
    targets = examples["answer"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [8]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
dev_dataset = dev_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1985 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

Map:   0%|          | 0/249 [00:00<?, ? examples/s]

In [9]:
import evaluate

In [10]:
rouge_metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)

In [12]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",  # Set to "tensorboard" if using TensorBoard
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    #compute_metrics=compute_metrics
)

# Fine-tune the model
trainer.train()



Epoch,Training Loss,Validation Loss
1,0.8142,0.679704
2,0.6762,0.635894
3,0.6405,0.612548
4,0.643,0.599981
5,0.5489,0.587778
6,0.5267,0.582299
7,0.5439,0.576777
8,0.4856,0.572881
9,0.5119,0.570709
10,0.4748,0.567497


TrainOutput(global_step=3735, training_loss=0.7670938555654911, metrics={'train_runtime': 2718.8243, 'train_samples_per_second': 10.951, 'train_steps_per_second': 1.374, 'total_flos': 5097162542284800.0, 'train_loss': 0.7670938555654911, 'epoch': 15.0})

In [13]:
import numpy as np

In [14]:
import json
import pandas as pd

def save_predictions_and_labels(test_dataset, trainer, file_prefix=''):
    predictions = []
    labels = []

    for batch in trainer.get_test_dataloader(test_dataset):
        outputs = trainer.model(**batch)
        logits = outputs.logits
        pred_ids = np.argmax(logits.detach().cpu().numpy(), axis=-1)

        batch_preds = pred_ids.tolist()
        batch_labels = batch['labels'].tolist()

        predictions.extend(batch_preds)
        labels.extend(batch_labels)

        # Save periodically to manage memory
        if len(predictions) % 1000 == 0:
            pd.DataFrame({'predictions': [json.dumps(p) for p in predictions], 'labels': [json.dumps(l) for l in labels]}).to_csv(f'{file_prefix}_predictions_labels.csv', index=False)

    # Final save
    pd.DataFrame({'predictions': [json.dumps(p) for p in predictions], 'labels': [json.dumps(l) for l in labels]}).to_csv(f'{file_prefix}_predictions_labels.csv', index=False)

save_predictions_and_labels(test_dataset, trainer, 'test')


In [15]:
import pandas as pd
import json
import evaluate
from transformers import T5Tokenizer



# Load ROUGE metric
rouge_metric = evaluate.load("rouge")

def compute_rouge_from_file(file_path):
    df = pd.read_csv(file_path)
    # Convert JSON strings back to lists
    decoded_preds = [json.loads(p) for p in df['predictions']]
    decoded_labels = [json.loads(l) for l in df['labels']]

    # Decode text
    decoded_preds_text = tokenizer.batch_decode(decoded_preds, skip_special_tokens=True)
    decoded_labels_text = tokenizer.batch_decode(decoded_labels, skip_special_tokens=True)

    return rouge_metric.compute(predictions=decoded_preds_text, references=decoded_labels_text)

# Compute ROUGE scores
rouge_scores = compute_rouge_from_file('test_predictions_labels.csv')

print("ROUGE Scores:")
print(rouge_scores)


ROUGE Scores:
{'rouge1': 0.5128102388172548, 'rouge2': 0.24481161454956896, 'rougeL': 0.4832106540375619, 'rougeLsum': 0.4838495903347899}


In [16]:
# Generate predictions
def generate_predictions(test_dataset, trainer):
    predictions = []
    for batch in trainer.get_test_dataloader(test_dataset):
        outputs = trainer.model.generate(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        predictions.extend(decoded_preds)
    return predictions

# Get final output
final_output = generate_predictions(test_dataset, trainer)



In [17]:
test_df["Generated"] = final_output
test_df.to_csv('google_flan_t5_base_qna_dataset_2_v1.csv')

In [18]:
df_generated = pd.read_csv('google_flan_t5_base_qna_dataset_2_v1.csv')

In [19]:
df_generated.head(10)

Unnamed: 0.1,Unnamed: 0,question,answer,Generated
0,0,What is feature selection?,The process of selecting a subset of relevant ...,Feature selection is the process of selecting ...
1,1,"What are Markov decision processes (MDP), and ...",MDPs are mathematical frameworks for modeling ...,MDPs are a type of decision-making algorithm t...
2,2,What is batch normalization and how does it work?,Normalizes the layer's input over a mini-batch,Batch normalization normalizes the inputs of a...
3,3,How does Word2Vec create word embeddings?,Word2Vec creates word embeddings by training o...,Word2Vec creates word embeddings by capturing ...
4,4,What is the role of hidden layers in MLP?,Hidden layers in MLP help capture complex patt...,Hidden layers in MLP allow the model to learn ...
5,5,What is a Decision Tree?,A Decision Tree is a predictive learning algor...,A Decision Tree is a type of classification al...
6,6,What is variance in machine learning?,Variance refers to the error introduced by the...,Variance is the ratio of predictors to the tot...
7,7,What are common methods to diagnose bias and v...,Common methods include analyzing learning curv...,Common methods include comparing model perform...
8,8,What is the difference between L1 and L2 regul...,L1 regularization adds the absolute values of ...,L1 regularization regularizes the input sequen...
9,9,What is feature engineering for unstructured d...,Feature engineering for unstructured data invo...,Feature engineering for unstructured data invo...


In [20]:
model.save_pretrained("google_flan_t5_base_qna_dataset_2_v1")
tokenizer.save_pretrained("google_flan_t5_base_qna_dataset_2_v1")

('google_flan_t5_base_qna_dataset_2_v1/tokenizer_config.json',
 'google_flan_t5_base_qna_dataset_2_v1/special_tokens_map.json',
 'google_flan_t5_base_qna_dataset_2_v1/spiece.model',
 'google_flan_t5_base_qna_dataset_2_v1/added_tokens.json')

In [21]:
from huggingface_hub import notebook_login

In [22]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
repo_name="google_flan_t5_base_qna_dataset_2_v1"

model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kkasiviswanath/google_flan_t5_base_qna_dataset_2_v1/commit/9732fb369dbf6770511b6f7e43819e3d1790208f', commit_message='Upload tokenizer', commit_description='', oid='9732fb369dbf6770511b6f7e43819e3d1790208f', pr_url=None, pr_revision=None, pr_num=None)

In [25]:
from transformers import pipeline

In [26]:

model_name = 'kkasiviswanath/google_flan_t5_small_qna_dataset_2_v1'

# Load the model and tokenizer using the pipeline
qa_pipeline = pipeline('text2text-generation', model=model_name, tokenizer=model_name)

# Define a question
question = "Can we utilize an autoencoder to perform dimensionality reduction on numerical datasets?"

# Generate a prediction
result = qa_pipeline(question)

# Print the result
print(result)

config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'generated_text': 'Autoencoders can be used to reduce the dimensionality of numerical datasets by'}]


In [27]:
# Define a question
question = "How does Word2Vec create word embeddings?"

# Generate a prediction
result = qa_pipeline(question)

# Print the result
print(result)

[{'generated_text': 'Word2Vec creates word embeddings using a vector based on'}]


In [28]:
from transformers import pipeline

# Define the model path
model_name = 'kkasiviswanath/google_flan_t5_base_qna_dataset_2_v1'

# Load the QA pipeline with your model
qa_pipeline = pipeline('text2text-generation', model=model_name, tokenizer=model_name)

# Define your question
question = "How does Word2Vec create word embeddings"

# Generate the answer with adjusted parameters
result = qa_pipeline(question, max_length=100, num_beams=5, temperature=1.0, top_p=0.9)

# Output the result
print(f"Question: {question}")
print(f"Answer: {result[0]['generated_text']}")


config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Question: How does Word2Vec create word embeddings
Answer: Word2Vec creates word embeddings by capturing the semantic relationships between words.


In [29]:
# Define your question
question = "Can we utilize an autoencoder to perform dimensionality reduction on numerical datasets?"

# Generate the answer with adjusted parameters
result = qa_pipeline(question, max_length=100, num_beams=5, temperature=1.0, top_p=0.9)

# Output the result
print(f"Question: {question}")
print(f"Answer: {result[0]['generated_text']}")

Question: Can we utilize an autoencoder to perform dimensionality reduction on numerical datasets?
Answer: Yes, we can use an autoencoder to perform dimensionality reduction on numerical datasets by encoding the input data into a lower-dimensional representation, reducing the dimensionality of the data.


In [30]:
# Define your question
question = "What is the role of hidden layers in MLP?"

# Generate the answer with adjusted parameters
result = qa_pipeline(question, max_length=100, num_beams=5, temperature=1.0, top_p=0.9)

# Output the result
print(f"Question: {question}")
print(f"Answer: {result[0]['generated_text']}")

Question: What is the role of hidden layers in MLP?
Answer: Hidden layers in MLP allow the model to capture hidden patterns in the data, allowing it to learn complex relationships between data points.
