In [None]:
# Install required libraries
!pip install torch
!pip install transformers
!pip install datasets

In [None]:
# Load the FLAN-T5 model and its tokenizer; the model is placed on the GPU when one is available
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "google/flan-t5-base"

# Fall back to the CPU so this cell does not crash on machines without CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Check if the tokenizer works by testing it on a sample text
sample_text = "I am testing"
tokens = tokenizer(sample_text)
print(tokens)


In [None]:
# Fetch SQuAD and keep a fixed-size random subset of its training split
from datasets import load_dataset

ds = load_dataset("rajpurkar/squad")

# Shuffle with a fixed seed, then keep the first 10,000 rows
# (change the range to grow or shrink the subset)
shuffled_train = ds["train"].shuffle(seed=42)
train_ds = shuffled_train.select(range(10000))


In [None]:
# Define the preprocess function
def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and their first answers.

    Args:
        examples: A batch (dict of columns) as passed by ``Dataset.map`` with
            ``batched=True``. The 'question', 'context' and 'answers' columns
            are read; each entry of 'answers' must expose a 'text' list.

    Returns:
        The tokenizer output for the (question, context) pairs, padded or
        truncated to 512 tokens, with an extra 'labels' entry holding the
        token ids of the first answer (padded or truncated to 128 tokens).
        Padding positions in 'labels' are set to -100.
    """
    # Tokenize the question together with its context
    inputs = tokenizer(
        examples['question'],
        examples['context'],
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors="pt"
    )

    # Keep only the first reference answer; use "" when an example has none
    first_answers = [
        answer['text'][0] if len(answer['text']) > 0 else ""
        for answer in examples['answers']
    ]

    # Tokenize the answers
    labels = tokenizer(
        first_answers,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt"
    )['input_ids']

    # Replace padding ids with -100 so the cross-entropy loss ignores them;
    # otherwise most of the training signal comes from predicting padding.
    labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)

    inputs['labels'] = labels
    return inputs

# Run the preprocessing over the reduced training subset, one batch of rows at a time
tokenized_ds = train_ds.map(function=preprocess_function, batched=True)


In [None]:
# Split the tokenized dataset into training and validation sets (80% train, 20% validation).
# The seed is fixed so the same rows end up in each split on every run.
train_test_split = tokenized_ds.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']


In [None]:
import inspect

from transformers import TrainingArguments

# The per-epoch evaluation switch was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Pick whichever name the
# installed version accepts so this cell works on both sides of the rename.
eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",  # Save model at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,  # Load the best checkpoint after training
    **{eval_strategy_arg: "epoch"},  # Evaluate at the end of each epoch (must match save_strategy)
)


In [None]:
from transformers import Trainer

# Collect everything the trainer needs: the model, the training
# configuration, and the train/validation splits
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer = Trainer(**trainer_kwargs)

# Run fine-tuning
trainer.train()


In [None]:
# Function to generate an answer based on a context and question
def generate_answer(context, question, max_new_tokens=64):
    """Generate an answer to `question` from `context` with the fine-tuned model.

    Args:
        context: Passage that contains the answer.
        question: Question to answer from the passage.
        max_new_tokens: Upper bound on the number of generated tokens. Set
            explicitly so longer answers are not cut off by the short
            default generation length.

    Returns:
        The decoded answer string with special tokens removed.
    """
    # Send the inputs to whichever device the model lives on instead of
    # assuming CUDA, so this also works when the model is on the CPU.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        padding=True,
    ).to(model.device)

    # Pass the full encoding so the attention mask reaches generate() too
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Sample passage and a question it can answer
context = (
    "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. "
    "It was constructed between 1887 and 1889 as the entrance arch for the 1889 World's Fair."
)
question = "When was the Eiffel Tower constructed?"

# Ask the model and show the question next to its answer
answer = generate_answer(context=context, question=question)
print("Question:", question)
print("Answer:", answer)


In [None]:
# Score the model on the held-out validation split and show the metrics
metrics = trainer.evaluate(eval_dataset=val_dataset)
print(metrics)


In [None]:
# Write the fine-tuned model and its tokenizer side by side into one directory
TRAINED_MODEL_DIR = "./trained_model"
model.save_pretrained(TRAINED_MODEL_DIR)
tokenizer.save_pretrained(TRAINED_MODEL_DIR)


In [None]:
# Log in to the Hugging Face Hub from the command line so that the
# push_to_hub calls later in this notebook can authenticate.
!huggingface-cli login


In [None]:
# Upload the model and tokenizer to the same Hugging Face Hub repository
HUB_REPO_ID = "tootooba/flan-t5-qa-study-assistant"
model.push_to_hub(HUB_REPO_ID)
tokenizer.push_to_hub(HUB_REPO_ID)
