In [1]:
! pip install torch transformers datasets tensorflow accelerate huggingface_hub



In [2]:
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
from datasets import load_dataset
import numpy as np
import os

In [3]:
# Load SQuAD v2 through the Hugging Face `datasets` loader.
dataset = load_dataset("squad_v2")

# Print the first record of the "train" split to inspect its fields.
print(dataset["train"][0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}


In [4]:
# Print the second record of the "train" split for comparison with the first.
print(dataset["train"][1])

{'id': '56be85543aeaaa14008c9065', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'What areas did Beyonce compete in when she was growing up?', 'answers': {'text': ['singing and dancing'], 'answer_start': [207]}}


In [5]:
def preprocess_answers(example):
    """Flatten the nested ``answers`` field into a plain ``answer`` string.

    The first entry of ``example["answers"]["text"]`` is stored under
    ``example["answer"]``; an empty list yields ``""``. The example is
    modified in place and returned.
    """
    texts = example["answers"]["text"]
    example["answer"] = texts[0] if texts else ""
    return example

# Add the flattened "answer" column by running preprocess_answers over the dataset.
# Note: this rebinds `dataset`, so re-running the cell maps the already-mapped data again.
dataset = dataset.map(preprocess_answers)


In [6]:
# Preview the first five training examples: question, context and raw answers dict.
for example in dataset["train"].select(range(5)):
    print(
        f"Question: {example['question']}",
        f"Context: {example['context']}",
        f"Answer: {example['answers']}\n",
        sep="\n",
    )


Question: When did Beyonce start becoming popular?
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Answer: {'text': ['in the late 1990s'], 'answer_start': [269]}

Question: What areas did Beyonce compete in when she was growing up?
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singe

In [7]:
# GPT-2 has no default padding token, so the end-of-sequence token is reused as padding.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Dry run on the first training example; empty/missing fields fall back to "".
example = dataset["train"][0]
question = "" if not example["question"] else str(example["question"])
context = "" if not example["context"] else str(example["context"])
answer = "" if not example["answers"]["text"] else example["answers"]["text"][0]

# Both the prompt and the answer are padded/truncated to 512 ids in this check.
inputs = tokenizer(
    f"{question} {context}",
    padding="max_length",
    truncation=True,
    max_length=512,
)
outputs = tokenizer(
    answer,
    padding="max_length",
    truncation=True,
    max_length=512,
)

print(f"Input Length: {len(inputs['input_ids'])}, Label Length: {len(outputs['input_ids'])}")


Input Length: 512, Label Length: 512


In [8]:
def tokenize_function(batch):
    """Tokenize a batch into prompt ids, attention mask and answer ids.

    The prompt is ``"<question> <context>"`` padded/truncated to 512 ids; the
    pre-flattened ``answer`` column is tokenized on its own to 50 ids and
    returned as ``labels``. All three outputs are NumPy arrays.

    NOTE(review): ``labels`` is 50 ids wide while ``input_ids`` is 512 wide,
    and padding positions in ``labels`` are left as real token ids. This
    function is redefined in a later cell of this notebook.
    """
    prompts = [f"{str(q)} {str(c)}" for q, c in zip(batch["question"], batch["context"])]
    answer_texts = list(map(str, batch["answer"]))

    # Options shared by both tokenizer calls; tensors come back as TensorFlow tensors.
    shared = dict(padding="max_length", truncation=True, return_tensors="tf")

    encoded_prompts = tokenizer(prompts, max_length=512, **shared)
    encoded_answers = tokenizer(answer_texts, max_length=50, **shared)

    return {
        "input_ids": encoded_prompts["input_ids"].numpy(),
        "attention_mask": encoded_prompts["attention_mask"].numpy(),
        "labels": encoded_answers["input_ids"].numpy(),
    }

# Tokenize the dataset in batches of 32 examples per tokenize_function call.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=32  # Reduce if you encounter memory issues
)

In [9]:
# Inspect the first tokenized training record (original columns plus the new token columns).
print(tokenized_datasets["train"][0])

{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}, 'answer': 'in the late 1990s', 'input_ids': [2215, 750, 37361, 344, 923, 5033, 2968, 30, 37361, 32682, 402, 27

In [10]:
def tokenize_function(batch, max_length=512):
    """Build causal-LM training rows of the form ``"<question> <context> <answer><eos>"``.

    Labels are aligned position-by-position with ``input_ids`` (same length),
    which is what a language-modeling head needs; tokenizing the answer as a
    separate 50-id sequence produced labels that did not line up with the
    512-id inputs. Prompt and padding positions are set to -100 (ignored
    during loss computation), so only the answer and its EOS are supervised.

    Args:
        batch: Batched mapping with ``"question"``, ``"context"`` and
            ``"answers"`` columns (``answers`` items carry a ``"text"`` list).
        max_length: Fixed row length; shorter rows are right-padded.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each
        a list of ``max_length``-long lists of ints.
    """
    questions = [str(q) for q in batch["question"]]
    contexts = [str(c) for c in batch["context"]]

    # Handle missing answers safely
    answers = [a["text"][0] if "text" in a and a["text"] else "No answer" for a in batch["answers"]]

    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id

    input_ids, attention_mask, labels = [], [], []
    for question, context, answer in zip(questions, contexts, answers):
        question_ids = tokenizer.encode(question, add_special_tokens=False)
        # Leading spaces keep the pieces equal to tokenizing "<q> <c> <a>" as one string.
        context_ids = tokenizer.encode(f" {context}", add_special_tokens=False)
        # EOS is appended as an id (not as text) so the model learns where the answer ends.
        answer_ids = tokenizer.encode(f" {answer}", add_special_tokens=False) + [eos_id]

        # Trim only the context, so a long passage can never push the answer out of the row.
        room = max_length - len(question_ids) - len(answer_ids)
        prompt_ids = question_ids + context_ids[: max(room, 0)]

        row = (prompt_ids + answer_ids)[:max_length]
        n_prompt = min(len(prompt_ids), max_length)
        n_pad = max_length - len(row)

        input_ids.append(row + [pad_id] * n_pad)
        # Padding is tracked by position: pad and EOS share an id, so it cannot be found by value.
        attention_mask.append([1] * len(row) + [0] * n_pad)
        labels.append([-100] * n_prompt + row[n_prompt:] + [-100] * n_pad)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Re-run tokenization with the redefined tokenize_function (default batch size).
# This replaces the `tokenized_datasets` produced by the earlier cell.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True
)


In [10]:
# Drop the raw columns that are not model inputs. The "answer" column added by
# preprocess_answers is not in this list, so it stays in the dataset.
tokenized_datasets = tokenized_datasets.remove_columns(["id", "title", "context", "question", "answers"])


In [26]:


# Variant 1: build the batched tf.data pipeline with label_cols given as a plain string.
# Both this cell and the following one assign `train_dataset`; whichever runs last wins.
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask"],  # Input features
    label_cols="labels",  # Label column name passed as a single string
    shuffle=True,
    batch_size=8
)


OR

In [11]:
# Variant 2: same pipeline, but label_cols is given as a one-element list.
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask"],  # feature columns only
    label_cols=["labels"],                   # label column(s), kept separate from the features
    shuffle=True,
    batch_size=8
)


Old behaviour: columns=['a'], label_cols=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', label_cols='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'], label_cols=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', label_cols='labels' -> (tf.Tensor, tf.Tensor)


In [31]:
# Disabled alternative: the whole cell body is a bare string literal, so none of it executes.
"""from transformers import default_data_collator

train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],  # Include labels in columns
    shuffle=True,
    batch_size=8,
    collate_fn=default_data_collator,  # Good practice for GPT-2
    drop_remainder=True
)
"""

# Fine-Tuning

In [12]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer, create_optimizer

# Load the pre-trained "gpt2" checkpoint as a TensorFlow model with a language-modeling head.
model = TFGPT2LMHeadModel.from_pretrained("gpt2")


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [13]:
# Training parameters
EPOCHS = 1  # Adjust as needed
LEARNING_RATE = 3e-5
# Informational: batching is done by to_tf_dataset(batch_size=8) above, so this
# must mirror that value rather than set it.
BATCH_SIZE = 8
# train_dataset is already batched, so its length IS the number of optimizer
# steps per epoch. Dividing by the batch size again made the learning-rate
# schedule run out after a small fraction of the first epoch.
STEPS_PER_EPOCH = len(train_dataset)

# Optimizer with weight decay and a learning-rate schedule sized to the full run
optimizer, lr_schedule = create_optimizer(
    init_lr=LEARNING_RATE,
    num_train_steps=STEPS_PER_EPOCH * EPOCHS,
    weight_decay_rate=0.01,
    num_warmup_steps=0
)

# No loss is passed: the model's own language-modeling loss is used.
model.compile(optimizer=optimizer)


In [None]:
# Train the model.
# batch_size is deliberately not passed: train_dataset is an already-batched
# tf.data.Dataset, and Keras raises a ValueError when batch_size is given
# together with a dataset input.
model.fit(
    train_dataset,
    epochs=EPOCHS
)


#Test Don't edit
#Expected
Generated Answer: in the late 1990s


In [None]:
def generate_answer(question, context, max_new_tokens=50):
    """Generate an answer for ``question`` given ``context`` and return only the answer text.

    Args:
        question: Question string.
        context: Passage the answer should come from.
        max_new_tokens: Upper bound on generated tokens. This is counted on
            top of the prompt, so long contexts still leave room to answer
            (a total ``max_length`` budget can be used up by the prompt alone).

    Returns:
        The decoded continuation, without the prompt and without surrounding
        whitespace.
    """
    input_text = f"{question} {context}"
    encoded = tokenizer(input_text, return_tensors="tf")
    prompt_length = encoded["input_ids"].shape[1]

    output = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        # GPT-2 has no pad token of its own; EOS is used, matching the tokenizer setup.
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; slice the prompt off so the
    # caller gets just the answer.
    answer_ids = output[0][prompt_length:]
    answer = tokenizer.decode(answer_ids, skip_special_tokens=True).strip()

    return answer

# Test Example
question = "When did Beyonce start becoming popular?"
context = "Beyoncé rose to fame in the late 1990s as the lead singer of Destiny's Child."
print("Generated Answer:", generate_answer(question, context))


#New Code


In [None]:
from datasets import load_dataset

dataset = load_dataset("squad_v2")

# Work on half of the training split to shorten each epoch.
# NOTE: this rebinds `train_dataset` to the raw Hugging Face split.
train_dataset = dataset["train"]
half_size = len(train_dataset) // 2

# Random half, seeded so the subset is reproducible. For the first half in
# original order instead, use: train_dataset.select(range(half_size))
train_subset = train_dataset.shuffle(seed=42).select(range(half_size))

# Apply the existing tokenization function and drop the raw text columns,
# leaving only what tokenize_function returns.
tokenized_subset = train_subset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_subset.column_names,
)

# Wrap the tokenized subset as a batched tf.data pipeline; this is the object
# fit() consumes (it was referenced below without ever being created).
train_tf_dataset = tokenized_subset.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["labels"],
    shuffle=True,
    batch_size=8,
)

# Continue as usual. As in the other compile calls in this notebook, no loss
# is passed, so the model's own language-modeling loss is used.
model.compile(optimizer=optimizer)
model.fit(train_tf_dataset, epochs=EPOCHS)


In [2]:
import tensorflow as tf
from transformers import GPT2Tokenizer, TFGPT2LMHeadModel
from datasets import load_dataset
import numpy as np

# Load the full SQuAD v2 dataset (preprocessing happens in later cells).
dataset = load_dataset("squad_v2")

# Disabled alternative: load only the first 30% of the train split.
#dataset = load_dataset("squad_v2", split="train[:30%]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Disabled experiment: the cell body is a bare string literal, so none of it executes.
# NOTE(review): the text inside speaks of "half" but divides the length by 4.
"""# Select half of the training data (e.g., first half)
train_dataset = dataset
half_size = len(train_dataset) // 4

# Option 1: Just take the first half
train_subset = train_dataset.select(range(half_size))

"""

In [3]:
def preprocess_answers(example):
    """Store the first answer text under ``example["answer"]`` (``""`` if there is none).

    The example dict is updated in place and returned.
    """
    if not example["answers"]["text"]:
        example["answer"] = ""
        return example

    example["answer"] = example["answers"]["text"][0]
    return example

# Add the flattened "answer" column by running preprocess_answers over the dataset.
dataset = dataset.map(preprocess_answers)
# Disabled: same preprocessing for the train_subset experiment.
#train_subset = train_subset.map(preprocess_answers)

In [4]:
# Initialize the GPT-2 tokenizer and reuse its end-of-sequence token as the padding
# token, so pad_token_id and eos_token_id are the same id from here on.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [5]:


def tokenize_function(batch, max_length=512):
    """Tokenize ``"Question: … Context: … Answer: …"`` rows with answer-only labels.

    Each row is assembled from separately tokenized pieces so the answer span
    is known exactly:

    * The prompt ends at ``"Answer:"`` with no trailing space. GPT-2's BPE
      attaches a space to the word that follows it, so measuring a prefix that
      ends in a space gives one token more than the prefix occupies inside the
      full text, which masked the first answer token.
    * Only the context is trimmed to fit ``max_length``; truncating the whole
      text on the right dropped the answer for long passages.
    * An EOS id is appended to every answer, so the model has a stop target
      and unanswerable questions (empty answer) still have one supervised
      position.
    * Padding is masked by position, not by comparing ids with
      ``pad_token_id``, because pad and EOS share the same id.

    Args:
        batch: Batched mapping with ``"question"``, ``"context"`` and
            ``"answers"`` columns (``answers`` items carry a ``"text"`` list).
        max_length: Fixed row length; shorter rows are right-padded.

    Returns:
        Dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each
        a list of ``max_length``-long lists of ints. Label positions outside
        the answer (prompt and padding) are -100.
    """
    questions = [str(q) for q in batch["question"]]
    contexts = [str(c) for c in batch["context"]]
    answers = [a["text"][0] if a["text"] else "" for a in batch["answers"]]

    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id
    # Loop-invariant: the marker that separates the context from the answer.
    marker_ids = tokenizer.encode(" Answer:", add_special_tokens=False)

    input_ids, attention_mask, labels = [], [], []
    for question, context, answer in zip(questions, contexts, answers):
        head_ids = tokenizer.encode(f"Question: {question} Context:", add_special_tokens=False)
        context_ids = tokenizer.encode(f" {context}", add_special_tokens=False)
        answer_ids = tokenizer.encode(f" {answer}", add_special_tokens=False) if answer else []
        answer_ids = answer_ids + [eos_id]

        # Give the context whatever room is left after question, marker and answer.
        room = max_length - len(head_ids) - len(marker_ids) - len(answer_ids)
        prompt_ids = head_ids + context_ids[: max(room, 0)] + marker_ids

        # Final hard cut only matters if the question alone overflows the row.
        row = (prompt_ids + answer_ids)[:max_length]
        n_prompt = min(len(prompt_ids), max_length)
        n_pad = max_length - len(row)

        input_ids.append(row + [pad_id] * n_pad)
        attention_mask.append([1] * len(row) + [0] * n_pad)
        # Mask everything before the answer and all padding.
        labels.append([-100] * n_prompt + row[n_prompt:] + [-100] * n_pad)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Tokenize the dataset in batches with tokenize_function.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Disabled: tokenization of the train_subset experiment.
#tokenized_datasets = train_subset.map(tokenize_function, batched=True)
# Drop the raw columns; "answer" is not listed here, so it remains in the dataset.
tokenized_datasets = tokenized_datasets.remove_columns(["id", "title", "context", "question", "answers"])


In [6]:
# Display the remaining columns and row counts of each split.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11873
    })
})

In [7]:
from datasets import DatasetDict

# Keep a seeded random 1% of each split so a training run finishes quickly.
subset = DatasetDict({
    split: tokenized_datasets[split]
    .shuffle(seed=42)
    .select(range(int(0.01 * len(tokenized_datasets[split]))))
    for split in ("train", "validation")
})

In [8]:
# Confirm the subset sizes.
print(subset)

DatasetDict({
    train: Dataset({
        features: ['answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1303
    })
    validation: Dataset({
        features: ['answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 118
    })
})


In [9]:
# From here on `tokenized_datasets` refers to the 1% subset; the full tokenized
# data is no longer reachable under this name without re-running the tokenization cell.
tokenized_datasets = subset

In [10]:
# Verify that `tokenized_datasets` now shows the subset's row counts.
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1303
    })
    validation: Dataset({
        features: ['answer', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 118
    })
})


In [11]:
# Prepare the TF dataset: labels are passed as one of the columns (no label_cols),
# shuffled, in batches of 8.
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    batch_size=8
)

# Previous model-loading call with default settings, kept for reference:
#model = TFGPT2LMHeadModel.from_pretrained("gpt2")


# Load GPT-2 with optional outputs switched off.
model = TFGPT2LMHeadModel.from_pretrained("gpt2",
    use_cache=False,  # intended to reduce memory use (the size of the saving is unverified here)
    output_hidden_states=False,
    output_attentions=False
)

# Training setup: plain Adam at a fixed learning rate; no loss is passed to compile().
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer)


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [None]:
# Train for 3 epochs on the batched subset; no validation data is passed.
model.fit(train_dataset, epochs=3)

Epoch 1/3
