<a href="https://colab.research.google.com/github/tripathishiva0123/GPT--2-installation/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments

# Step 1: Prepare Your Dataset
# Embed the JSON dataset directly in Python code
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the hypotenuse is equal to the sum of the squares of the other two sides."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, reflection, and dispersion of light in water droplets, resulting in a spectrum of light appearing in the sky."
    }
]

# Step 2: Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-1.3B')

# Add a padding token if it's not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 3: Preprocess the dataset
def preprocess_data(dataset):
    inputs = []
    labels = []
    for entry in dataset:
        prompt = entry['prompt']
        response = entry['response']
        input_ids = tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=64).input_ids
        label_ids = tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=64).input_ids
        inputs.append(input_ids.squeeze())
        labels.append(label_ids.squeeze())
    return inputs, labels

train_inputs, train_labels = preprocess_data(dataset)

# Create a custom dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'labels': self.labels[idx]}

train_dataset = CustomDataset(train_inputs, train_labels)

# Step 4: Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-1.3B')

# Step 5: Fine-tune the model
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    fp16=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Step 6: Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Step 7: Generate text using the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model', tokenizer='./fine-tuned-model')
output = text_generator("What causes rainbows?", max_length=50)
print(output)


In [2]:
import json
from transformers import GPTNeoForCausalLM, GPTNeoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Embedded JSON dataset
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically sunlight, in water droplets resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    }
]

# Load the tokenizer
tokenizer = GPTNeoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M')

# Preprocess the dataset
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = item['prompt']
        response = item['response']
        inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=256)
        labels = self.tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=256).input_ids
        labels[labels == self.tokenizer.pad_token_id] = -100  # Mask out padding tokens for loss calculation
        inputs['labels'] = labels
        return {k: v.squeeze(0) for k, v in inputs.items()}

# Create the dataset
train_dataset = CustomDataset(dataset, tokenizer)

# Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

# Set up training arguments with optimizations
training_args = TrainingArguments(
    per_device_train_batch_size=1,          # Smaller batch size to reduce memory usage
    gradient_accumulation_steps=4,          # Accumulate gradients to simulate a larger batch size
    num_train_epochs=3,
    fp16=True,                              # Enable mixed precision training
    logging_dir='./logs',
    output_dir='./fine-tuned-model',
    save_total_limit=2,
    save_steps=500
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Generate text with the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model')
output = text_generator("How does photosynthesis work?")
print(output)


ImportError: cannot import name 'GPTNeoTokenizer' from 'transformers' (/usr/local/lib/python3.10/dist-packages/transformers/__init__.py)

In [3]:
import json
from transformers import GPTNeoForCausalLM, GPTNeoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Embedded JSON dataset
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically sunlight, in water droplets resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    }
]

# Load the tokenizer
tokenizer = GPTNeoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M')

# Preprocess the dataset
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = item['prompt']
        response = item['response']
        inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=256)
        labels = self.tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=256).input_ids
        labels[labels == self.tokenizer.pad_token_id] = -100  # Mask out padding tokens for loss calculation
        inputs['labels'] = labels
        return {k: v.squeeze(0) for k, v in inputs.items()}

# Create the dataset
train_dataset = CustomDataset(dataset, tokenizer)

# Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

# Set up training arguments with optimizations
training_args = TrainingArguments(
    per_device_train_batch_size=1,          # Smaller batch size to reduce memory usage
    gradient_accumulation_steps=4,          # Accumulate gradients to simulate a larger batch size
    num_train_epochs=3,
    fp16=True,                              # Enable mixed precision training
    logging_dir='./logs',
    output_dir='./fine-tuned-model',
    save_total_limit=2,
    save_steps=500
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Generate text with the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model')
output = text_generator("How does photosynthesis work?")
print(output)


ImportError: cannot import name 'GPTNeoTokenizer' from 'transformers' (/usr/local/lib/python3.10/dist-packages/transformers/__init__.py)

In [None]:
import json
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments

# Step 1: Prepare Your Dataset
# Embed the JSON dataset directly in Python code
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the hypotenuse is equal to the sum of the squares of the other two sides."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, reflection, and dispersion of light in water droplets, resulting in a spectrum of light appearing in the sky."
    }
]

# Step 2: Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-1.3B')

# Add a padding token if it's not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 3: Preprocess the dataset
def preprocess_data(dataset):
    inputs = []
    labels = []
    for entry in dataset:
        prompt = entry['prompt']
        response = entry['response']
        input_ids = tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=64).input_ids
        label_ids = tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=64).input_ids
        inputs.append(input_ids.squeeze())
        labels.append(label_ids.squeeze())
    return inputs, labels

train_inputs, train_labels = preprocess_data(dataset)

# Create a custom dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'labels': self.labels[idx]}

train_dataset = CustomDataset(train_inputs, train_labels)

# Step 4: Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-1.3B')

# Step 5: Fine-tune the model
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    fp16=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Step 6: Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Step 7: Generate text using the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model', tokenizer='./fine-tuned-model')
output = text_generator("What causes rainbows?", max_length=50)
print(output)


In [1]:
import json
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Embedded JSON dataset
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically sunlight, in water droplets resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    }
]

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
tokenizer.pad_token = tokenizer.eos_token  # Set pad token as EOS token to avoid errors

# Preprocess the dataset
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = item['prompt']
        response = item['response']
        inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        labels = self.tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=128).input_ids
        labels[labels == tokenizer.pad_token_id] = -100  # Mask out padding tokens for loss calculation
        inputs['labels'] = labels
        return {k: v.squeeze(0) for k, v in inputs.items()}

# Create the dataset
train_dataset = CustomDataset(dataset, tokenizer)

# Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

# Set up training arguments with optimizations
training_args = TrainingArguments(
    per_device_train_batch_size=1,          # Smaller batch size to reduce memory usage
    gradient_accumulation_steps=2,          # Accumulate gradients to simulate a larger batch size
    num_train_epochs=1,                     # Fewer epochs to reduce runtime
    logging_dir='./logs',
    output_dir='./fine-tuned-model',
    save_total_limit=1,                     # Limit the number of saved checkpoints
    save_steps=200                          # Save model every 200 steps
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Generate text with the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model')
output = text_generator("How does photosynthesis work?", max_length=100)
print(output)


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Step,Training Loss


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'How does photosynthesis work?\n\nPhotolysis is a process that occurs when photosynthesis occurs in a photosynthetic organism. Photosynthesis is the process of converting light into energy. The photosynthetic process is the process of converting light into energy.\n\nPhotolysis is a process that occurs when photosynthesis occurs in a photosynthetic organism. Photosynthesis is the process of converting light into energy.\n\nPhotolysis is a process that occurs when photosynthesis occurs in a'}]


In [2]:
import json
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Embedded JSON dataset
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically sunlight, in water droplets resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    }
]

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
tokenizer.pad_token = tokenizer.eos_token  # Set pad token as EOS token to avoid errors

# Preprocess the dataset
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = item['prompt']
        response = item['response']
        inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        labels = self.tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=128).input_ids
        labels[labels == tokenizer.pad_token_id] = -100  # Mask out padding tokens for loss calculation
        inputs['labels'] = labels
        return {k: v.squeeze(0) for k, v in inputs.items()}

# Create the dataset
train_dataset = CustomDataset(dataset, tokenizer)

# Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

# Set up training arguments with optimizations
training_args = TrainingArguments(
    per_device_train_batch_size=1,          # Smaller batch size to reduce memory usage
    gradient_accumulation_steps=2,          # Accumulate gradients to simulate a larger batch size
    num_train_epochs=1,                     # Fewer epochs to reduce runtime
    logging_dir='./logs',
    output_dir='./fine-tuned-model',
    save_total_limit=1,                     # Limit the number of saved checkpoints
    save_steps=200                          # Save model every 200 steps
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Generate text with the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model')
output = text_generator("How does photosynthesis work?", max_length=100)
print(output)


Step,Training Loss


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'How does photosynthesis work?\n\nPhotolysis is a process that occurs when photosynthesis occurs in a photosynthetic organism. Photosynthesis is the process of converting light into energy. The photosynthetic process is the process of converting light into energy.\n\nPhotolysis is a process that occurs when photosynthesis occurs in a photosynthetic organism. Photosynthesis is the process of converting light into energy.\n\nPhotolysis is a process that occurs when photosynthesis occurs in a'}]


In [3]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Embedded JSON dataset
dataset_json = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    },
    {
        "prompt": "How do vaccines work?",
        "response": "Vaccines work by stimulating the immune system to recognize and fight specific pathogens, such as viruses or bacteria."
    }
]

# Convert the JSON dataset into a custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Prepare the dataset
train_dataset = CustomDataset(dataset_json, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_dir='./logs',
    logging_steps=10,
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Load the fine-tuned model and tokenizer for text generation
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

# Interactive loop for user input
print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    # Tokenize user input
    inputs = tokenizer(user_input, return_tensors='pt')

    # Generate response
    outputs = model.generate(inputs['input_ids'], max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode and print the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [4]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Larger JSON dataset embedded within the code
dataset_json = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    },
    {
        "prompt": "How do vaccines work?",
        "response": "Vaccines work by stimulating the immune system to recognize and fight specific pathogens, such as viruses or bacteria."
    },
    # Additional data entries
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically from the sun, in water droplets, resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the capital of France?",
        "response": "The capital of France is Paris."
    },
    {
        "prompt": "How does the internet work?",
        "response": "The internet is a global network of computers that communicate with each other using the Internet Protocol (IP) and other protocols."
    },
    {
        "prompt": "Explain the process of evaporation.",
        "response": "Evaporation is the process by which water changes from a liquid to a gas or vapor."
    },
    {
        "prompt": "What is gravity?",
        "response": "Gravity is a force by which a planet or other body draws objects toward its center. The force of gravity keeps all of the planets in orbit around the sun."
    },
    {
        "prompt": "Who wrote 'To Kill a Mockingbird'?",
        "response": "'To Kill a Mockingbird' was written by Harper Lee."
    },
    # You can continue adding more entries here...
]

# Convert the JSON dataset into a custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add a padding token if it does not exist
tokenizer.pad_token = tokenizer.eos_token

# Load the model
model = GPT2LMHeadModel.from_pretrained(model_name)

# Prepare the dataset
train_dataset = CustomDataset(dataset_json, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=1,  # Increase the number of epochs for larger datasets
    logging_dir='./logs',
    logging_steps=10,
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Load the fine-tuned model and tokenizer for text generation
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

# Interactive loop for user input
print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    # Tokenize user input
    inputs = tokenizer(user_input, return_tensors='pt')

    # Generate response
    outputs = model.generate(inputs['input_ids'], max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode and print the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


Step,Training Loss


Enter your prompt (type 'exit' to quit):
You: What causes rainbows ?


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


AI:  What causes rainbows?

Rainbows are the most common type of rainbows. They are usually found in the tropics and in the tropics of the Pacific Ocean. They are found in the tropics of the Pacific Ocean, the Caribbean, and the Indian Ocean.

Rainbows are formed when the air is heated by the sun. The air is heated by the sun, and the air is cooled by the sun. The air is cooled by the sun, and the air is
You: Explain theory of relativity ?
AI:  Explain theory of relativity?

The theory of relativity is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of
You: What is gravity ?
AI:  What is gravity?

Gravity is t

In [5]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Larger JSON dataset embedded within the code
dataset_json = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    },
    {
        "prompt": "How do vaccines work?",
        "response": "Vaccines work by stimulating the immune system to recognize and fight specific pathogens, such as viruses or bacteria."
    },
    # Additional data entries
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically from the sun, in water droplets, resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the capital of France?",
        "response": "The capital of France is Paris."
    },
    {
        "prompt": "How does the internet work?",
        "response": "The internet is a global network of computers that communicate with each other using the Internet Protocol (IP) and other protocols."
    },
    {
        "prompt": "Explain the process of evaporation.",
        "response": "Evaporation is the process by which water changes from a liquid to a gas or vapor."
    },
    {
        "prompt": "What is gravity?",
        "response": "Gravity is a force by which a planet or other body draws objects toward its center. The force of gravity keeps all of the planets in orbit around the sun."
    },
    {
        "prompt": "Who wrote 'To Kill a Mockingbird'?",
        "response": "'To Kill a Mockingbird' was written by Harper Lee."
    },
    # You can continue adding more entries here...
]

# Convert the JSON dataset into a custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add a padding token if it does not exist
tokenizer.pad_token = tokenizer.eos_token

# Load the model
model = GPT2LMHeadModel.from_pretrained(model_name)

# Prepare the dataset
train_dataset = CustomDataset(dataset_json, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=1,  # Increase the number of epochs for larger datasets
    logging_dir='./logs',
    logging_steps=10,
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Load the fine-tuned model and tokenizer for text generation
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

# Interactive loop for user input
print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    # Tokenize user input
    inputs = tokenizer(user_input, return_tensors='pt')

    # Generate response
    outputs = model.generate(inputs['input_ids'], max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode and print the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


Step,Training Loss


Enter your prompt (type 'exit' to quit):
You: What cause rainbows ?
AI:  What cause rainbows?

Rainbows are the most common type of rainbows. They are usually found in the tropics and in the tropics of the Pacific Ocean. They are found in the tropics of the Pacific Ocean, the Caribbean, and the Indian Ocean.

Rainbows are formed when the air is heated by the sun. The air is heated by the sun, and the air is cooled by the sun. The air is cooled by the sun, and the air is
You: How do vaccines work ?
AI:  How do vaccines work?

The answer is that vaccines work by preventing the development of immunity. The immune system is the body's natural defense against infectious diseases. The immune system is the body's natural defense against infectious diseases.

The immune system is the body's natural defense against infectious diseases.

The immune system is the body's natural defense against infectious diseases.

The immune system is the body's natural defense against infectious diseases.

The 

In [None]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Larger JSON dataset embedded within the code
dataset_json = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    },
    {
        "prompt": "How do vaccines work?",
        "response": "Vaccines work by stimulating the immune system to recognize and fight specific pathogens, such as viruses or bacteria."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically from the sun, in water droplets, resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the capital of France?",
        "response": "The capital of France is Paris."
    },
    {
        "prompt": "How does the internet work?",
        "response": "The internet is a global network of computers that communicate with each other using the Internet Protocol (IP) and other protocols."
    },
    {
        "prompt": "Explain the process of evaporation.",
        "response": "Evaporation is the process by which water changes from a liquid to a gas or vapor."
    },
    {
        "prompt": "What is gravity?",
        "response": "Gravity is a force by which a planet or other body draws objects toward its center. The force of gravity keeps all of the planets in orbit around the sun."
    },
    {
        "prompt": "Who wrote 'To Kill a Mockingbird'?",
        "response": "'To Kill a Mockingbird' was written by Harper Lee."
    },
    # Additional data entries can be added here...
]

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

train_dataset = CustomDataset(dataset_json, tokenizer)

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,  # Increased number of epochs
    logging_dir='./logs',
    logging_steps=10,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

trainer.train()

model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    inputs = tokenizer(user_input, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=100,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)
