<a href="https://colab.research.google.com/github/tripathishiva0123/GPT--2-installation/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import json
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments

# Step 1: Prepare Your Dataset
# Embed the JSON dataset directly in Python code
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the hypotenuse is equal to the sum of the squares of the other two sides."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, reflection, and dispersion of light in water droplets, resulting in a spectrum of light appearing in the sky."
    }
]

# Step 2: Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-1.3B')

# Add a padding token if it's not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 3: Preprocess the dataset
def preprocess_data(dataset):
    inputs = []
    labels = []
    for entry in dataset:
        prompt = entry['prompt']
        response = entry['response']
        input_ids = tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=64).input_ids
        label_ids = tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=64).input_ids
        inputs.append(input_ids.squeeze())
        labels.append(label_ids.squeeze())
    return inputs, labels

train_inputs, train_labels = preprocess_data(dataset)

# Create a custom dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'labels': self.labels[idx]}

train_dataset = CustomDataset(train_inputs, train_labels)

# Step 4: Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-1.3B')

# Step 5: Fine-tune the model
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    fp16=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Step 6: Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Step 7: Generate text using the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model', tokenizer='./fine-tuned-model')
output = text_generator("What causes rainbows?", max_length=50)
print(output)


In [2]:
import json
from transformers import GPTNeoForCausalLM, GPTNeoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Embedded JSON dataset
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically sunlight, in water droplets resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    }
]

# Load the tokenizer
tokenizer = GPTNeoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M')

# Preprocess the dataset
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = item['prompt']
        response = item['response']
        inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=256)
        labels = self.tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=256).input_ids
        labels[labels == self.tokenizer.pad_token_id] = -100  # Mask out padding tokens for loss calculation
        inputs['labels'] = labels
        return {k: v.squeeze(0) for k, v in inputs.items()}

# Create the dataset
train_dataset = CustomDataset(dataset, tokenizer)

# Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

# Set up training arguments with optimizations
training_args = TrainingArguments(
    per_device_train_batch_size=1,          # Smaller batch size to reduce memory usage
    gradient_accumulation_steps=4,          # Accumulate gradients to simulate a larger batch size
    num_train_epochs=3,
    fp16=True,                              # Enable mixed precision training
    logging_dir='./logs',
    output_dir='./fine-tuned-model',
    save_total_limit=2,
    save_steps=500
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Generate text with the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model')
output = text_generator("How does photosynthesis work?")
print(output)


ImportError: cannot import name 'GPTNeoTokenizer' from 'transformers' (/usr/local/lib/python3.10/dist-packages/transformers/__init__.py)

In [3]:
import json
from transformers import GPTNeoForCausalLM, GPTNeoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Embedded JSON dataset
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically sunlight, in water droplets resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    }
]

# Load the tokenizer
tokenizer = GPTNeoTokenizer.from_pretrained('EleutherAI/gpt-neo-125M')

# Preprocess the dataset
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = item['prompt']
        response = item['response']
        inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=256)
        labels = self.tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=256).input_ids
        labels[labels == self.tokenizer.pad_token_id] = -100  # Mask out padding tokens for loss calculation
        inputs['labels'] = labels
        return {k: v.squeeze(0) for k, v in inputs.items()}

# Create the dataset
train_dataset = CustomDataset(dataset, tokenizer)

# Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

# Set up training arguments with optimizations
training_args = TrainingArguments(
    per_device_train_batch_size=1,          # Smaller batch size to reduce memory usage
    gradient_accumulation_steps=4,          # Accumulate gradients to simulate a larger batch size
    num_train_epochs=3,
    fp16=True,                              # Enable mixed precision training
    logging_dir='./logs',
    output_dir='./fine-tuned-model',
    save_total_limit=2,
    save_steps=500
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Generate text with the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model')
output = text_generator("How does photosynthesis work?")
print(output)


ImportError: cannot import name 'GPTNeoTokenizer' from 'transformers' (/usr/local/lib/python3.10/dist-packages/transformers/__init__.py)

In [None]:
import json
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments

# Step 1: Prepare Your Dataset
# Embed the JSON dataset directly in Python code
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the hypotenuse is equal to the sum of the squares of the other two sides."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, reflection, and dispersion of light in water droplets, resulting in a spectrum of light appearing in the sky."
    }
]

# Step 2: Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-1.3B')

# Add a padding token if it's not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 3: Preprocess the dataset
def preprocess_data(dataset):
    inputs = []
    labels = []
    for entry in dataset:
        prompt = entry['prompt']
        response = entry['response']
        input_ids = tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=64).input_ids
        label_ids = tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=64).input_ids
        inputs.append(input_ids.squeeze())
        labels.append(label_ids.squeeze())
    return inputs, labels

train_inputs, train_labels = preprocess_data(dataset)

# Create a custom dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {'input_ids': self.inputs[idx], 'labels': self.labels[idx]}

train_dataset = CustomDataset(train_inputs, train_labels)

# Step 4: Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-1.3B')

# Step 5: Fine-tune the model
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    fp16=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Step 6: Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Step 7: Generate text using the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model', tokenizer='./fine-tuned-model')
output = text_generator("What causes rainbows?", max_length=50)
print(output)


In [1]:
import json
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Embedded JSON dataset
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically sunlight, in water droplets resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    }
]

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
tokenizer.pad_token = tokenizer.eos_token  # Set pad token as EOS token to avoid errors

# Preprocess the dataset
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = item['prompt']
        response = item['response']
        inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        labels = self.tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=128).input_ids
        labels[labels == tokenizer.pad_token_id] = -100  # Mask out padding tokens for loss calculation
        inputs['labels'] = labels
        return {k: v.squeeze(0) for k, v in inputs.items()}

# Create the dataset
train_dataset = CustomDataset(dataset, tokenizer)

# Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

# Set up training arguments with optimizations
training_args = TrainingArguments(
    per_device_train_batch_size=1,          # Smaller batch size to reduce memory usage
    gradient_accumulation_steps=2,          # Accumulate gradients to simulate a larger batch size
    num_train_epochs=1,                     # Fewer epochs to reduce runtime
    logging_dir='./logs',
    output_dir='./fine-tuned-model',
    save_total_limit=1,                     # Limit the number of saved checkpoints
    save_steps=200                          # Save model every 200 steps
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Generate text with the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model')
output = text_generator("How does photosynthesis work?", max_length=100)
print(output)


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Step,Training Loss


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'How does photosynthesis work?\n\nPhotolysis is a process that occurs when photosynthesis occurs in a photosynthetic organism. Photosynthesis is the process of converting light into energy. The photosynthetic process is the process of converting light into energy.\n\nPhotolysis is a process that occurs when photosynthesis occurs in a photosynthetic organism. Photosynthesis is the process of converting light into energy.\n\nPhotolysis is a process that occurs when photosynthesis occurs in a'}]


In [2]:
import json
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Embedded JSON dataset
dataset = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically sunlight, in water droplets resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    }
]

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
tokenizer.pad_token = tokenizer.eos_token  # Set pad token as EOS token to avoid errors

# Preprocess the dataset
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = item['prompt']
        response = item['response']
        inputs = self.tokenizer(prompt, return_tensors='pt', truncation=True, padding='max_length', max_length=128)
        labels = self.tokenizer(response, return_tensors='pt', truncation=True, padding='max_length', max_length=128).input_ids
        labels[labels == tokenizer.pad_token_id] = -100  # Mask out padding tokens for loss calculation
        inputs['labels'] = labels
        return {k: v.squeeze(0) for k, v in inputs.items()}

# Create the dataset
train_dataset = CustomDataset(dataset, tokenizer)

# Load the pre-trained model
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

# Set up training arguments with optimizations
training_args = TrainingArguments(
    per_device_train_batch_size=1,          # Smaller batch size to reduce memory usage
    gradient_accumulation_steps=2,          # Accumulate gradients to simulate a larger batch size
    num_train_epochs=1,                     # Fewer epochs to reduce runtime
    logging_dir='./logs',
    output_dir='./fine-tuned-model',
    save_total_limit=1,                     # Limit the number of saved checkpoints
    save_steps=200                          # Save model every 200 steps
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Generate text with the fine-tuned model
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model')
output = text_generator("How does photosynthesis work?", max_length=100)
print(output)


Step,Training Loss


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'How does photosynthesis work?\n\nPhotolysis is a process that occurs when photosynthesis occurs in a photosynthetic organism. Photosynthesis is the process of converting light into energy. The photosynthetic process is the process of converting light into energy.\n\nPhotolysis is a process that occurs when photosynthesis occurs in a photosynthetic organism. Photosynthesis is the process of converting light into energy.\n\nPhotolysis is a process that occurs when photosynthesis occurs in a'}]


In [3]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Embedded JSON dataset
dataset_json = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    },
    {
        "prompt": "How do vaccines work?",
        "response": "Vaccines work by stimulating the immune system to recognize and fight specific pathogens, such as viruses or bacteria."
    }
]

# Convert the JSON dataset into a custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Prepare the dataset
train_dataset = CustomDataset(dataset_json, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_dir='./logs',
    logging_steps=10,
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Load the fine-tuned model and tokenizer for text generation
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

# Interactive loop for user input
print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    # Tokenize user input
    inputs = tokenizer(user_input, return_tensors='pt')

    # Generate response
    outputs = model.generate(inputs['input_ids'], max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode and print the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [4]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Larger JSON dataset embedded within the code
dataset_json = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    },
    {
        "prompt": "How do vaccines work?",
        "response": "Vaccines work by stimulating the immune system to recognize and fight specific pathogens, such as viruses or bacteria."
    },
    # Additional data entries
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically from the sun, in water droplets, resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the capital of France?",
        "response": "The capital of France is Paris."
    },
    {
        "prompt": "How does the internet work?",
        "response": "The internet is a global network of computers that communicate with each other using the Internet Protocol (IP) and other protocols."
    },
    {
        "prompt": "Explain the process of evaporation.",
        "response": "Evaporation is the process by which water changes from a liquid to a gas or vapor."
    },
    {
        "prompt": "What is gravity?",
        "response": "Gravity is a force by which a planet or other body draws objects toward its center. The force of gravity keeps all of the planets in orbit around the sun."
    },
    {
        "prompt": "Who wrote 'To Kill a Mockingbird'?",
        "response": "'To Kill a Mockingbird' was written by Harper Lee."
    },
    # You can continue adding more entries here...
]

# Convert the JSON dataset into a custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add a padding token if it does not exist
tokenizer.pad_token = tokenizer.eos_token

# Load the model
model = GPT2LMHeadModel.from_pretrained(model_name)

# Prepare the dataset
train_dataset = CustomDataset(dataset_json, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=1,  # Increase the number of epochs for larger datasets
    logging_dir='./logs',
    logging_steps=10,
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Load the fine-tuned model and tokenizer for text generation
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

# Interactive loop for user input
print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    # Tokenize user input
    inputs = tokenizer(user_input, return_tensors='pt')

    # Generate response
    outputs = model.generate(inputs['input_ids'], max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode and print the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


Step,Training Loss


Enter your prompt (type 'exit' to quit):
You: What causes rainbows ?


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


AI:  What causes rainbows?

Rainbows are the most common type of rainbows. They are usually found in the tropics and in the tropics of the Pacific Ocean. They are found in the tropics of the Pacific Ocean, the Caribbean, and the Indian Ocean.

Rainbows are formed when the air is heated by the sun. The air is heated by the sun, and the air is cooled by the sun. The air is cooled by the sun, and the air is
You: Explain theory of relativity ?
AI:  Explain theory of relativity?

The theory of relativity is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of nature. It is a theory of relativity that describes the laws of
You: What is gravity ?
AI:  What is gravity?

Gravity is t

In [5]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Larger JSON dataset embedded within the code
dataset_json = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    },
    {
        "prompt": "How do vaccines work?",
        "response": "Vaccines work by stimulating the immune system to recognize and fight specific pathogens, such as viruses or bacteria."
    },
    # Additional data entries
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically from the sun, in water droplets, resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the capital of France?",
        "response": "The capital of France is Paris."
    },
    {
        "prompt": "How does the internet work?",
        "response": "The internet is a global network of computers that communicate with each other using the Internet Protocol (IP) and other protocols."
    },
    {
        "prompt": "Explain the process of evaporation.",
        "response": "Evaporation is the process by which water changes from a liquid to a gas or vapor."
    },
    {
        "prompt": "What is gravity?",
        "response": "Gravity is a force by which a planet or other body draws objects toward its center. The force of gravity keeps all of the planets in orbit around the sun."
    },
    {
        "prompt": "Who wrote 'To Kill a Mockingbird'?",
        "response": "'To Kill a Mockingbird' was written by Harper Lee."
    },
    # You can continue adding more entries here...
]

# Convert the JSON dataset into a custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Load tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add a padding token if it does not exist
tokenizer.pad_token = tokenizer.eos_token

# Load the model
model = GPT2LMHeadModel.from_pretrained(model_name)

# Prepare the dataset
train_dataset = CustomDataset(dataset_json, tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=1,  # Increase the number of epochs for larger datasets
    logging_dir='./logs',
    logging_steps=10,
)

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Load the fine-tuned model and tokenizer for text generation
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

# Interactive loop for user input
print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    # Tokenize user input
    inputs = tokenizer(user_input, return_tensors='pt')

    # Generate response
    outputs = model.generate(inputs['input_ids'], max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode and print the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


Step,Training Loss


Enter your prompt (type 'exit' to quit):
You: What cause rainbows ?
AI:  What cause rainbows?

Rainbows are the most common type of rainbows. They are usually found in the tropics and in the tropics of the Pacific Ocean. They are found in the tropics of the Pacific Ocean, the Caribbean, and the Indian Ocean.

Rainbows are formed when the air is heated by the sun. The air is heated by the sun, and the air is cooled by the sun. The air is cooled by the sun, and the air is
You: How do vaccines work ?
AI:  How do vaccines work?

The answer is that vaccines work by preventing the development of immunity. The immune system is the body's natural defense against infectious diseases. The immune system is the body's natural defense against infectious diseases.

The immune system is the body's natural defense against infectious diseases.

The immune system is the body's natural defense against infectious diseases.

The immune system is the body's natural defense against infectious diseases.

The 

In [6]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Larger JSON dataset embedded within the code
dataset_json = [
    {
        "prompt": "How does photosynthesis work?",
        "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose."
    },
    {
        "prompt": "Explain the theory of relativity.",
        "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity."
    },
    {
        "prompt": "What is the Pythagorean theorem?",
        "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the length of the hypotenuse is equal to the sum of the squares of the lengths of the other two sides."
    },
    {
        "prompt": "How do vaccines work?",
        "response": "Vaccines work by stimulating the immune system to recognize and fight specific pathogens, such as viruses or bacteria."
    },
    {
        "prompt": "What causes rainbows?",
        "response": "Rainbows are caused by the refraction, dispersion, and reflection of light, typically from the sun, in water droplets, resulting in a spectrum of light appearing in the sky."
    },
    {
        "prompt": "What is the capital of France?",
        "response": "The capital of France is Paris."
    },
    {
        "prompt": "How does the internet work?",
        "response": "The internet is a global network of computers that communicate with each other using the Internet Protocol (IP) and other protocols."
    },
    {
        "prompt": "Explain the process of evaporation.",
        "response": "Evaporation is the process by which water changes from a liquid to a gas or vapor."
    },
    {
        "prompt": "What is gravity?",
        "response": "Gravity is a force by which a planet or other body draws objects toward its center. The force of gravity keeps all of the planets in orbit around the sun."
    },
    {
        "prompt": "Who wrote 'To Kill a Mockingbird'?",
        "response": "'To Kill a Mockingbird' was written by Harper Lee."
    },
    # Additional data entries can be added here...
]

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

train_dataset = CustomDataset(dataset_json, tokenizer)

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,  # Increased number of epochs
    logging_dir='./logs',
    logging_steps=10,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

trainer.train()

model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    inputs = tokenizer(user_input, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=100,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


Step,Training Loss
10,3.2215


Enter your prompt (type 'exit' to quit):
You: How does photosynthesis work ? 


ValueError: Input length of input_ids is 512, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

In [7]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Embedded JSON dataset
dataset_json = [
    # Your dataset entries here...
]

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

train_dataset = CustomDataset(dataset_json, tokenizer)

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,  # Increased number of epochs
    logging_dir='./logs',
    logging_steps=10,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

trainer.train()

model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    inputs = tokenizer(user_input, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=100,  # Changed from max_length to max_new_tokens
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [8]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Embedded JSON dataset
dataset_json = [
    # Add your dataset entries here
    {"prompt": "How does photosynthesis work?", "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."},
    {"prompt": "Explain the theory of relativity.", "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."}
]

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

train_dataset = CustomDataset(dataset_json, tokenizer)

# Debugging output
print(f"Dataset length: {len(train_dataset)}")

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

try:
    trainer.train()
except ValueError as e:
    print(f"Error during training: {e}")

model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Loading the model and tokenizer for inference
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    inputs = tokenizer(user_input, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=100,
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


Dataset length: 2


Step,Training Loss


Enter your prompt (type 'exit' to quit):
You: How does photosythesis work ?
AI:  How does photosythesis work?The photosynthesis process is the process of converting photosynthetic material into sugars. The photosynthetic material is then used to make sugars. The photosynthetic material is then used to make sugars. The photosynthetic material is then used to make sugars. The photosynthetic material is then used to make sugars. The photosynthetic material is then used to make sugars. The photosynthetic material is then used to make sugars. The photosynthetic material is then used to make sugars.
You: exit


In [9]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import torch

# Embedded JSON dataset
dataset_json = [
    {"prompt": "How does photosynthesis work?", "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."},
    {"prompt": "Explain the theory of relativity.", "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."}
]

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

train_dataset = CustomDataset(dataset_json, tokenizer)

# Debugging output
print(f"Dataset length: {len(train_dataset)}")

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=5,  # Increased number of epochs
    logging_dir='./logs',
    logging_steps=10,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

try:
    trainer.train()
except ValueError as e:
    print(f"Error during training: {e}")

model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Loading the model and tokenizer for inference
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    inputs = tokenizer(user_input, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=100,
        temperature=0.7,  # Adjust this value for more diverse responses
        top_p=0.9,        # Use nucleus sampling for better diversity
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


Dataset length: 2


Step,Training Loss


Enter your prompt (type 'exit' to quit):
You: Explain the theory of relativity ?




AI:  Explain the theory of relativity?

The theory of relativity is a theory of relativity that describes the laws of physics. It is a theory of relativity that describes the laws of physics. It is a theory of relativity that describes the laws of physics. It is a theory of relativity that describes the laws of physics. It is a theory of relativity that describes the laws of physics. It is a theory of relativity that describes the laws of physics. It is a theory of relativity that describes the laws of physics. It is a theory
You: exit


In [10]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Define the dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']
        response = self.data[idx]['response']
        input_encodings = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        response_encodings = self.tokenizer(response, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        input_ids = input_encodings['input_ids'].squeeze()
        attention_mask = input_encodings['attention_mask'].squeeze()
        labels = response_encodings['input_ids'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

# Initialize model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

# Create dataset
dataset_json = [
    {"prompt": "How does photosynthesis work?", "response": "Photosynthesis is the process by which green plants, algae, and some bacteria convert light energy into chemical energy stored in glucose. They use sunlight to transform carbon dioxide and water into glucose and oxygen."},
    {"prompt": "Explain the theory of relativity.", "response": "The theory of relativity, developed by Albert Einstein, includes two parts: special relativity and general relativity. Special relativity addresses the relationship between space and time, while general relativity deals with gravity and the curvature of spacetime."}
]
train_dataset = CustomDataset(dataset_json, tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_dir='./logs',
    logging_steps=10,
)

# Create Trainer
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Train model
try:
    trainer.train()
except ValueError as e:
    print(f"Error during training: {e}")

# Save the model
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Load the model and tokenizer for inference
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-model')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-model')

# Interactive prompt
print("Enter your prompt (type 'exit' to quit):")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    inputs = tokenizer(user_input, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=100,
        do_sample=True,     # Enable sampling
        temperature=0.7,   # Adjust for creativity
        top_p=0.9,         # Nucleus sampling for diversity
        pad_token_id=tokenizer.eos_token_id
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("AI: ", response)


Step,Training Loss


Enter your prompt (type 'exit' to quit):
You: How does photosynthesis work  ?
AI:  How does photosynthesis work ?"

"You know, I know that it takes about 50 million years to make a chemical reaction of hydrogen and oxygen. But it is a very simple process. So it doesn't require any special equipment. It is simply a matter of using a microscope to see. It is a very simple process. So it does not require any special equipment. It is simply a matter of using a microscope to see. It is a very simple process. So it does not require any special equipment. It
You: exit


In [11]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=1,  # Use a smaller batch size
    num_train_epochs=3,
    logging_dir='./logs',
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy='epoch'
)




In [12]:
model_name = 'gpt2-small'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)


OSError: gpt2-small is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
import json
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset, DataLoader
import torch

# Define the dataset class
class CustomDataset(Dataset):
    def __init__(self, prompts, responses, tokenizer, max_length=512):
        self.prompts = prompts
        self.responses = responses
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        prompt = self.prompts[idx]
        response = self.responses[idx]
        inputs = self.tokenizer(prompt, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length')
        labels = self.tokenizer(response, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length')
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels['input_ids'].squeeze()
        }

# Load the tokenizer and model
model_name = 'EleutherAI/gpt-neo-1.3B'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)

# Example dataset
data = [
    {"prompt": "How does photosynthesis work?", "response": "Photosynthesis is the process by which green plants convert light energy into chemical energy."},
    {"prompt": "Explain the theory of relativity.", "response": "The theory of relativity includes special and general relativity, dealing with the nature of space-time and gravity."}
]

prompts = [entry['prompt'] for entry in data]
responses = [entry['response'] for entry in data]

# Prepare the dataset
dataset = CustomDataset(prompts, responses, tokenizer)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy='epoch'
)

# Define the data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# For interactive input
from transformers import pipeline

text_generator = pipeline('text-generation', model='./fine-tuned-model', tokenizer='./fine-tuned-model')

while True:
    user_input = input("Enter your prompt (type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break
    response = text_generator(user_input, max_length=150, num_return_sequences=1)
    print("AI:", response[0]['generated_text'])


In [1]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=1,  # Use a smaller batch size
    num_train_epochs=3,
    logging_dir='./logs',
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy='epoch'
)


NameError: name 'TrainingArguments' is not defined

In [2]:
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=1,  # Use a smaller batch size
    num_train_epochs=3,
    logging_dir='./logs',
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy='epoch'
)


NameError: name 'TrainingArguments' is not defined

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPTNeoForCausalLM, GPTNeoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import json

class CustomDataset(Dataset):
    def __init__(self, prompts, responses, tokenizer, max_length=512):
        self.prompts = prompts
        self.responses = responses
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, idx):
        prompt = self.prompts[idx]
        response = self.responses[idx]
        inputs = self.tokenizer(prompt, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length')
        labels = self.tokenizer(response, return_tensors='pt', max_length=self.max_length, truncation=True, padding='max_length')
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels['input_ids'].squeeze()
        }

def train_and_generate(model_name='gpt2', is_gpt2=True):
    # Load the tokenizer and model
    if is_gpt2:
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        model = GPT2LMHeadModel.from_pretrained(model_name)
    else:
        tokenizer = GPTNeoTokenizer.from_pretrained(model_name)
        model = GPTNeoForCausalLM.from_pretrained(model_name)

    # Example dataset (Replace with your actual dataset)
    data = [
        {"prompt": "How does photosynthesis work?", "response": "Photosynthesis is the process by which green plants convert light energy into chemical energy."},
        {"prompt": "Explain the theory of relativity.", "response": "The theory of relativity includes special and general relativity, dealing with the nature of space-time and gravity."}
    ]

    prompts = [entry['prompt'] for entry in data]
    responses = [entry['response'] for entry in data]

    # Prepare the dataset
    dataset = CustomDataset(prompts, responses, tokenizer)

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        per_device_train_batch_size=1,
        num_train_epochs=3,
        logging_dir='./logs',
        save_steps=10_000,
        save_total_limit=2,
        evaluation_strategy='epoch'
    )

    # Define the data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )

    # Train the model
    trainer.train()

    # Save the model and tokenizer
    model.save_pretrained('./fine-tuned-model')
    tokenizer.save_pretrained('./fine-tuned-model')

    # For interactive input
    from transformers import pipeline

    text_generator = pipeline('text-generation', model='./fine-tuned-model', tokenizer='./fine-tuned-model')

    while True:
        user_input = input("Enter your prompt (type 'exit' to quit): ")
        if user_input.lower() == 'exit':
            break
        response = text_generator(user_input, max_length=150, num_return_sequences=1)
        print("AI:", response[0]['generated_text'])

# Call the function for GPT-2
train_and_generate(model_name='gpt2', is_gpt2=True)

# Call the function for GPT-Neo
# train_and_generate(model_name='EleutherAI/gpt-neo-1.3B', is_gpt2=False)


ImportError: cannot import name 'GPTNeoTokenizer' from 'transformers' (/usr/local/lib/python3.10/dist-packages/transformers/__init__.py)

In [5]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# Embedded JSON training data
training_data = [
    {"prompt": "How does photosynthesis work?", "response": "Photosynthesis is the process by which green plants convert light energy into chemical energy stored in glucose."},
    {"prompt": "Explain the theory of relativity.", "response": "The theory of relativity, developed by Albert Einstein, describes the laws of physics in relation to moving bodies and gravity."},
    {"prompt": "What is the Pythagorean theorem?", "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the hypotenuse is equal to the sum of the squares of the other two sides."}
]

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = item['prompt']
        target_text = item['response']
        inputs = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        targets = self.tokenizer(target_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze()
        }

# Load the GPT-2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add a padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

# Prepare dataset and dataloader
dataset = CustomDataset(training_data, tokenizer)
train_dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Function to generate text after fine-tuning
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=50,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
print(generate_text("Explain the theory of relativity."))


Step,Training Loss


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Explain the theory of relativity.


In [6]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# Embedded JSON training data
training_data = [
    {"prompt": "How does photosynthesis work?", "response": "Photosynthesis is the process by which green plants convert light energy into chemical energy stored in glucose."},
    {"prompt": "Explain the theory of relativity.", "response": "The theory of relativity, developed by Albert Einstein, describes the laws of physics in relation to moving bodies and gravity."},
    {"prompt": "What is the Pythagorean theorem?", "response": "The Pythagorean theorem states that in a right-angled triangle, the square of the hypotenuse is equal to the sum of the squares of the other two sides."}
]

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text = item['prompt']
        target_text = item['response']
        inputs = self.tokenizer(input_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        targets = self.tokenizer(target_text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt")
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze()
        }

# Load the GPT-2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add a padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

# Prepare dataset and dataloader
dataset = CustomDataset(training_data, tokenizer)
train_dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    save_total_limit=2,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Function to generate text after fine-tuning
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=50,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
print(generate_text("Explain the theory of relativity."))


KeyboardInterrupt: 

In [7]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Embedded JSON dataset for fine-tuning (smaller dataset for faster training)
training_data = [
    {"prompt": "What causes rainbows?", "response": "Rainbows are caused by the refraction, dispersion, and reflection of light in water droplets."},
    {"prompt": "How do vaccines work?", "response": "Vaccines work by stimulating the immune system to recognize and fight specific pathogens."},
]

# Dataset class to prepare data for training
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]["prompt"]
        response = self.data[idx]["response"]
        encoding = self.tokenizer(prompt, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        labels = self.tokenizer(response, truncation=True, padding="max_length", max_length=256, return_tensors="pt").input_ids
        encoding["labels"] = labels
        return {key: val.squeeze() for key, val in encoding.items()}

# Initialize model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add special tokens if needed
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
model.resize_token_embeddings(len(tokenizer))

# Prepare dataset
train_dataset = CustomDataset(training_data, tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up training arguments with reduced epochs and batch size
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=1,
    num_train_epochs=1,  # Reduced epochs
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,  # Limits the number of saved checkpoints
    gradient_accumulation_steps=8,  # Helps if you reduce the batch size
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Function to generate responses based on user input
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=100,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Interactive loop for user input
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        break
    response = generate_response(user_input)
    print(f"AI: {response}")


Step,Training Loss


You: What causes rainbows ?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


AI: What causes rainbows?
The problem with rainbows is that causes rainbows, is that they're not made from resin. The resin of the resin is made from resin resin, so that is what causes the rainbow. It's actually the resin that makes rainbows.

The rainbow is a very important part of the rainbow. It's the rainbow that makes rainbows. It's the rainbow that makes the rainbow, of course, rainbows.

So, you know, there
You: exit


In [8]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Expanded dataset for better fine-tuning
training_data = [
    {"prompt": "What causes rainbows?", "response": "Rainbows are caused by the refraction, dispersion, and reflection of light in water droplets, resulting in a spectrum of light appearing in the sky."},
    {"prompt": "How do vaccines work?", "response": "Vaccines work by stimulating the body's immune system to recognize and fight specific pathogens, creating immunity without causing the disease."},
    {"prompt": "Explain the theory of relativity?", "response": "The theory of relativity, developed by Albert Einstein, describes how time and space are interconnected and how they are affected by the presence of mass and energy."},
    # Add more diverse and detailed data here
]

# Dataset class to prepare data for training
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]["prompt"]
        response = self.data[idx]["response"]
        encoding = self.tokenizer(prompt, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        labels = self.tokenizer(response, truncation=True, padding="max_length", max_length=256, return_tensors="pt").input_ids
        encoding["labels"] = labels
        return {key: val.squeeze() for key, val in encoding.items()}

# Initialize model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add special tokens if needed
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
model.resize_token_embeddings(len(tokenizer))

# Prepare dataset
train_dataset = CustomDataset(training_data, tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up training arguments with more epochs
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=3,  # Increased epochs
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    gradient_accumulation_steps=4,  # Adjusted to manage batch size
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Function to generate responses based on user input
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=100,
        num_return_sequences=1,
        temperature=0.8,  # Slightly higher temperature to increase randomness
        top_p=0.9,
        do_sample=True
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Interactive loop for user input
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        break
    response = generate_response(user_input)
    print(f"AI: {response}")


Step,Training Loss


You: What causes rainbows ?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


AI: What causes rainbows?


One explanation is that rainbows are found on trees. It is a common idea that the rainbows are on the side of trees, as they are very visible from the sky.

In fact, there are many kinds of rainbows on trees. The following are some examples.

There is a good explanation for rainbows on the side of trees.

A good explanation for rainbows on the side of trees

There is a good
You: How do vaccines work


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


AI: How do vaccines work? The answer is simple, no.

In theory, vaccines work. The CDC says it's not true.

And when you use it, they say it's the right answer. They say it's the wrong answer. It's the right answer.


It's the wrong answer. The CDC said it's the right answer.
it's the wrong answer.

So the CDC says it's the right answer. It's the right answer
You: Explain theory of relativity ?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


AI: Explain theory of relativity?

Lecture on relativity

Lecture on relativity - Introduction

Lecture on relativity - Introduction

Lecture on relativity - Introduction

Lecture on relativity - Introduction

Lecture on relativity - Introduction

Lecture on relativity - Introduction

Lect on relativity - Introduction

Lecture on relativity - Introduction
Lecture on relativity - Introduction

Lect on relativity - Introduction
You: exit


In [9]:
import json
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset

# Expanded and refined dataset for better fine-tuning
training_data = [
    {"prompt": "What causes rainbows?", "response": "Rainbows are caused by the refraction, dispersion, and reflection of light in water droplets, resulting in a spectrum of light appearing in the sky."},
    {"prompt": "How do vaccines work?", "response": "Vaccines work by stimulating the body's immune system to recognize and fight specific pathogens, creating immunity without causing the disease."},
    {"prompt": "Explain the theory of relativity.", "response": "The theory of relativity, developed by Albert Einstein, explains how space and time are linked for objects moving at a consistent speed in a straight line. It introduced the concept that time is relative, depending on the observer's speed."},
    # Add more diverse and accurate examples
]

# Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]["prompt"]
        response = self.data[idx]["response"]
        encoding = self.tokenizer(prompt, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        labels = self.tokenizer(response, truncation=True, padding="max_length", max_length=256, return_tensors="pt").input_ids
        encoding["labels"] = labels
        return {key: val.squeeze() for key, val in encoding.items()}

# Initialize model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add special tokens if needed
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
model.resize_token_embeddings(len(tokenizer))

# Prepare dataset
train_dataset = CustomDataset(training_data, tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up training arguments with more epochs
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=2,
    num_train_epochs=5,  # Increased epochs for better training
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    gradient_accumulation_steps=4,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-model')

# Function to generate responses
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=256)
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=150,
        num_return_sequences=1,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Interactive loop for user input
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        break
    response = generate_response(user_input)
    print(f"AI: {response}")


Step,Training Loss


You: What causes rainbows ?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


AI: What causes rainbows?

This is a very common phenomenon. It occurs when a person or animals show signs of distress. The person or animals show signs of distress. This is usually caused by the sun. The person or animals show signs of distress. The person or animals show signs of distress.

There is a common cause of rainbows, but it is not always the cause of rainbows. The cause of rainbows.

It is an indicator of a weather system or situation. It is a sign of a system.

Rainbows are a general sign of a weather system. They are a general sign of a system.

In the case of rainbows, the weather system is usually the weather system.
You: exit


In [10]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Define the dataset class
class SimpleDataset(Dataset):
    def __init__(self, tokenizer, texts, max_length=512):
        self.input_ids = []
        self.attention_masks = []

        for text in texts:
            encodings = tokenizer(text, truncation=True, padding='max_length', max_length=max_length)
            self.input_ids.append(torch.tensor(encodings['input_ids']))
            self.attention_masks.append(torch.tensor(encodings['attention_mask']))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.input_ids[idx],
        }

# Training data
train_texts = [
    "Rainbows are formed by the reflection, refraction, and dispersion of light in water droplets.",
    "Photosynthesis is the process by which green plants and some other organisms use sunlight to synthesize foods with the help of chlorophyll.",
    "The theory of relativity usually encompasses two interrelated theories by Albert Einstein: special relativity and general relativity."
]

# Initialize tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add pad token if it doesn't exist
tokenizer.pad_token = tokenizer.eos_token

# Create the dataset
dataset = SimpleDataset(tokenizer, train_texts)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine-tuned-model')
tokenizer.save_pretrained('./fine-tuned-tokenizer')

# Interactive loop for user input
model.eval()
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    inputs = tokenizer(user_input, return_tensors='pt', padding=True, truncation=True)
    outputs = model.generate(**inputs, max_length=150, pad_token_id=tokenizer.eos_token_id)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"AI: {response}")


Step,Training Loss


You: What causes rainbows ?
AI: What causes rainbows?

Rainbows are a type of light that can be seen in the sky. They are usually seen in the sky when the sun is shining.

Rainbows are a type of light that can be seen in the sky. They are usually seen in the sky when the sun is shining.
You: How do vaccines work ?
AI: How do vaccines work?

The answer is that vaccines work by preventing the development of new diseases.

The vaccine is a vaccine that is administered to the body to prevent the development of new diseases.

The vaccine is a vaccine that is administered to the body to prevent the development of new diseases.

The vaccine is a vaccine that is administered to the body to prevent the development of new diseases.

The vaccine is a vaccine that is administered to the body to prevent the development of new diseases.

The vaccine is a vaccine that is administered to the body to prevent the development of new diseases.

The vaccine is a vaccine that is administered to the body to 