<a href="https://colab.research.google.com/github/tarunsha009/LLM-Learning/blob/main/gpt2_finetuned_cs_books.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate transformers[sentencepiece]

In [None]:
!pip install pypdf

In [None]:
from pypdf import PdfReader

def extract_text_from_pdf(pdf_path, page_separator="\n"):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_path: Path of the PDF file to read.
        page_separator: String placed between the text of consecutive pages.
            Defaults to a newline so that the last word of one page is not
            fused with the first word of the next page.

    Returns:
        The text of all pages, in page order, joined by ``page_separator``.
    """
    with open(pdf_path, "rb") as file:
        pdf_reader = PdfReader(file)
        # Collect the per-page text and join once instead of growing a string
        # with `+=` on every page. `or ""` keeps the join safe if a page
        # yields no text at all.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return page_separator.join(page_texts)

# Extract the raw text of each source book.
book1 = extract_text_from_pdf("/content/Building Microservices - Designing Fine-Grained Systems.pdf")
book2 = extract_text_from_pdf("/content/Building Microservices.pdf")
book3 = extract_text_from_pdf("/content/Clean Architecture A Craftsman Guide to Software Structure and Design.pdf")
book4 = extract_text_from_pdf("/content/Clean.Code.A.Handbook.of.Agile.Software.Craftsmanship.pdf")
book5 = extract_text_from_pdf("/content/Guru's SDF (1).pdf")


# Concatenate all books into a single training corpus.
combined_text = book1 + book2 + book3 + book4 + book5

# Write the corpus to disk. The encoding is given explicitly so the write does
# not depend on the platform's default encoding (the extracted text may contain
# non-ASCII characters).
with open("cs_books.txt", "w", encoding="utf-8") as f:
    f.write(combined_text)

In [None]:
# Sanity check: number of characters in the combined corpus.
print(len(combined_text))

In [None]:
import re

def clean_text(text):
    """Flatten newlines to spaces and strip unwanted symbols from ``text``.

    Two regex substitutions are applied in order:
      1. every newline becomes a single space;
      2. every character that is not a word character, whitespace, or one of
         ``. , ; : ! ?`` is removed.

    Returns the resulting string.
    """
    substitutions = (
        (r'\n', ' '),
        (r'[^\w\s.,;:!?]', ''),
    )
    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result

# Normalize the whole corpus once; later cells read `cleaned_text`.
cleaned_text = clean_text(combined_text)


In [None]:
from transformers import AutoTokenizer
# Load the pretrained "gpt2" tokenizer and reuse its end-of-sequence token
# as the padding token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Slice the cleaned corpus into consecutive pieces of at most 1000 characters.
chunks = [
    cleaned_text[start:start + 1000]
    for start in range(0, len(cleaned_text), 1000)
]

In [None]:
from datasets import Dataset

# Wrap the text chunks in a Dataset with a single "text" column.
dataset = Dataset.from_dict({"text": chunks})


def tokenize_function(examples):
    """Tokenize a batch of text chunks and attach language-modeling labels.

    Args:
        examples: Batch mapping with a "text" entry holding the strings to
            tokenize (the shape passed by ``dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output (truncated/padded to 512 tokens, as PyTorch
        tensors) with an added "labels" entry. Labels equal ``input_ids``
        except at padding positions, which are set to -100.
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
        return_tensors="pt",
        padding="max_length"
    )
    # Labels start as a copy of the inputs. No shifting is done here: the
    # causal-LM head performs the one-position shift itself when computing
    # the loss.
    labels = tokenized_inputs["input_ids"].clone()
    # Exclude padding from the loss. The pad token is the EOS token, so the
    # attention mask (not the token id) is used to find padded positions;
    # -100 is the label value the loss ignores.
    labels[tokenized_inputs["attention_mask"] == 0] = -100
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run the tokenizer over the whole dataset, one batch of examples at a time.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

# Inspect the columns of the first example.
# Expected to include: input_ids, attention_mask, labels
print(tokenized_datasets[0].keys())

In [None]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

import torch  # imported here only to check whether a CUDA device is present

# Start from the pretrained "gpt2" checkpoint.
model = GPT2LMHeadModel.from_pretrained("gpt2")

training_args = TrainingArguments(
    output_dir="gpt2-cs-books",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    learning_rate=5e-5,
    # Mixed-precision training needs a GPU; requesting it unconditionally
    # makes this cell fail on a CPU-only runtime, so enable it only when
    # CUDA is available.
    fp16=torch.cuda.is_available(),
    save_steps=10_000,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# Kept as the last expression so the training summary is displayed as the cell output.
trainer.train()

In [None]:
def generate_text(prompt, model, tokenizer, max_length=100):
    """Sample a continuation of ``prompt`` from ``model`` and return it as text.

    Args:
        prompt: Text the generation starts from.
        model: Model exposing ``device``, ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.

    Note:
        Calls ``model.eval()``, so the model is left in evaluation mode.
    """
    # Tokenize through __call__ (not encode) so an attention mask is produced
    # alongside the input ids.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # A model coming straight out of training may still be in training mode;
    # switch to evaluation mode so dropout does not perturb generation.
    model.eval()

    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,       # Increase for more randomness (try 0.7-1.0)
        top_k=50,              # Sample from top 50 likely next tokens
        top_p=0.95,            # Nucleus sampling: picks from top tokens covering 95% probability
        repetition_penalty=1.2,  # Penalize repeated phrases (values >1.0 reduce repetition)
        do_sample=True,        # Enable sampling (required for temperature/top_k/top_p)
        pad_token_id=tokenizer.eos_token_id,  # Explicit pad id, matching generate_high_quality_text
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Try the model on a short Clean Code prompt.
prompt = "The key principles of Clean Code are:"
generated_text = generate_text(prompt=prompt, model=model, tokenizer=tokenizer)
print(generated_text)

In [None]:
# Same generator, with an instruction-style prompt.
prompt = "Explain the key principles of Clean Code in computer programming:"
generated_text = generate_text(prompt=prompt, model=model, tokenizer=tokenizer)
print(generated_text)

In [None]:
# Save the model and the tokenizer into the same directory.
model.save_pretrained("gpt2-finetuned-cs-books")
tokenizer.save_pretrained("gpt2-finetuned-cs-books")

In [None]:
def generate_high_quality_text(prompt, model, tokenizer, max_length=150):
    """Sample a continuation of ``prompt`` using an explicit attention mask.

    Args:
        prompt: Text the generation starts from.
        model: Model exposing ``device``, ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded with special tokens skipped.

    Note:
        Calls ``model.eval()``, so the model is left in evaluation mode.
    """
    # Encode the prompt; calling the tokenizer yields both the input ids and
    # the attention mask. The prompt is truncated to at most 512 tokens.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    ).to(model.device)

    # A model coming straight out of training may still be in training mode;
    # switch to evaluation mode so dropout does not perturb generation.
    model.eval()

    # `early_stopping` is intentionally not passed: it is a beam-search
    # option and has no effect on single-beam sampling other than a warning.
    output = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,           # Balanced creativity
        top_k=40,                  # Focus on top probable tokens
        top_p=0.9,                 # Nucleus sampling
        repetition_penalty=1.3,     # Reduce repetition
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example call: generate from an instruction-style prompt and print the result.
prompt = "Explain the key principles of Clean Code in computer programming:"
print(generate_high_quality_text(prompt=prompt, model=model, tokenizer=tokenizer))