In [1]:
!pip install transformers
!pip install transformers datasets nltk




In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import nltk
from nltk.corpus import gutenberg

import os
os.environ["WANDB_DISABLED"] = "true"



  from .autonotebook import tqdm as notebook_tqdm





In [12]:

# Step 1: Download the NLTK Gutenberg corpus
nltk.download('gutenberg')

# Step 2: Load a few books from the Gutenberg corpus
selected_books = ['blake-poems.txt', 'milton-paradise.txt ', 'whitman-leaves.txt', 'carroll-alice.txt', 'chesterton-thursday.txt', 'austen-sense.txt']  # Add/remove as needed  'carroll-alice.txt'
corpus = ""

for book in selected_books:
    corpus += gutenberg.raw(book)

# Save the combined corpus to a text file
with open("nltk_gutenberg_poems.txt", "w") as f:
    f.write(corpus)

# Step 3: Load the GPT-2 tokenizer and model
model_name = "gpt2"  # Smaller GPT-2 model for Colab
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Step 4: Prepare the dataset for training
def load_text_dataset(file_path, tokenizer, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

def create_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

# Load and tokenize the dataset
train_dataset = load_text_dataset("nltk_gutenberg_poems.txt", tokenizer)
data_collator = create_data_collator(tokenizer)

# Step 5: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./gpt2-fine-tuned",  # Directory for model checkpoints
    overwrite_output_dir=True,
    num_train_epochs=6,             # Number of fine-tuning epochs
    per_device_train_batch_size=2,  # Adjust batch size based on available memory
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",           # Directory for logs
    prediction_loss_only=True
)

# Step 6: Fine-Tune the Model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
      # Include validation dataset
)

trainer.train()

# Save the fine-tuned model
model.save_pretrained("./gpt2-fine-tuned-poem")
tokenizer.save_pretrained("./gpt2-fine-tuned-poem")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  3%|▎         | 500/14694 [09:36<3:23:00,  1.17it/s]

{'loss': 4.0334, 'grad_norm': 5.648264408111572, 'learning_rate': 4.82986252892337e-05, 'epoch': 0.2}


  7%|▋         | 1000/14694 [16:51<3:34:27,  1.06it/s]

{'loss': 3.8608, 'grad_norm': 4.26663875579834, 'learning_rate': 4.659725057846741e-05, 'epoch': 0.41}


 10%|█         | 1500/14694 [24:07<3:03:08,  1.20it/s]

{'loss': 3.7953, 'grad_norm': 3.741769790649414, 'learning_rate': 4.48958758677011e-05, 'epoch': 0.61}


 14%|█▎        | 2000/14694 [31:25<3:14:13,  1.09it/s]

{'loss': 3.7213, 'grad_norm': 3.9264845848083496, 'learning_rate': 4.3194501156934806e-05, 'epoch': 0.82}


 17%|█▋        | 2500/14694 [38:45<2:52:23,  1.18it/s]

{'loss': 3.7028, 'grad_norm': 3.8675878047943115, 'learning_rate': 4.149312644616851e-05, 'epoch': 1.02}


 20%|██        | 3000/14694 [46:02<2:48:20,  1.16it/s]

{'loss': 3.4385, 'grad_norm': 5.010801315307617, 'learning_rate': 3.9791751735402204e-05, 'epoch': 1.22}


 24%|██▍       | 3500/14694 [55:34<3:59:48,  1.29s/it]

{'loss': 3.4163, 'grad_norm': 4.26170539855957, 'learning_rate': 3.809037702463591e-05, 'epoch': 1.43}


 27%|██▋       | 4000/14694 [1:04:02<2:41:31,  1.10it/s]

{'loss': 3.4581, 'grad_norm': 4.675246715545654, 'learning_rate': 3.638900231386961e-05, 'epoch': 1.63}


 31%|███       | 4500/14694 [1:11:14<2:26:18,  1.16it/s]

{'loss': 3.4097, 'grad_norm': 3.9337430000305176, 'learning_rate': 3.4687627603103305e-05, 'epoch': 1.84}


 34%|███▍      | 5000/14694 [1:18:29<2:23:37,  1.12it/s]

{'loss': 3.4057, 'grad_norm': 4.362245559692383, 'learning_rate': 3.298625289233701e-05, 'epoch': 2.04}


 37%|███▋      | 5500/14694 [1:25:39<2:06:59,  1.21it/s]

{'loss': 3.2051, 'grad_norm': 4.745555877685547, 'learning_rate': 3.128487818157071e-05, 'epoch': 2.25}


 41%|████      | 6000/14694 [1:32:49<2:03:07,  1.18it/s]

{'loss': 3.1994, 'grad_norm': 4.564483165740967, 'learning_rate': 2.9583503470804413e-05, 'epoch': 2.45}


 44%|████▍     | 6500/14694 [1:39:58<2:05:03,  1.09it/s]

{'loss': 3.2379, 'grad_norm': 5.0460686683654785, 'learning_rate': 2.7882128760038112e-05, 'epoch': 2.65}


 48%|████▊     | 7000/14694 [1:47:11<1:52:02,  1.14it/s]

{'loss': 3.2605, 'grad_norm': 4.659337520599365, 'learning_rate': 2.618075404927181e-05, 'epoch': 2.86}


 51%|█████     | 7500/14694 [1:54:25<1:42:05,  1.17it/s]

{'loss': 3.1854, 'grad_norm': 4.51217794418335, 'learning_rate': 2.4479379338505513e-05, 'epoch': 3.06}


 54%|█████▍    | 8000/14694 [2:01:37<1:36:23,  1.16it/s]

{'loss': 3.0534, 'grad_norm': 5.526889801025391, 'learning_rate': 2.2778004627739213e-05, 'epoch': 3.27}


 58%|█████▊    | 8500/14694 [2:08:45<1:27:31,  1.18it/s]

{'loss': 3.0858, 'grad_norm': 4.96920919418335, 'learning_rate': 2.1076629916972915e-05, 'epoch': 3.47}


 61%|██████    | 9000/14694 [2:15:52<1:19:32,  1.19it/s]

{'loss': 3.061, 'grad_norm': 4.688619613647461, 'learning_rate': 1.9375255206206618e-05, 'epoch': 3.67}


 65%|██████▍   | 9500/14694 [2:23:00<1:12:20,  1.20it/s]

{'loss': 3.0761, 'grad_norm': 4.472217082977295, 'learning_rate': 1.7673880495440317e-05, 'epoch': 3.88}


 68%|██████▊   | 10000/14694 [2:30:10<1:07:34,  1.16it/s]

{'loss': 3.0045, 'grad_norm': 5.782901763916016, 'learning_rate': 1.5972505784674016e-05, 'epoch': 4.08}


 71%|███████▏  | 10500/14694 [2:37:49<1:00:48,  1.15it/s]

{'loss': 2.9538, 'grad_norm': 5.643039226531982, 'learning_rate': 1.4271131073907717e-05, 'epoch': 4.29}


 75%|███████▍  | 11000/14694 [2:45:02<53:11,  1.16it/s]  

{'loss': 2.9529, 'grad_norm': 5.538005352020264, 'learning_rate': 1.256975636314142e-05, 'epoch': 4.49}


 78%|███████▊  | 11500/14694 [2:52:16<45:50,  1.16it/s]  

{'loss': 2.9425, 'grad_norm': 5.63086462020874, 'learning_rate': 1.086838165237512e-05, 'epoch': 4.7}


 82%|████████▏ | 12000/14694 [2:59:26<38:16,  1.17it/s]  

{'loss': 2.9583, 'grad_norm': 5.16098690032959, 'learning_rate': 9.16700694160882e-06, 'epoch': 4.9}


 85%|████████▌ | 12500/14694 [3:06:35<33:41,  1.09it/s]

{'loss': 2.8972, 'grad_norm': 6.050135612487793, 'learning_rate': 7.465632230842522e-06, 'epoch': 5.1}


 88%|████████▊ | 13000/14694 [3:13:43<24:30,  1.15it/s]

{'loss': 2.8427, 'grad_norm': 5.879138469696045, 'learning_rate': 5.764257520076222e-06, 'epoch': 5.31}


 92%|█████████▏| 13500/14694 [3:20:57<16:48,  1.18it/s]

{'loss': 2.8544, 'grad_norm': 5.3724751472473145, 'learning_rate': 4.062882809309922e-06, 'epoch': 5.51}


 95%|█████████▌| 14000/14694 [3:28:08<09:44,  1.19it/s]

{'loss': 2.8741, 'grad_norm': 5.560638904571533, 'learning_rate': 2.3615080985436233e-06, 'epoch': 5.72}


 99%|█████████▊| 14500/14694 [3:35:18<02:48,  1.15it/s]

{'loss': 2.879, 'grad_norm': 6.447165489196777, 'learning_rate': 6.601333877773241e-07, 'epoch': 5.92}


100%|██████████| 14694/14694 [3:38:07<00:00,  1.12it/s]


{'train_runtime': 13087.0281, 'train_samples_per_second': 2.245, 'train_steps_per_second': 1.123, 'train_loss': 3.229136655845658, 'epoch': 6.0}


('./gpt2-fine-tuned-poem\\tokenizer_config.json',
 './gpt2-fine-tuned-poem\\special_tokens_map.json',
 './gpt2-fine-tuned-poem\\vocab.json',
 './gpt2-fine-tuned-poem\\merges.txt',
 './gpt2-fine-tuned-poem\\added_tokens.json')

In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-fine-tuned-poem")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-fine-tuned-poem")

# Define a seed text
seed_text = "Night is young"

# Encode the input text
input_ids = tokenizer.encode(seed_text, return_tensors='pt')

# Generate text
output = model.generate(
    input_ids,
    max_length=100,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    repetition_penalty=2.0,
    top_p=0.9,
    temperature=1.0,
    do_sample=True,
)

# Decode and print the result
generated_poem = tokenizer.decode(output[0], skip_special_tokens=True)
print("\nGenerated Poem:")
print(generated_poem)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generated Poem:
Night is young man, with his mighty head on high; 
Who doth sit by in recess without watch?  Where are the lights?" -- cried  Thee.

 THEE'ENDER INTRODUCTION XXXIII [Volunteers] Volume I:--Year and Night
 (from under a cloud of stars,) year after day we move.--The sky bright-side soon darkens! O moon!" says she aloud!--[On receiving this command from Heaven.] This


In [22]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Specify the directory where the model and tokenizer are saved
saved_model_dir = "C:/Users/tejas/Downloads/DSAI First Sem Tejaswi/Knowledge Processing/Project/fine_tuned_gpt"

# Load the model
model = AutoModelForCausalLM.from_pretrained(saved_model_dir)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)


In [40]:
from transformers import pipeline

# Load the pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text from seed input
# seed_text = "Colorful flowers"
# seed_text = "Tulips are pretty"
seed_text= "Ghosts are scary"
# seed_text = "This world is beautiful"
# seed_text= "Life is beautiful"
output = generator(seed_text, max_length=300, num_return_sequences=1)

print("Generated Text:")
print(output[0]["generated_text"])


Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
Ghosts are scary,' said the Gryphon. 'They'll take a
man like that and eat his kid: that's the sort of thing!'

'But a man like that,' the Mock Turtle replied, 'would eat only half his
children and then go back to his own party. And if they've got to eat half
their own children again, then God-knows what they're going to do!'

'You might as well call them ghosts,' the Gryphon remarked.

'I'm afraid I'm not,' said the Mock Turtle. 'I should like to think
of them as a sort of modern family. There's something striking about such
a thing!'

'I haven't the least idea,' said the Mock Turtle, 'when you first see a
Ghost Sunday,' and he turned suddenly round to Alice. 'I've never seen a Sunday so old!'

'It was only because I've never seen him,' Alice exclaimed: 'and there's nothing in his shape
other than a very young head: he might well be a devil looking like him, if he wasn't so good
looking.'

'Come on!' said the Gryphon sharply, drawing his wand round the Gryphon's ear li

In [11]:
from google.colab import files
import shutil

# Compress the saved model directory
shutil.make_archive("fine_tuned_gpt", 'zip', output_dir)

# Download the compressed model
files.download("fine_tuned_gpt.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Path to your fine-tuned model in Google Drive
fine_tuned_model_path = "C:/Users/tejas/Downloads/DSAI First Sem Tejaswi/Knowledge Processing/Project/fine_tuned_gpt"

# Import Libraries
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.meteor_score import meteor_score
import nltk
import re

# Download necessary NLTK resources
nltk.download('punkt')

# Load the Fine-Tuned Model and Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(fine_tuned_model_path)
model = GPT2LMHeadModel.from_pretrained(fine_tuned_model_path)

# Normalize text for consistency
def normalize_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Generate Text Using the Fine-Tuned Model with Beam Search
def generate_text(seed_text, max_length=100, num_beams=5):
    input_ids = tokenizer.encode(seed_text, return_tensors='pt')
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,  # Use beam search to increase output quality
        no_repeat_ngram_size=3,
        top_k=100,
        top_p=0.9,
        temperature=0.7,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        num_return_sequences=1
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# METEOR Score Calculation with Tokenization and Stemming
def compute_meteor_score(reference, generated):
    reference_tokens = nltk.word_tokenize(normalize_text(reference))
    generated_tokens = nltk.word_tokenize(normalize_text(generated))
    
    # Optional: Apply stemming to improve matching
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()
    reference_tokens = [stemmer.stem(token) for token in reference_tokens]
    generated_tokens = [stemmer.stem(token) for token in generated_tokens]
    
    return meteor_score([reference_tokens], generated_tokens)

# Example Reference Texts
reference_texts = [
    "Life is nice, like eating rice. The sun is bright, it gives me light. Birds fly high, way up in the sky. Flowers are cool, they grow by the pool. Life is great, don't hesitate. Unless you're late, then blame your fate. Roses are red, violets are blue, Life is beautiful, and that’s... true?"  ]

# Generate Text Using a Seed
seed_text = "Life is beautiful"
generated_texts = [generate_text(seed_text, max_length=100, num_beams=5) for _ in range(len(reference_texts))]

# # Compute METEOR Scores
# meteor_scores = [compute_meteor_score(ref, gen) for ref, gen in zip(reference_texts, generated_texts)]
# average_meteor = sum(meteor_scores) / len(meteor_scores)

# # Print METEOR Scores and Average METEOR Score
# print("Generated Texts:", generated_texts)
# print("METEOR Scores:", meteor_scores)
# print("Average METEOR Score:", average_meteor)

# Compute METEOR Scores
meteor_scores = [compute_meteor_score(ref, gen) for ref, gen in zip(reference_texts, generated_texts)]
average_meteor = sum(meteor_scores) / len(meteor_scores)

# Compute Perplexity for Each Generated Text
perplexities = [calculate_perplexity(model, tokenizer, gen) for gen in generated_texts]
average_perplexity = sum(perplexities) / len(perplexities)

# Print Results
print("Generated Texts:", generated_texts)
print("METEOR Scores:", meteor_scores)
print("Average METEOR Score:", average_meteor)
print("Perplexities of Generated Texts:", perplexities)
print("Average Perplexity:", average_perplexity)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Texts: ['Life is beautiful to me, and I love it\n    with all my heart.\n\n\n\n}  Song of the Sea-Tide\n\nSong of the sea-tide,\nI sing the song of the tide,\nIt is not the sea alone, it is the whole race,\nThe whole race of men, women, cities, farms, farms of the earth.\n\nI do not know what it is to be a man or woman,\nBut']
METEOR Scores: [0.12841091492776888]
Average METEOR Score: 0.12841091492776888
Perplexities of Generated Texts: [3.3819446563720703]
Average Perplexity: 3.3819446563720703
