In [2]:
# Install required packages
!pip install -U transformers datasets

# Upload your plain text file
from google.colab import files
uploaded = files.upload()  # Upload your .txt file here

import os
file_path = list(uploaded.keys())[0]  # Automatically get the uploaded filename

# Load the file into a dataset
from datasets import Dataset

# Read text file
with open(file_path, encoding='utf-8') as f:
    lines = f.readlines()

# Create dataset
data = Dataset.from_dict({"text": lines})

# Load GPT-2 tokenizer and model
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Avoid padding error
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_data = data.map(tokenize_function, batched=True)
tokenized_data.set_format("torch", columns=["input_ids", "attention_mask"])

# Training setup
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
trainer.train()




Saving sample3.txt to sample3 (1).txt


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvarshi3107[0m ([33mvarshi3107-code-craft-infotech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=3, training_loss=5.72910753885905, metrics={'train_runtime': 339.5777, 'train_samples_per_second': 0.009, 'train_steps_per_second': 0.009, 'total_flos': 783876096000.0, 'train_loss': 5.72910753885905, 'epoch': 3.0})

In [3]:
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [4]:
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

# Create generation pipeline
text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text from a custom prompt
prompt = "Once upon a time"
generated = text_generator(prompt, max_length=100, num_return_sequences=1)

print(generated[0]["generated_text"])


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Once upon a time, the game would be a disaster for anyone who was not a fan of the series. They would be forced to fight for their lives, with the player's ability to heal them in return becoming a key feature of the game. This would lead to the game being somewhat boring at times, but eventually, the players would settle into their roles and make up their own stories. These stories would take place in the game world, but in the form of a series of endings. In the case of the game, these endings would be based off of the actual story, instead of the pre-existing characters.

The game's storyline had a rather straightforward structure. The player would decide what the story was. There were nine endings to the story. The player's choice to fight the game's end would determine how many items were available for their quest, how much time was left on their quest, and how much time was left on their quest.

The ending of the game would be revealed in a series of events that would take place 