In [1]:
!pip install -U pip setuptools wheel
!pip install transformers==4.41.0
!pip install numba==0.60.0
!pip install cuml-cu12==25.2.0 \
             cudf-cu12==25.2.0 \
             --extra-index-url=https://pypi.nvidia.com
!pip install fsspec==2024.2.0
!pip install datasets
!pip uninstall -y transformers peft
!pip install transformers==4.39.3 peft==0.10.0 datasets fsspec==2024.2.0
!pip install --upgrade transformers accelerate
!pip show transformers cuml cudf numba fsspec datasets
!pip install huggingface_hub[hf_xet]


Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Collecting setuptools
  Downloading setuptools-78.1.0-py3-none-any.whl.metadata (6.6 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading setuptools-78.1.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 75.1.0
    Uninstalling setuptools-75.1.0:
      Successfully uninstalled setuptools-75.1.0
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are instal

In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [3]:

import os
import pandas as pd
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    TrainerCallback
)

#  Combining CSV files into one text file
file_paths = [
    "/kaggle/input/linguogen-text-generation-dataset/Animals.csv",
    "/kaggle/input/linguogen-text-generation-dataset/Books.csv",
    "/kaggle/input/linguogen-text-generation-dataset/Climate.csv",
    "/kaggle/input/linguogen-text-generation-dataset/Environment.csv",
    "/kaggle/input/linguogen-text-generation-dataset/Friends.csv",
    "/kaggle/input/linguogen-text-generation-dataset/Hospital.csv",
    "/kaggle/input/linguogen-text-generation-dataset/Movies.csv",
    "/kaggle/input/linguogen-text-generation-dataset/Religion.csv",
    "/kaggle/input/linguogen-text-generation-dataset/School.csv",
    "/kaggle/input/linguogen-text-generation-dataset/Space.csv"
]

all_text = ""

for path in file_paths:
    df = pd.read_csv(path)
    for col in df.select_dtypes(include="object").columns:
        all_text += "\n".join(df[col].dropna().astype(str)) + "\n"

with open("combined_dataset.txt", "w", encoding="utf-8") as f:
    f.write(all_text)

# Step 4: Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Required for GPT2
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Step 5: Create dataset
def load_dataset(tokenizer, file_path, block_size=128):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

def get_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

train_dataset = load_dataset(tokenizer, "combined_dataset.txt")
data_collator = get_data_collator(tokenizer)
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=1000,
    save_total_limit=1,
    logging_steps=5,
    logging_first_step=True,
    report_to="none"
)

# Step 6: Custom Callback to record epoch-wise loss
class LossLogger(TrainerCallback):
    def __init__(self):
        self.logs = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if 'loss' in logs:
            self.logs.append({'epoch': round(state.epoch, 2), 'loss': logs['loss']})

loss_logger = LossLogger()

# Step 7: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    callbacks=[loss_logger]
)

# Step 8: Train
trainer.train()

# Step 9: Display loss logs
loss_df = pd.DataFrame(loss_logger.logs)
# Keep only the last log of each epoch
epoch_losses = loss_df.groupby("epoch").last().reset_index()
display(epoch_losses)


2025-04-10 09:34:29.756603: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744277669.928993      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744277669.979040      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,1.5255
5,1.4643
10,1.1314
15,1.3374
20,1.1814
25,1.2143
30,1.0821
35,0.9797


Unnamed: 0,epoch,loss
0,0.08,1.5255
1,0.42,1.4643
2,0.83,1.1314
3,1.25,1.3374
4,1.67,1.1814
5,2.08,1.2143
6,2.5,1.0821
7,2.92,0.9797


In [4]:
model_path = "./gpt2-finetuned"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [5]:
from transformers import pipeline

generator = pipeline("text-generation", model=model_path, tokenizer=model_path)
prompt = "My favorite book is"
outputs = generator(prompt, max_length=100, num_return_sequences=1)

print(outputs[0]['generated_text'])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


My favorite book is by the late Samuel Beckett, which goes without saying that he was not the kind of one who would be open to the idea of a world with no human beings around it.


For me, fantasy is always about characters; I try to keep a close eye on my characters' world in order to keep them interesting and believable. What follows is not a novel and is not a spy novel - the characters in it are also very different from one another. They have unique
