In [None]:
!pip install transformers datasets peft accelerate bitsandbytes


import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
    set_seed
)
from peft import get_peft_model, LoraConfig, TaskType
import json
import zipfile
import os
from google.colab import files
import warnings
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides library deprecation/compatibility warnings
# that can matter when debugging training or save/load issues.
warnings.filterwarnings('ignore')

# Fix the random seed through transformers' set_seed helper so repeated runs
# are comparable.
set_seed(42)

Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


In [None]:
def load_and_format_data(csv_path):
    """Read the slogan CSV and turn each usable row into a training example.

    Rows whose ``Company`` or ``Slogans`` cell is missing (NaN) or blank after
    stripping are skipped, so the literal text "nan" never ends up in the
    training data.

    Args:
        csv_path: Path to a CSV file containing ``Company`` and ``Slogans``
            columns.

    Returns:
        list[dict]: One dict per usable row with the keys ``text`` (prompt +
        completion + ``" <|endoftext|>"``), ``prompt`` and ``completion``.

    Raises:
        KeyError: If either required column is absent from the CSV.
    """
    print("Loading and formatting data...")

    df = pd.read_csv(csv_path)
    print(f"Loaded {len(df)} rows from CSV")

    # pandas reads empty cells as NaN; str(NaN) would become the word "nan".
    usable = df.dropna(subset=['Company', 'Slogans'])

    formatted_data = []
    for raw_company, raw_slogan in zip(usable['Company'], usable['Slogans']):
        company = str(raw_company).strip()
        slogan = str(raw_slogan).strip()
        if not company or not slogan:
            # Whitespace-only cells carry no usable training signal.
            continue

        prompt = f"Generate a slogan for {company}:"
        completion = f" <slogan>{slogan}</slogan>"

        formatted_data.append({
            'text': prompt + completion + " <|endoftext|>",
            'prompt': prompt,
            'completion': completion
        })

    skipped = len(df) - len(formatted_data)
    if skipped:
        print(f"Skipped {skipped} rows with a missing company or slogan")

    print(f"Created {len(formatted_data)} training examples")
    return formatted_data

In [None]:
def prepare_dataset(formatted_data, tokenizer, max_length=128):
    """Tokenize the formatted examples into a dataset ready for the Trainer.

    Args:
        formatted_data: List of dicts as produced by ``load_and_format_data``;
            only the ``text`` field is tokenized.
        tokenizer: Callable tokenizer applied to batches of ``text`` strings.
        max_length: Length passed to the tokenizer for truncation and
            ``"max_length"`` padding.

    Returns:
        The mapped dataset; the original columns are removed and a ``labels``
        column is added as a copy of ``input_ids``.
    """
    print("Preparing dataset...")

    raw_dataset = Dataset.from_pandas(pd.DataFrame(formatted_data))

    def _encode_batch(batch):
        # Called with a batch of rows because map() is run with batched=True.
        encoded = tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        # Causal LM objective: the targets start out as the inputs themselves.
        encoded["labels"] = encoded["input_ids"].copy()
        return encoded

    encoded_dataset = raw_dataset.map(
        _encode_batch,
        batched=True,
        remove_columns=raw_dataset.column_names,
    )

    print(f"Dataset prepared with {len(encoded_dataset)} examples")
    return encoded_dataset

In [None]:
def setup_model_with_lora(model_name="gpt2"):
    """Load a GPT-2 checkpoint, add the slogan marker tokens and wrap it with LoRA.

    Args:
        model_name: Name or path of the GPT-2 checkpoint to load.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model
        and ``tokenizer`` has EOS as its pad token plus the ``<slogan>`` /
        ``</slogan>`` special tokens.
    """
    print(f"Loading {model_name} model and tokenizer...")

    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # GPT-2 ships without a pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

    special_tokens = {"additional_special_tokens": ["<slogan>", "</slogan>"]}
    tokenizer.add_special_tokens(special_tokens)

    # The embedding matrix must grow to cover the newly added tokens.
    model.resize_token_embeddings(len(tokenizer))

    # NOTE(review): the embedding rows added for <slogan>/</slogan> are not
    # among the LoRA target modules below, so they presumably keep their
    # initial values during training. Confirm against the PEFT docs whether
    # `modules_to_save` (or an equivalent) is needed here.
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["c_attn", "c_proj", "c_fc"]
    )

    model = get_peft_model(model, lora_config)

    # Count in one pass; only parameters with requires_grad are trainable.
    trainable_params = 0
    total_params = 0
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    print("Model setup with LoRA complete!")
    print(f"Trainable parameters: {trainable_params:,}")
    print(f"Total parameters: {total_params:,}")

    return model, tokenizer

In [None]:
def train_model(model, tokenizer, train_dataset, output_dir="./slogan_model"):
    """Fine-tune ``model`` on ``train_dataset`` with the Hugging Face Trainer.

    Args:
        model: Model to train (the PEFT-wrapped model from
            ``setup_model_with_lora`` in this notebook).
        tokenizer: Tokenizer handed to the language-modeling data collator.
        train_dataset: Tokenized training dataset.
        output_dir: Directory where the Trainer writes checkpoints.

    Returns:
        The ``Trainer`` instance after ``train()`` has finished.
    """
    print("Setting up training...")

    # mlm=False selects causal (next-token) language modeling.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # fp16 mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only runtimes instead of failing while building the arguments.
    use_fp16 = torch.cuda.is_available()

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        warmup_steps=100,
        logging_steps=50,
        save_steps=500,
        save_total_limit=2,
        prediction_loss_only=True,
        learning_rate=5e-5,
        weight_decay=0.01,
        fp16=use_fp16,
        dataloader_pin_memory=False,
        remove_unused_columns=False,
        report_to="none",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=data_collator,
    )

    print("Starting training...")
    trainer.train()

    print("Training completed!")
    return trainer

In [None]:
def save_and_export_model(trainer, tokenizer, model_dir="./slogan_model"):
    """Save the trained model and tokenizer, write a summary and zip the result.

    The contents of ``model_dir`` are archived into ``slogan_model.zip`` in
    the current working directory, then a Colab browser download is attempted.

    Args:
        trainer: Trainer exposing ``save_model`` and ``state`` (with
            ``global_step`` and ``log_history``).
        tokenizer: Tokenizer exposing ``save_pretrained``.
        model_dir: Directory that receives the model, tokenizer and
            ``training_info.json``.

    Returns:
        None
    """
    print("Saving model...")

    trainer.save_model(model_dir)
    tokenizer.save_pretrained(model_dir)

    # The last log entry holds the training summary; tolerate an empty log.
    log_history = trainer.state.log_history
    last_entry = log_history[-1] if log_history else {}

    training_info = {
        "model_type": "GPT-2 with LoRA",
        "task": "Slogan Generation",
        "training_steps": trainer.state.global_step,
        "final_loss": last_entry.get("train_loss", "N/A")
    }

    with open(os.path.join(model_dir, "training_info.json"), "w") as f:
        json.dump(training_info, f, indent=2)

    zip_filename = "slogan_model.zip"
    zip_abspath = os.path.abspath(zip_filename)
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # The loop variable must not be called `files`: that would shadow the
        # google.colab `files` module used for the download below.
        for root, _dirs, filenames in os.walk(model_dir):
            for filename in filenames:
                file_path = os.path.join(root, filename)
                if os.path.abspath(file_path) == zip_abspath:
                    # Never add the archive to itself (model_dir == cwd).
                    continue
                arc_name = os.path.relpath(file_path, model_dir)
                zipf.write(file_path, arc_name)

    print(f"Model saved to {model_dir}")
    print(f"Zip file created: {zip_filename}")

    # Best effort: the browser download only works inside Colab.
    try:
        files.download(zip_filename)
        print("Zip file download started!")
    except Exception as exc:
        print(f"Automatic download unavailable: {exc}")
        print("Manual download: You can download the zip file from the file browser")

In [None]:
def generate_slogan(model, tokenizer, prompt, max_length=50, temperature=0.7, num_return_sequences=1):
    """Sample one or more slogans for ``prompt`` from the fine-tuned model.

    Generation stops at ``</slogan>`` when the tokenizer knows that token,
    otherwise at the tokenizer's EOS token. The slogan is cut out in token
    space (after ``<slogan>``, before ``</slogan>``), because decoding with
    ``skip_special_tokens=True`` removes those markers from the text.

    Args:
        model: Causal LM exposing ``eval``, ``device`` and ``generate``.
        tokenizer: Tokenizer used for encoding the prompt and decoding output.
        prompt: Prompt text, e.g. ``"Generate a slogan for Pepsi:"``.
        max_length: Maximum number of newly generated tokens.
        temperature: Sampling temperature.
        num_return_sequences: Number of samples to draw.

    Returns:
        list[str]: One stripped slogan string per returned sequence.
    """
    model.eval()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    vocab = tokenizer.get_vocab()
    open_id = vocab.get("<slogan>")
    close_id = vocab.get("</slogan>")
    eos_token_id = close_id if close_id is not None else tokenizer.eos_token_id

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            # Same budget as before, stated as "new tokens beyond the prompt".
            max_new_tokens=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.5,
            eos_token_id=eos_token_id
        )

    generated_slogans = []
    for output in outputs:
        # Drop the prompt by token count; slicing the decoded string by
        # len(prompt) breaks if decoding does not reproduce the prompt exactly.
        new_ids = output[prompt_len:].tolist()

        if open_id is not None and open_id in new_ids:
            new_ids = new_ids[new_ids.index(open_id) + 1:]
        if close_id is not None and close_id in new_ids:
            new_ids = new_ids[:new_ids.index(close_id)]

        slogan = tokenizer.decode(new_ids, skip_special_tokens=True).strip()

        # Fallback for tokenizers where the markers are ordinary text rather
        # than special tokens; a missing closing tag is tolerated.
        if "<slogan>" in slogan:
            slogan = slogan.split("<slogan>", 1)[1]
        if "</slogan>" in slogan:
            slogan = slogan.split("</slogan>", 1)[0]

        generated_slogans.append(slogan.strip())

    return generated_slogans


def load_model_for_inference(model_dir="./slogan_model", base_model_name="gpt2"):
    """Rebuild the fine-tuned model from a saved adapter directory.

    Args:
        model_dir: Directory holding the saved tokenizer and PEFT adapter.
        base_model_name: Name or path of the base GPT-2 checkpoint the adapter
            was trained on. Must match the ``model_name`` used for training;
            defaults to ``"gpt2"``.

    Returns:
        tuple: ``(model, tokenizer)`` with the model in eval mode and moved to
        CUDA when a GPU is available.
    """
    print("Loading fine-tuned model for inference...")

    tokenizer = GPT2Tokenizer.from_pretrained(model_dir)
    tokenizer.pad_token = tokenizer.eos_token  # ensure padding works

    base_model = GPT2LMHeadModel.from_pretrained(base_model_name)
    base_model.config.pad_token_id = tokenizer.eos_token_id

    # The saved tokenizer contains the added slogan tokens, so the base
    # embeddings must be resized before the adapter is attached.
    base_model.resize_token_embeddings(len(tokenizer))

    # Local import: PeftModel is not part of the notebook's top-level imports.
    from peft import PeftModel
    model = PeftModel.from_pretrained(base_model, model_dir)

    if torch.cuda.is_available():
        model = model.to("cuda")

    model.eval()
    print("Model and tokenizer loaded successfully!")

    return model, tokenizer

In [None]:
def main_training_pipeline(csv_path):
    """Run the full fine-tuning flow for one CSV file.

    Steps, in order: format the CSV rows, build the LoRA model and tokenizer,
    tokenize the examples, train, then save and export the result.

    Args:
        csv_path: Path to the slogan CSV passed to ``load_and_format_data``.

    Returns:
        tuple: The ``(model, tokenizer)`` pair created by
        ``setup_model_with_lora`` (the model has been trained by then).
    """
    print("Starting GPT-2 Slogan Generation Fine-tuning Pipeline")

    # Data formatting comes first so a bad CSV fails before the model loads.
    examples = load_and_format_data(csv_path)
    lora_model, tok = setup_model_with_lora()
    tokenized_examples = prepare_dataset(examples, tok)

    fitted_trainer = train_model(lora_model, tok, tokenized_examples)
    save_and_export_model(fitted_trainer, tok)

    print("Training pipeline completed successfully!")
    return lora_model, tok

In [None]:
# Driver cell: fine-tune on the CSV, then sample a few slogans from the model
# that is still in memory. Inside a notebook this guard is always true, so the
# cell runs the full training pipeline every time it is executed.
if __name__ == "__main__":

    # Expected in the current working directory, with `Company` and `Slogans`
    # columns (see load_and_format_data).
    csv_file_path = "slogans.csv"

    trained_model, trained_tokenizer = main_training_pipeline(csv_file_path)

    print("TESTING INFERENCE")

    # Same wording as the training prompts built in load_and_format_data.
    test_prompts = [
        "Generate a slogan for Pepsi:",
        "Generate a slogan for Nescafe:",
        "Generate a slogan for Red Bull:",
    ]

    # Draw three samples per prompt and print them as a numbered list.
    for prompt in test_prompts:
        print(f"\nPrompt: {prompt}")
        slogans = generate_slogan(trained_model, trained_tokenizer, prompt, num_return_sequences=3)
        for i, slogan in enumerate(slogans, 1):
            print(f"   {i}. {slogan}")

Starting GPT-2 Slogan Generation Fine-tuning Pipeline
Loading and formatting data...
Loaded 2466 rows from CSV
Created 2466 training examples
Loading gpt2 model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Model setup with LoRA complete!
Trainable parameters: 125,620,992
Total parameters: 125,620,992
Preparing dataset...


Map:   0%|          | 0/2466 [00:00<?, ? examples/s]

Dataset prepared with 2466 examples
Setting up training...
Starting training...


`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,6.6747
100,5.719
150,4.3667
200,3.979
250,3.8876
300,3.7874
350,3.7145
400,3.7106
450,3.6238
500,3.6346


Training completed!
Saving model...
Model saved to ./slogan_model
Zip file created: slogan_model.zip
Manual download: You can download the zip file from the file browser
Training pipeline completed successfully!
TESTING INFERENCE

Prompt: Generate a slogan for Pepsi:
   1. Roush. (1984) A dram of thirst that is made in the world's most powerful drinks, but also one where you can do it yourself!(1987), an inspiration to make life better.(1990). reelsaustic
   2. the great thing about life. (2010) spergels with hops and anxieties are so delicious it made me want to spend more time drinking them! michiel johmann The German beverage of choice in Europe, is brewed
   3. , The best of the good. (2007)The finest pint ever!(2008-)Fluidly named by Guinness in Scotland as part 3 of its portfolio .ometimes said to be both refreshing and inspiring.(2009-2011)-Totally

Prompt: Generate a slogan for Nescafe:
   1. Tasting nectar, experience. ( 2005 ) (2004)Nike of France's favourite sport drink is n

In [None]:
# Reload the fine-tuned model from disk instead of reusing the in-memory one
# from the training cell; requires ./slogan_model to have been written by
# save_and_export_model.
model, tokenizer = load_model_for_inference("./slogan_model")

# Prompts use the same wording as the training prompts.
new_prompts = [
    "Generate a slogan for Coca Cola:",
    "Generate a slogan for Nescafe:",
    "Generate a slogan for Pepsi:",
    "Generate a slogan for Nestle:",
    "Generate a slogan for Red Bull:"
]

# Three sampled slogans per prompt, separated by a dashed rule.
for prompt in new_prompts:
    print(f"Prompt: {prompt}")
    slogans = generate_slogan(model, tokenizer, prompt, num_return_sequences=3)
    for slogan in slogans:
        print(f"Generated: {slogan}")
    print("-" * 40)

Loading fine-tuned model for inference...
Model and tokenizer loaded successfully!
Prompt: Generate a slogan for Coca Cola:
Generated: The Best of the best. (1988) by Kavanagh, Ireland Ltd. (1992-1995).(1997), edited By JB Revell , London GPs .(2001)-2007.by John Miller and Anne Holleyen
Generated: I don't like soda. It's the least of our problems. We've always been drinkers, but now we're not drinking it at all (laughs).The biggest issue facing India is supply chain management and what its best advice would
Generated: (click on the image) (1990s!) The perfect blend of taste and flavor.
Drink responsibly, consume safely , abstain from alcohol or tobacco products.(1986-) Tasty drink you can enjoy yourself to-day . To
----------------------------------------
Prompt: Generate a slogan for Nescafe:
Generated: The best of the worst. (2008) by Nick Mitchell, Croydon Brewery Ltd. , Lancashire UK. (2011 ) .(2014):  challengingly high quality! (2013).By Joss Whedon
Generated: the finest sparkli