# Step 1: Synthetic Dataset Creation

We create a small dataset of business descriptions to fine-tune our model.

In [2]:
import random
from pathlib import Path

import pandas as pd

In [3]:
# Define some sample business types and cities
business_types = [
    "coffee shop", "law firm", "tech startup", "fitness studio", "pet store",
    "bookstore", "flower shop", "bakery", "consulting agency", "marketing firm"
]

cities = ["New York", "London", "Paris", "Berlin", "Tokyo", "San Francisco", "Dubai", "Sydney"]

# Fixed seed and sample count so that Restart Kernel -> Run All regenerates
# exactly the same dataset on every run.
SEED = 42
N_SAMPLES = 200

# Use a dedicated, seeded generator instead of the global `random` state, so the
# result does not depend on any random draws made elsewhere in the session.
rng = random.Random(SEED)

# Generate the synthetic business descriptions in the form "<business> in <city>".
# For each record the business type is drawn first, then the city.
dataset = []
for _ in range(N_SAMPLES):
    business = rng.choice(business_types)
    city = rng.choice(cities)
    dataset.append({"business_description": f"{business} in {city}"})

# Convert to DataFrame (single column: "business_description")
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,business_description
0,marketing firm in Paris
1,bookstore in Tokyo
2,flower shop in London
3,law firm in New York
4,flower shop in San Francisco


In [4]:
# Persist the generated descriptions to disk as CSV (without the index column).
output_path = Path("../data/synthetic_dataset.csv")

# to_csv does not create missing folders, so make sure the target directory exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)

# Report the relative path that was actually written.
print(f"Synthetic dataset saved to {output_path}")

Synthetic dataset saved to /data/synthetic_dataset.csv


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from datasets import Dataset

In [6]:
# Base checkpoint and its tokenizer (loaded with use_fast=False)
model_name = "openlm-research/open_llama_3b"   # Example; small open LLaMa model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Reload the saved descriptions and attach a dummy target column "domain_name":
# the description with its spaces removed, followed by ".com".
data = pd.read_csv("../data/synthetic_dataset.csv").assign(
    domain_name=lambda frame: frame["business_description"].map(
        lambda description: description.replace(" ", "") + ".com"
    )
)

# Wrap the DataFrame as a HuggingFace dataset
hf_dataset = Dataset.from_pandas(data)


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


In [7]:
def format_data(example):
    """Build the prompt/target pair for a single dataset record.

    Args:
        example: Mapping with "business_description" and "domain_name" entries.

    Returns:
        dict with "input_text" (an instruction prompt that quotes the business
        description) and "label_text" (the record's domain name, unchanged).
    """
    description = example["business_description"]
    return {
        "input_text": f'Suggest a domain name for: "{description}"',
        "label_text": example["domain_name"],
    }

# Apply format_data to every record; the returned "input_text" and "label_text"
# fields are added as new columns alongside the existing ones.
hf_dataset = hf_dataset.map(format_data)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [8]:
# Show the first record to check the prompt and label produced by format_data
hf_dataset[0]

{'business_description': 'marketing firm in Paris',
 'domain_name': 'marketingfirminParis.com',
 'input_text': 'Suggest a domain name for: "marketing firm in Paris"',
 'label_text': 'marketingfirminParis.com'}

In [None]:
# NOTE(review): `trainer` is not defined in any cell visible in this part of the
# notebook — confirm a Trainer is constructed before this cell runs, otherwise
# this raises NameError on Restart Kernel -> Run All.
trainer.train()
