In [1]:
!pip install transformers datasets peft accelerate torch

Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-p

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Loading the dataset 
try:
    # Loading only necessary columns to save memory
    df = pd.read_csv(
        "/kaggle/input/recipenlg/RecipeNLG_dataset.csv",
        usecols=['ingredients', 'directions', 'title', 'NER']  # Select only columns you need
    )
    
    # Format the columns
    def format_text(row):
        return (
            f"Generate a {row['title']} recipe with these ingredients: {row['ingredients']}\n"
            f"Instructions: {row['directions']}\n"
            f"Food entities: {row['NER']}"
        )
    
    # Apply formatting row-by-row (better for memory)
    df["text"] = df.apply(format_text, axis=1)
    
    # Only keep the formatted text column
    dataset = Dataset.from_pandas(df[['text']])  
    
    # Clean up memory
    del df  # Free up pandas memory
    import gc
    gc.collect()

except Exception as e:
    print(f"Error loading dataset: {e}")

In [3]:
dataset = dataset.train_test_split(test_size=0.1)

In [4]:
from transformers import AutoModelForCausalLM

model_name = "gpt2-medium"
model = AutoModelForCausalLM.from_pretrained(model_name)

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

2025-04-22 10:25:02.544171: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745317503.060527      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745317503.202712      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
from transformers import AutoTokenizer
from peft import LoraConfig

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Apply LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/2008027 [00:00<?, ? examples/s]

Map:   0%|          | 0/223115 [00:00<?, ? examples/s]

In [None]:
from transformers import Trainer, TrainingArguments

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="./recipe-lora",
        per_device_train_batch_size=4,
        num_train_epochs=3,
        learning_rate=5e-5,
        fp16=True,
    ),
    train_dataset=tokenized_dataset["train"],
)
trainer.train()



1


<IPython.core.display.Javascript object>

In [None]:
# Generate recipes
prompt = "Generate a vegan chocolate cake:"
output = model.generate(**tokenizer(prompt, return_tensors="pt"), max_length=200)
print(tokenizer.decode(output[0]))