In [2]:
!pip install transformers datasets peft accelerate torch



In [14]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# 1. Load the dataset, reading only the columns needed to build the training text
import gc


def format_text(row):
    """Render one recipe row as a single training string.

    Args:
        row: A mapping-like object (here, a DataFrame row) indexable by the
            keys 'title', 'ingredients', 'directions' and 'NER'.

    Returns:
        A three-line string: the generation prompt with title and ingredients,
        the instructions, and the food entities.
    """
    return (
        f"Generate a {row['title']} recipe with these ingredients: {row['ingredients']}\n"
        f"Instructions: {row['directions']}\n"
        f"Food entities: {row['NER']}"
    )


try:
    # Restricting usecols keeps unused CSV columns out of memory
    df = pd.read_csv(
        "/kaggle/input/recipenlg/RecipeNLG_dataset.csv",
        usecols=['ingredients', 'directions', 'title', 'NER']  # Select only columns you need
    )

    # 2. Build the formatted text, one Python call per row
    df["text"] = df.apply(format_text, axis=1)

    # 3. Convert to a Hugging Face Dataset, keeping only the formatted text column
    dataset = Dataset.from_pandas(df[['text']])

    # 4. Drop the DataFrame reference and ask the garbage collector to reclaim it
    del df
    gc.collect()

except Exception as e:
    print(f"Error loading dataset: {e}")
    # Re-raise: swallowing the error would leave `dataset` undefined and make
    # the following cells fail later with an unrelated NameError.
    raise

In [18]:
# Hold out 10% of the examples for evaluation. The fixed seed makes the shuffle
# (and therefore the train/test membership) reproducible across kernel restarts.
# NOTE: this rebinds `dataset` to the split result, so the cell is meant to be
# run once per freshly loaded dataset.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [20]:
from transformers import AutoModelForCausalLM

# Checkpoint identifier; the tokenizer cell below loads from the same name.
model_name = "gpt2-medium"
# Load the pretrained causal language model for that checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [19]:
from transformers import AutoTokenizer
from peft import LoraConfig

# Tokenizer for the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding (presumably because this
# checkpoint's tokenizer ships without a pad token -- confirm).
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Sequences are truncated to at most 512 tokens; no padding is requested
    here. Returns whatever the tokenizer produces for the batch.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded


# batched=True hands the function batches of examples instead of single rows.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# LoRA configuration. This cell only builds the config object; it does not
# attach adapters to `model`.
lora_config = LoraConfig(
    r=8,                        # rank of the low-rank update matrices
    lora_alpha=32,              # scaling factor applied to the LoRA update
    target_modules=["c_attn"],  # module names that receive adapters
    lora_dropout=0.1,
    # GPT-2's c_attn is a Conv1D layer that stores its weight as
    # (fan_in, fan_out); PEFT documents fan_in_fan_out=True for that layout.
    fan_in_fan_out=True,
    bias="none",
    task_type="CAUSAL_LM",
)


Map:   0%|          | 0/2008027 [00:00<?, ? examples/s]

Map:   0%|          | 0/223115 [00:00<?, ? examples/s]