In [None]:
!pip install transformers datasets peft accelerate torch
!pip install huggingface_hub[hf_xet]

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import gc

class RecipeDataProcessor:
    def __init__(self, sample_size=100):
        self.sample_size = sample_size
        self.tokenizer = None
        self.dataset = None
        
    def load_data(self, file_path):
        """Load and preprocess the RecipeNLG dataset"""
        try:
            # Load only necessary columns with explicit dtype to reduce memory
            df = pd.read_csv(
                file_path,
                usecols=['title', 'ingredients', 'directions', 'NER'],
                dtype={
                    'title': 'string',
                    'ingredients': 'string',
                    'directions': 'string',
                    'NER': 'string'
                }
            )
            
            # Sample the dataset (while maintaining reproducibility)
            df = df.sample(n=self.sample_size, random_state=42).reset_index(drop=True)
            
            # Clean and format the text
            df['text'] = self._format_recipes(df)
            
            # Convert to HuggingFace Dataset
            self.dataset = Dataset.from_pandas(df[['text']])
            
            # Clean up
            del df
            gc.collect()
            
            print(f"Successfully loaded and processed {self.sample_size} samples.")
            return True
            
        except Exception as e:
            print(f"Error loading dataset: {str(e)}")
            return False
    
    def _format_recipes(self, df):
        """Format recipe data into a consistent text format"""
        formatted_texts = []
        
        for _, row in df.iterrows():
            # Basic cleaning
            title = str(row['title']).strip()
            ingredients = str(row['ingredients']).strip().replace("'", "").replace("[", "").replace("]", "")
            directions = str(row['directions']).strip().replace("'", "").replace("[", "").replace("]", "")
            ner = str(row['NER']).strip().replace("'", "").replace("[", "").replace("]", "")
            
            # Create structured prompt
            formatted_text = (
                f"Recipe Title: {title}\n\n"
                f"Ingredients:\n{ingredients}\n\n"
                f"Instructions:\n{directions}\n\n"
                f"Food Entities: {ner}"
            )
            formatted_texts.append(formatted_text)
            
        return formatted_texts
    
    def initialize_tokenizer(self, model_name="gpt2"):
        """Initialize the tokenizer"""
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.tokenizer.pad_token = self.tokenizer.eos_token  # Set pad token
            print(f"Initialized tokenizer for model: {model_name}")
            return True
        except Exception as e:
            print(f"Error initializing tokenizer: {str(e)}")
            return False
    
    def split_dataset(self, test_size=0.1, seed=42):
        """Split the dataset into training and test sets"""
        if not hasattr(self, 'dataset') or self.dataset is None:
            print("No dataset loaded to split!")
            return False
            
        try:
            # Perform the split
            split_dataset = self.dataset.train_test_split(
                test_size=test_size,
                seed=seed
            )
            
            # Convert to DatasetDict for proper HuggingFace handling
            self.dataset = DatasetDict({
                'train': split_dataset['train'],
                'test': split_dataset['test']
            })
            
            print(f"Successfully split dataset: {len(self.dataset['train'])} train, {len(self.dataset['test'])} test samples")
            return True
            
        except Exception as e:
            print(f"Error splitting dataset: {str(e)}")
            return False
    
    def tokenize_dataset(self, max_length=512):
        """Tokenize the dataset"""
        if not self.tokenizer:
            print("Tokenizer not initialized!")
            return False
            
        def tokenize_function(examples):
            return self.tokenizer(
                examples["text"],
                truncation=True,
                padding="max_length",
                max_length=max_length,
                return_tensors="pt"
            )
            
        try:
            self.dataset = self.dataset.map(
                tokenize_function,
                batched=True,
                remove_columns=["text"]  # Remove original text column
            )
            print("Successfully tokenized dataset.")
            return True
        except Exception as e:
            print(f"Error tokenizing dataset: {str(e)}")
            return False

# Correct Usage Example
if __name__ == "__main__":
    # Initialize processor
    processor = RecipeDataProcessor(sample_size=100)
    
    # 1. Load and process data
    processor.load_data("/kaggle/input/recipenlg/RecipeNLG_dataset.csv")
    
    # 2. Initialize tokenizer
    processor.initialize_tokenizer("gpt2")
    
    # 3. Split dataset (now this method exists!)
    processor.split_dataset(test_size=0.1)  # 90% train, 10% test
    
    # 4. Tokenize dataset
    processor.tokenize_dataset(max_length=256)
    
    # Now you can access:
    # - processor.dataset['train'] for training
    # - processor.dataset['test'] for evaluation

Successfully loaded and processed 100 samples.
Initialized tokenizer for model: gpt2
Successfully split dataset: 90 train, 10 test samples


Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Successfully tokenized dataset.


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

class RecipeModel:
    def __init__(self, model_name="gpt2-medium"):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
    def initialize_model_and_tokenizer(self):
        """Initialize both model and tokenizer"""
        try:
            # Initialize tokenizer with special tokens for recipes
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
            # Load model with adjusted configuration
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                pad_token_id=self.tokenizer.eos_token_id
            ).to(self.device)
            
            # Resize token embeddings if needed
            self.model.resize_token_embeddings(len(self.tokenizer))
            
            print(f"Successfully initialized {self.model_name} on {self.device}")
            return True
            
        except Exception as e:
            print(f"Error initializing model: {str(e)}")
            return False
    
    def generate_recipe(self, prompt, **generation_kwargs):
        """Generate recipe text from prompt"""
        if not self.model or not self.tokenizer:
            print("Model or tokenizer not initialized!")
            return None
            
        try:
            # Default generation parameters (can be overridden)
            default_params = {
                'max_length': 300,
                'num_return_sequences': 1,
                'temperature': 0.7,
                'top_k': 50,
                'top_p': 0.9,
                'do_sample': True,
                'no_repeat_ngram_size': 2,
                'early_stopping': True
            }
            
            # Update with any provided kwargs
            generation_params = {**default_params, **generation_kwargs}
            
            # Tokenize input
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            
            # Generate output
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    **generation_params
                )
            
            # Decode and clean output
            generated_text = self.tokenizer.decode(
                outputs[0], 
                skip_special_tokens=True
            )
            
            # Post-processing
            generated_text = self._post_process(generated_text)
            return generated_text
            
        except Exception as e:
            print(f"Error during generation: {str(e)}")
            return None
    
    def _post_process(self, text):
        """Clean up generated recipe text"""
        # Remove any incomplete sentences at the end
        last_period = text.rfind('.')
        if last_period != -1:
            text = text[:last_period+1]
            
        # Ensure proper section formatting
        sections = ["Recipe Title:", "Ingredients:", "Instructions:"]
        for section in sections:
            if section not in text:
                text = text.replace(section.lower(), section)
                
        return text.strip()

# Example Usage
if __name__ == "__main__":
    # Initialize model handler
    recipe_model = RecipeModel("gpt2-medium")
    
    # Set up model and tokenizer
    if recipe_model.initialize_model_and_tokenizer():
        # Example prompt
        prompt = "Generate a chocolate chip cookie recipe with these ingredients: flour, sugar, eggs, chocolate chips\n\nRecipe Title:"
        
        # Generate recipe
        generated_recipe = recipe_model.generate_recipe(
            prompt,
            max_length=400,  # Override default
            temperature=0.8  # More creative
        )
        
        print("\nGenerated Recipe:\n")
        print(generated_recipe)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

2025-04-24 11:09:13.130825: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745492953.637773      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745492953.787049      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Successfully initialized gpt2-medium on cuda

Generated Recipe:

Generate a chocolate chip cookie recipe with these ingredients: flour, sugar, eggs, chocolate chips

Recipe Title: Chocolate Chip Cookie Recipe Ingredients: For the dough:
 and 1 cup butter, melted, divided
, 3/4 cup sugar and 4 eggs
 the rest of the butter melted
for the filling: 4 cups all purpose flour
5/8 cup semi-sweet chocolate, softened
1 1/2 cups sugar
3/16 tsp salt
4 1 1 / 2 cups flour For baking: and 3 1⁄ 2 tsp baking soda
2 1 tsp vanilla extract
For the baking tray: 2 large (18-inch) square pans (12-ounce capacity)
6 large eggs for the coating
12 cups semisweet chocolate
8 large unsalted butter for decorating
Preheat oven to 325 degrees F.
In a large bowl, combine the flour and sugar. Set aside. In a medium bowl or stand mixer fitted with the paddle attachment, cream the eggs with 1 teaspoon sugar on medium speed until smooth, about 5 minutes. Add the cocoa and continue to cream until fully incorporated, approx