In [5]:
import pandas as pd
from datasets import load_dataset
import numpy as np
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer  # Fallback


In [None]:
# Tải dataset 
dataset = load_dataset("Schmitz005/recipe_nlg_dataset_sample")

# Chuyển thành DataFrame và lưu JSON
df = dataset['train'].to_pandas() 
df.to_json('recipes_sample.json', orient='records', lines=True)
print("Dataset saved offline! Shape:", df.shape)

Dataset saved offline! Shape: (1000, 7)


In [5]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
print(df.head())

                   title                                        ingredients  \
0    No-Bake Nut Cookies  ["1 c. firmly packed brown sugar", "1/2 c. eva...   
1  Jewell Ball'S Chicken  ["1 small jar chipped beef, cut up", "4 boned ...   
2            Creamy Corn  ["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg...   
3          Chicken Funny  ["1 large whole chicken", "2 (10 1/2 oz.) cans...   
4   Reeses Cups(Candy)    ["1 c. peanut butter", "3/4 c. graham cracker ...   

                                          directions  \
0  ["In a heavy 2-quart saucepan, mix brown sugar...   
1  ["Place chipped beef on bottom of baking dish....   
2  ["In a slow cooker, combine all ingredients. C...   
3  ["Boil and debone chicken.", "Put bite size pi...   
4  ["Combine first four ingredients and press in ...   

                                              link    source  \
0   www.cookbooks.com/Recipe-Details.aspx?id=44874  Gathered   
1  www.cookbooks.com/Recipe-Details.aspx?id=699419  Gathered

In [12]:
df["NER"][3]

'["chicken", "chicken gravy", "cream of mushroom soup", "shredded cheese"]'

In [6]:
# Load data
df = pd.read_json('recipes_sample.json', orient='records', lines=True)

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Sample NER (first row):", df['NER'].iloc[0] if 'NER' in df.columns else "No NER")
print("Sample ingredients (first row):", df['ingredients'].iloc[0])


Dataset shape: (1000, 7)
Columns: ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER']
Sample NER (first row): ["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]
Sample ingredients (first row): ["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]


In [7]:
def extract_raw_ingredients(row):
    raw_ings = []
    # Từ NER (robust: handle list, tuple, dict)
    if 'NER' in row and row['NER']:
        ner_data = row['NER']
        if isinstance(ner_data, str):
            try:
                ner_data = eval(ner_data)  # Nếu string, parse
            except:
                ner_data = []
        if isinstance(ner_data, list):
            for sublist in ner_data if isinstance(ner_data[0], list) else [ner_data]:
                for entity in sublist:
                    if isinstance(entity, (list, tuple, dict)):
                        # Flexible: Tìm 'INGREDIENT' key hoặc index 2
                        if isinstance(entity, dict) and 'type' in entity and entity['type'] == 'INGREDIENT':
                            text = entity.get('text', entity.get('value', ''))
                        elif len(entity) >= 4 and entity[2] == 'INGREDIENT':
                            text = entity[3]
                        else:
                            continue
                        clean_ing = re.sub(r'^\s*\d+(?:\s+\w+)*\s+(.*)', r'\1', str(text).lower()).strip()  # Regex rộng hơn: bỏ "1 2/3 cup " etc.
                        if clean_ing and len(clean_ing) > 1 and clean_ing not in ['and', 'with', 'of']:  # Bỏ stop words thủ công
                            raw_ings.append(clean_ing)
        if raw_ings:
            return raw_ings
    
    # Fallback mạnh từ ingredients: Split và clean
    if isinstance(row['ingredients'], (list, str)):
        if isinstance(row['ingredients'], str):
            ings_list = row['ingredients'].split(',')  # Nếu string
        else:
            ings_list = row['ingredients']
        for ing_text in ings_list:
            clean_ing = re.sub(r'^\s*\d+(?:\s+\w+)*\s+(.*)', r'\1', str(ing_text).lower()).strip()
            clean_ing = re.sub(r'^\s*(a|an|the|of|with|and)\s+(.*)', r'\1', clean_ing)  # Bỏ stop words
            if clean_ing and len(clean_ing) > 2:  # >2 để tránh "a ", "an"
                # Lấy từ cuối nếu nhiều (VD: "sliced carrots" → "carrots")
                words = clean_ing.split()
                if len(words) > 1 and words[0] in ['chopped', 'sliced', 'ground', 'diced']:
                    clean_ing = ' '.join(words[1:])
                raw_ings.append(clean_ing)
    return raw_ings if raw_ings else ['unknown']  # Dummy nếu empty

# Áp dụng
df['raw_ingredients'] = df.apply(extract_raw_ingredients, axis=1)
df['raw_str'] = df['raw_ingredients'].apply(lambda x: ' '.join(x) if x else 'unknown')

# Debug chi tiết
print("Sample raw_ingredients (first 5):", df['raw_ingredients'].head().tolist())
print("Sample raw_str (first 5):", df['raw_str'].head().tolist())
print("Any empty raw_str?", df['raw_str'].str.strip().eq('').any())
print("Non-empty rows:", len(df[df['raw_str'].str.strip() != '']) )

Sample raw_ingredients (first 5): [['["1 c. firmly packed brown sugar"', '"1/2 c. evaporated milk"', '"1/2 tsp. vanilla"', '"1/2 c. broken nuts (pecans)"', '"2 tbsp. butter or margarine"', '"3 1/2 c. bite size shredded rice biscuits"]'], ['["1 small jar chipped beef', 'cut up"', '"4 boned chicken breasts"', '"1 can cream of mushroom soup"', '"1 carton sour cream"]'], ['["2 (16 oz.) pkg. frozen corn"', '"1 (8 oz.) pkg. cream cheese', 'cubed"', '"1/3 c. butter', 'cubed"', '"1/2 tsp. garlic powder"', '"1/2 tsp. salt"', '"1/4 tsp. pepper"]'], ['["1 large whole chicken"', '"2 (10 1/2 oz.) cans chicken gravy"', '"1 (10 1/2 oz.) can cream of mushroom soup"', '"1 (6 oz.) box stove top stuffing"', '"4 oz. shredded cheese"]'], ['["1 c. peanut butter"', '"3/4 c. graham cracker crumbs"', '"1 c. melted butter"', '"1 lb. (3 1/2 c.) powdered sugar"', '"1 large pkg. chocolate chips"]']]
Sample raw_str (first 5): ['["1 c. firmly packed brown sugar" "1/2 c. evaporated milk" "1/2 tsp. vanilla" "1/2 c. br

In [8]:

# Clean
df = df[df['raw_str'].str.strip() != ''].dropna(subset=['title', 'directions'])

# Vectorize: Thử TF-IDF, fallback CountVectorizer nếu empty
try:
    vectorizer = TfidfVectorizer(max_features=2000, stop_words=None, binary=True, min_df=1)  # Bỏ stop_words để an toàn
    X = vectorizer.fit_transform(df['raw_str'])
    if len(vectorizer.vocabulary_) == 0:
        raise ValueError("Still empty")
    print("TF-IDF success! Vocab size:", len(vectorizer.vocabulary_))
except:
    print("TF-IDF failed, using CountVectorizer")
    vectorizer = CountVectorizer(max_features=2000, stop_words=None, binary=True)
    X = vectorizer.fit_transform(df['raw_str'])
    print("CountVectorizer vocab size:", len(vectorizer.vocabulary_))

y = df['title']

# Train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Training done! Accuracy:", accuracy_score(y_test, y_pred))


TF-IDF success! Vocab size: 1309
Training done! Accuracy: 0.06


In [9]:
# Lưu
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(X, 'features_matrix.pkl')
joblib.dump(model, 'recipe_model.pkl')
df.to_pickle('recipes_df.pkl')
print("All saved! Check debug prints for raw_str.")

All saved! Check debug prints for raw_str.
