In [None]:
from datasets import load_dataset
import pandas as pd

# Download the sample recipe dataset via `datasets.load_dataset`.
dataset = load_dataset("Schmitz005/recipe_nlg_dataset_sample")

# Turn the train split into a DataFrame, then cache it on disk as JSON Lines
# (one record per line) so it can be re-read later from the local file.
train_split = dataset['train']
df = train_split.to_pandas()
df.to_json('recipes_sample.json', lines=True, orient='records')
print("Dataset saved offline! Shape:", df.shape)

Dataset saved offline! Shape: (1000, 7)


In [5]:
# Remove the 'Unnamed: 0' column (presumably a leftover index column — confirm).
# Rebinding with errors='ignore' keeps this cell idempotent: the original
# in-place drop raised KeyError when the cell was run a second time.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Plain-text preview of the first five rows.
print(df.head(n=5))

                   title                                        ingredients  \
0    No-Bake Nut Cookies  ["1 c. firmly packed brown sugar", "1/2 c. eva...   
1  Jewell Ball'S Chicken  ["1 small jar chipped beef, cut up", "4 boned ...   
2            Creamy Corn  ["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg...   
3          Chicken Funny  ["1 large whole chicken", "2 (10 1/2 oz.) cans...   
4   Reeses Cups(Candy)    ["1 c. peanut butter", "3/4 c. graham cracker ...   

                                          directions  \
0  ["In a heavy 2-quart saucepan, mix brown sugar...   
1  ["Place chipped beef on bottom of baking dish....   
2  ["In a slow cooker, combine all ingredients. C...   
3  ["Boil and debone chicken.", "Put bite size pi...   
4  ["Combine first four ingredients and press in ...   

                                              link    source  \
0   www.cookbooks.com/Recipe-Details.aspx?id=44874  Gathered   
1  www.cookbooks.com/Recipe-Details.aspx?id=699419  Gathered

In [12]:
# Look at the raw NER value stored for the row labelled 3.
df.loc[3, "NER"]

'["chicken", "chicken gravy", "cream of mushroom soup", "shredded cheese"]'

In [13]:
import ast
import json
import re

import joblib
import numpy as np
import pandas as pd

# Load the dataset from the JSON Lines file cached on disk.
try:
    df = pd.read_json('recipes_sample.json', orient='records', lines=True)
except ValueError as e:
    print(f"JSON error: {e}. Anh check file 'recipes_sample.json' xem có đúng format không nhé!")
    # Re-raise after printing the hint: continuing would either hit a NameError
    # on the next line (fresh kernel) or silently reuse a stale `df` left over
    # from an earlier cell.
    raise

# Debug: show the frame's dimensions and column names, then the first three
# raw values of each column the later steps rely on.
print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())
sample_reports = (
    ("Sample NER (first 3 rows):", 'NER'),
    ("Sample ingredients (first 3 rows):", 'ingredients'),
    ("Sample title (first 3 rows):", 'title'),
)
for report_label, report_column in sample_reports:
    print(report_label, df[report_column].head(3).tolist())

def _parse_list_cell(value):
    """Return a recipe list cell as a real Python container.

    Lists pass through unchanged. Strings are decoded as JSON first (the
    format seen in the sample output, e.g. '["milk", "butter"]') and, failing
    that, as a Python literal. Unlike ``eval``, neither decoder executes code
    embedded in the data.

    Returns:
        The decoded list (tuples are converted to lists; dicts are returned
        as-is). Anything else, including strings that cannot be decoded,
        yields ``[]`` so the row is removed by the emptiness filter.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []
    try:
        parsed = json.loads(value)
    except ValueError:
        # Not valid JSON (e.g. single-quoted Python repr): try a safe literal parse.
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, TypeError, SyntaxError):
            return []
    if isinstance(parsed, tuple):
        return list(parsed)
    return parsed if isinstance(parsed, (list, dict)) else []


# Clean data: drop rows with missing NER / ingredients, then make sure both
# columns hold real lists instead of their string encodings.
df = df.dropna(subset=['NER', 'ingredients']).copy()
for list_column in ('NER', 'ingredients'):
    df[list_column] = df[list_column].apply(_parse_list_cell)

# Keep only rows where both lists are non-empty. The explicit copy makes the
# result an independent frame, so later column assignments do not warn.
has_ner = df['NER'].apply(len) > 0
has_ingredients = df['ingredients'].apply(len) > 0
df = df[has_ner & has_ingredients].copy()

# Re-check: the shape shows how many rows survived the cleaning.
print("Cleaned shape:", df.shape)
print("Any empty NER?", df['NER'].apply(len).eq(0).any())

Dataset shape: (1000, 7)
Columns: ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER']
Sample NER (first 3 rows): ['["brown sugar", "milk", "vanilla", "nuts", "butter", "bite size shredded rice biscuits"]', '["beef", "chicken breasts", "cream of mushroom soup", "sour cream"]', '["frozen corn", "cream cheese", "butter", "garlic powder", "salt", "pepper"]']
Sample ingredients (first 3 rows): ['["1 c. firmly packed brown sugar", "1/2 c. evaporated milk", "1/2 tsp. vanilla", "1/2 c. broken nuts (pecans)", "2 Tbsp. butter or margarine", "3 1/2 c. bite size shredded rice biscuits"]', '["1 small jar chipped beef, cut up", "4 boned chicken breasts", "1 can cream of mushroom soup", "1 carton sour cream"]', '["2 (16 oz.) pkg. frozen corn", "1 (8 oz.) pkg. cream cheese, cubed", "1/3 c. butter, cubed", "1/2 tsp. garlic powder", "1/2 tsp. salt", "1/4 tsp. pepper"]']
Sample title (first 3 rows): ['No-Bake Nut Cookies', "Jewell Ball'S Chicken", 'Creamy Corn']
Cleaned shape: (

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build ner_str from NER
def create_ner_str(row):
    """Flatten a row's NER entry into one lowercase, space-separated string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an 'NER' key.

    Returns:
        The non-empty string tags joined by single spaces and lowercased.
        A list contributes its items, a dict contributes its values, and any
        other NER type produces an empty string.
    """
    ner = row['NER']
    if isinstance(ner, list):
        candidates = ner
    elif isinstance(ner, dict):
        candidates = ner.values()
    else:
        # Fallback when NER has an unexpected format
        return ''
    # Skip non-string and empty entries before lowercasing.
    return ' '.join(tag.lower() for tag in candidates if isinstance(tag, str) and tag)

# Add the flattened NER text as a new column. assign() returns a new frame, so
# this does not write into a filtered slice (which can raise
# SettingWithCopyWarning when earlier cells dropped rows).
df = df.assign(ner_str=df.apply(create_ner_str, axis=1))

# Debug: check ner_str
print("Sample ner_str (first 3):", df['ner_str'].head(3).tolist())
print("Any empty ner_str?", df['ner_str'].str.strip().eq('').any())

# Drop rows whose ner_str is blank; copy so the result is an independent frame.
df = df[df['ner_str'].str.strip() != ''].copy()
print("Final cleaned shape:", df.shape)

# Fit TF-IDF on ner_str. binary=True makes the term-frequency part 0/1 (idf
# weighting still applies); the vocabulary is capped at 1000 terms and English
# stop words are removed.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', binary=True, min_df=1)
X = vectorizer.fit_transform(df['ner_str'])

# Target
y = df['title']

# Persist the fitted vectorizer, the feature matrix and the cleaned frame.
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(X, 'features_matrix.pkl')
df.to_pickle('recipes_df.pkl')
print("Preprocess done! X shape:", X.shape)
print("Vocabulary size:", len(vectorizer.vocabulary_))

Sample ner_str (first 3): ['brown sugar milk vanilla nuts butter bite size shredded rice biscuits', 'beef chicken breasts cream of mushroom soup sour cream', 'frozen corn cream cheese butter garlic powder salt pepper']
Any empty ner_str? False
Final cleaned shape: (1000, 8)
Preprocess done! X shape: (1000, 719)
Vocabulary size: 719


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the feature matrix written by the preprocessing cell; the target is the
# recipe title taken from the in-memory DataFrame.
# NOTE(review): X comes from disk while y comes from the live `df` — confirm the
# two are still row-aligned if the preprocessing cell was not re-run.
X = joblib.load('features_matrix.pkl')
y = df['title']

# Option 1: Cosine similarity (no training; X is used directly for matching)
# Ready!

# Option 2: Supervised - RandomForest
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_train, y_train)

# Evaluate
# NOTE(review): recipe titles are used as class labels. If most titles are
# unique, held-out titles never appear in the training split, which would
# explain the very low accuracy recorded for this cell — confirm.
y_pred = model.predict(X_test)
print("Training done! Accuracy:", accuracy_score(y_test, y_pred))

# Save the fitted model to disk
joblib.dump(model, 'recipe_model.pkl')

Training done! Accuracy: 0.035


['recipe_model.pkl']