#Metrics

In [None]:
def calculate_ndcg(recommendations, relevance_scores, k=10):
    recommendations = recommendations[:k]
    relevance_scores = relevance_scores[:k]

    dcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores)])
    idcg = sum([1.0 / np.log2(idx + 2) for idx in range(len(relevance_scores))])

    ndcg = dcg / idcg if idcg > 0 else 0.0
    return ndcg

def calculate_coverage(recommendations, total_items):
    unique_recommendations = set(recommendations['Recipes ID'])
    coverage = len(unique_recommendations) / total_items
    return coverage

#KNN

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
data = pd.read_csv('/content/drive/MyDrive/DS108/02.preprocess/dataset_final/dataset.csv')

# Drop duplicate rows
data = data.drop_duplicates()

# Insert 'Recipes ID' column
data.insert(0, 'Recipes ID', range(1, len(data) + 1))

# Select relevant columns
columns = ['Recipes ID', 'Time (mins)', 'Calories', 'Total Fat (g)', 'Saturated Fat (g)', 'Cholesterol (mg)',
           'Sodium (mg)', 'Total Carbohydrate (g)', 'Dietary Fiber (g)', 'Sugars (g)', 'Protein (g)',
           'Diet Label', 'Ingredient', 'Ingredient_units', 'Direction']
dataset = data[columns]

# Define maximum values for filtering
max_Calories = 2000  # kcal
max_daily_fat = 78  # grams
max_daily_Saturatedfat = 20  # grams
max_daily_Cholesterol = 300  # mg
max_daily_Sodium = 2300  # mg
max_daily_Carbohydrate = 275  # grams
max_daily_Fiber = 28  # grams
max_daily_Sugar = 50  # grams (added sugars)
max_daily_Protein = 50  # grams

max_list = [max_Calories, max_daily_fat, max_daily_Saturatedfat, max_daily_Cholesterol, max_daily_Sodium, max_daily_Carbohydrate, max_daily_Fiber, max_daily_Sugar, max_daily_Protein]

# Define functions
def scaling(dataframe):
    scaler = StandardScaler()
    prep_data = scaler.fit_transform(dataframe.iloc[:, 3:11])
    return prep_data, scaler

def tfidf_transform(ingredient_units):
    tfidf = TfidfVectorizer()
    ingredient_units_tfidf = tfidf.fit_transform(ingredient_units)
    return ingredient_units_tfidf, tfidf

def nn_predictor(prep_data):
    neigh = NearestNeighbors(metric='cosine', algorithm='brute')
    neigh.fit(prep_data)
    return neigh

def build_pipeline(neigh, scaler, params):
    n_neighbors = min(params['n_neighbors'], len(scaler.transform(dataset.iloc[:, 3:11])))
    print(f"Number of neighbors used: {n_neighbors}")  # Add this line to check number of neighbors used
    transformer = FunctionTransformer(lambda X: neigh.kneighbors(X, n_neighbors=n_neighbors)[1], validate=False)
    pipeline = Pipeline([
        ('scaler', scaler),
        ('NN', transformer)
    ])
    return pipeline

def extract_data(dataframe, ingredient_filter, max_nutritional_values):
    extracted_data = dataframe.copy()
    for column, maximum in zip(extracted_data.columns[3:11], max_nutritional_values):
        extracted_data = extracted_data[extracted_data[column] < maximum]
    print(f"Filtered Data Shape: {extracted_data.shape}")  # Add this line to check filtered data shape
    return extracted_data

def apply_pipeline(pipeline, _input, extracted_data, scaler):
    indices = pipeline.transform(_input)[0]
    indices = indices.flatten()
    print(f"Indices of Neighbors: {indices}")  # Add this line to print the indices
    recommendations = extracted_data.iloc[indices]
    print(f"Number of Recommendations: {len(recommendations)}")  # Print number of recommendations

    input_scaled = scaler.transform(_input)
    recommendations_scaled = scaler.transform(recommendations.iloc[:, 3:11])
    similarities = cosine_similarity(input_scaled, recommendations_scaled)
    print(f"Similarities: {similarities}")  # Print the similarities

    return recommendations, similarities

def recommand(dataframe, _input, max_nutritional_values, ingredient_filter=None, params={'n_neighbors': 5, 'return_distance': False}):
    extracted_data = extract_data(dataframe, ingredient_filter, max_nutritional_values)
    print(f"Extracted Data Shape: {extracted_data.shape}")  # Print shape of extracted data
    prep_data, scaler = scaling(extracted_data)
    ingredient_units_tfidf, tfidf = tfidf_transform(extracted_data['Ingredient_units'])
    neigh = nn_predictor(prep_data)
    pipeline = build_pipeline(neigh, scaler, params)

    _input_scaled = scaler.transform(_input)

    recommendations, similarities = apply_pipeline(pipeline, _input_scaled, extracted_data, scaler)

    if ingredient_filter is not None:
        filter_vector = tfidf.transform([' '.join(ingredient_filter)])
        recommendations_tfidf = tfidf.transform(recommendations['Ingredient_units'])
        text_similarities = cosine_similarity(recommendations_tfidf, filter_vector).flatten()

        if len(recommendations) == len(text_similarities):
            recommendations = recommendations[text_similarities > 0.0]
            text_similarities = text_similarities[text_similarities > 0.0]
        else:
            print("Warning: Length mismatch between recommendations and text similarities")
            text_similarities = None
    else:
        text_similarities = None

    print("Recommended Recipes:")
    print(recommendations)
    print("\nCosine Similarities (Numeric Features):")
    print(similarities)
    if text_similarities is not None:
        print("\nCosine Similarities (Text Features):")
        print(text_similarities)

    return recommendations, similarities, text_similarities

# Run model
test_input = dataset.iloc[0:1, 3:11].to_numpy()
ingredient_filter = ['garlic']
recommendations, similarities, text_similarities = recommand(dataset, test_input, max_list, ingredient_filter=ingredient_filter)

Filtered Data Shape: (2344, 15)
Extracted Data Shape: (2344, 15)
Number of neighbors used: 5
Indices of Neighbors: [1008  316  107  268  267]
Number of Recommendations: 5
Similarities: [[0.974178   0.95229378 0.95181954 0.94262861 0.93635933]]
Recommended Recipes:
     Recipes ID  Time (mins)  Calories  Total Fat (g)  Saturated Fat (g)  \
698         699         40.0      29.7            1.9                0.7   

     Cholesterol (mg)  Sodium (mg)  Total Carbohydrate (g)  Dietary Fiber (g)  \
698               8.7         22.9                     0.5                0.1   

     Sugars (g)  Protein (g) Diet Label  \
698         0.1          2.4  unlabeled   

                                            Ingredient  \
698  ['ground beef', 'ground pork', 'onion', 'garli...   

                                      Ingredient_units  \
698  ground beef, ground pork, onion, garlic cloves...   

                                             Direction  
698  ['Combine first 8 ingredients in lar



In [None]:
relevance_scores = np.zeros(len(recommendations))
relevance_scores[:2] = 1

ndcg_score = calculate_ndcg(recommendations, relevance_scores, k=10)
print(f"NDCG Score of KNN: {ndcg_score}")

coverage_score = calculate_coverage(recommendations, len(dataset))
print(f"Coverage Score of KNN: {coverage_score}")

NDCG Score of KNN: 1.0
Coverage Score of KNN: 9.738046547862498e-05


#GMM

In [None]:
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
data = pd.read_csv('/content/drive/MyDrive/DS108/02.preprocess/dataset_final/dataset.csv')

# Drop duplicate rows
data = data.drop_duplicates()

# Insert 'Recipes ID' column
data.insert(0, 'Recipes ID', range(1, len(data) + 1))

# Select relevant columns
columns = ['Recipes ID', 'Time (mins)', 'Calories', 'Total Fat (g)', 'Saturated Fat (g)', 'Cholesterol (mg)',
           'Sodium (mg)', 'Total Carbohydrate (g)', 'Dietary Fiber (g)', 'Sugars (g)', 'Protein (g)',
           'Diet Label', 'Ingredient', 'Ingredient_units', 'Direction']
dataset = data[columns]

# Define maximum values for filtering
max_Calories = 2000  # kcal
max_daily_fat = 78  # grams
max_daily_Saturatedfat = 20  # grams
max_daily_Cholesterol = 300  # mg
max_daily_Sodium = 2300  # mg
max_daily_Carbohydrate = 275  # grams
max_daily_Fiber = 28  # grams
max_daily_Sugar = 50  # grams (added sugars)
max_daily_Protein = 50  # grams

max_list = [max_Calories, max_daily_fat, max_daily_Saturatedfat, max_daily_Cholesterol, max_daily_Sodium, max_daily_Carbohydrate, max_daily_Fiber, max_daily_Sugar, max_daily_Protein]

# Define functions
def scaling(dataframe):
    scaler = StandardScaler()
    prep_data = scaler.fit_transform(dataframe.iloc[:, 3:11])
    return prep_data, scaler

def tfidf_transform(ingredient_units):
    tfidf = TfidfVectorizer()
    ingredient_units_tfidf = tfidf.fit_transform(ingredient_units)
    return ingredient_units_tfidf, tfidf

def gmm_predictor(prep_data, n_components):
    gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
    gmm.fit(prep_data)
    return gmm

def build_pipeline(gmm, scaler, params):
    transformer = FunctionTransformer(lambda X: gmm.predict(X), validate=False)
    pipeline = Pipeline([
        ('scaler', scaler),
        ('GMM', transformer)
    ])
    return pipeline

def extract_data(dataframe, ingredient_filter, max_nutritional_values):
    extracted_data = dataframe.copy()
    for column, maximum in zip(extracted_data.columns[3:11], max_nutritional_values):
        extracted_data = extracted_data[extracted_data[column] < maximum]
    return extracted_data

def apply_pipeline(pipeline, _input, extracted_data, scaler):
    cluster_labels = pipeline.transform(_input)
    cluster_indices = np.where(cluster_labels == cluster_labels[0])[0]
    recommendations = extracted_data.iloc[cluster_indices]

    input_scaled = scaler.transform(_input)
    recommendations_scaled = scaler.transform(recommendations.iloc[:, 3:11])
    similarities = cosine_similarity(input_scaled, recommendations_scaled)

    return recommendations, similarities

def recommand(dataframe, _input, max_nutritional_values, ingredient_filter=None, params={'n_components': 5}):
    extracted_data = extract_data(dataframe, ingredient_filter, max_nutritional_values)
    prep_data, scaler = scaling(extracted_data)
    ingredient_units_tfidf, tfidf = tfidf_transform(extracted_data['Ingredient_units'])
    gmm = gmm_predictor(prep_data, params['n_components'])
    pipeline = build_pipeline(gmm, scaler, params)

    _input_scaled = scaler.transform(_input)

    recommendations, similarities = apply_pipeline(pipeline, _input_scaled, extracted_data, scaler)

    if ingredient_filter is not None:
        filter_vector = tfidf.transform([' '.join(ingredient_filter)])
        recommendations_tfidf = tfidf.transform(recommendations['Ingredient_units'])
        text_similarities = cosine_similarity(recommendations_tfidf, filter_vector).flatten()

        # Ensure the length matches before filtering
        if len(recommendations) == len(text_similarities):
            recommendations = recommendations[text_similarities > 0.0]
            text_similarities = text_similarities[text_similarities > 0.0]
        else:
            print("Warning: Length mismatch between recommendations and text similarities")
            text_similarities = None
    else:
        text_similarities = None

    print("Recommended Recipes:")
    print(recommendations)
    print("\nCosine Similarities (Numeric Features):")
    print(similarities)
    if text_similarities is not None:
        print("\nCosine Similarities (Text Features):")
        print(text_similarities)

    return recommendations, similarities, text_similarities

# Run model
test_input = dataset.iloc[0:1, 3:11].to_numpy()
ingredient_filter = ['garlic']
recommand(dataset, test_input, max_list, ingredient_filter=ingredient_filter)

Recommended Recipes:
Empty DataFrame
Columns: [Recipes ID, Time (mins), Calories, Total Fat (g), Saturated Fat (g), Cholesterol (mg), Sodium (mg), Total Carbohydrate (g), Dietary Fiber (g), Sugars (g), Protein (g), Diet Label, Ingredient, Ingredient_units, Direction]
Index: []

Cosine Similarities (Numeric Features):
[[0.05496253]]

Cosine Similarities (Text Features):
[]




(Empty DataFrame
 Columns: [Recipes ID, Time (mins), Calories, Total Fat (g), Saturated Fat (g), Cholesterol (mg), Sodium (mg), Total Carbohydrate (g), Dietary Fiber (g), Sugars (g), Protein (g), Diet Label, Ingredient, Ingredient_units, Direction]
 Index: [],
 array([[0.05496253]]),
 array([], dtype=float64))

#Radius Neighbors Classifier

In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
data = pd.read_csv('/content/drive/MyDrive/DS108/02.preprocess/dataset_final/dataset.csv')

# Drop duplicate rows
data = data.drop_duplicates()

# Insert 'Recipes ID' column
data.insert(0, 'Recipes ID', range(1, len(data) + 1))

# Select relevant columns
columns = ['Recipes ID', 'Time (mins)', 'Calories', 'Total Fat (g)', 'Saturated Fat (g)', 'Cholesterol (mg)',
           'Sodium (mg)', 'Total Carbohydrate (g)', 'Dietary Fiber (g)', 'Sugars (g)', 'Protein (g)',
           'Diet Label', 'Ingredient', 'Ingredient_units', 'Direction']
dataset = data[columns]

# Define maximum values for filtering
max_Calories = 2000  # kcal
max_daily_fat = 78  # grams
max_daily_Saturatedfat = 20  # grams
max_daily_Cholesterol = 300  # mg
max_daily_Sodium = 2300  # mg
max_daily_Carbohydrate = 275  # grams
max_daily_Fiber = 28  # grams
max_daily_Sugar = 50  # grams (added sugars)
max_daily_Protein = 50  # grams

max_list = [max_Calories, max_daily_fat, max_daily_Saturatedfat, max_daily_Cholesterol, max_daily_Sodium, max_daily_Carbohydrate, max_daily_Fiber, max_daily_Sugar, max_daily_Protein]

# Define functions
def scaling(dataframe):
    scaler = StandardScaler()
    prep_data = scaler.fit_transform(dataframe.iloc[:, 3:11])
    return prep_data, scaler

def radius_neighbors_fit(prep_data, params):
    radius_neighbors = RadiusNeighborsClassifier(radius=params['radius'], algorithm='brute', metric='cosine')
    radius_neighbors.fit(prep_data, np.zeros(len(prep_data)))  # RadiusNeighborsClassifier yêu cầu một mảng nhãn, ta sử dụng mảng gồm toàn số 0
    return radius_neighbors

def extract_data(dataframe, max_nutritional_values):
    extracted_data = dataframe.copy()
    for column, maximum in zip(extracted_data.columns[3:11], max_nutritional_values):
        extracted_data = extracted_data[extracted_data[column] < maximum]
    return extracted_data

def apply_radius_neighbors(radius_neighbors, _input, extracted_data, scaler, params):
    input_scaled = scaler.transform(_input)
    indices = radius_neighbors.radius_neighbors(input_scaled, return_distance=False)
    indices = indices[0]  # Vì chỉ cần một kết quả, ta lấy phần tử đầu tiên
    recommendations = extracted_data.iloc[indices]

    recommendations_scaled = scaler.transform(recommendations.iloc[:, 3:11])
    similarities = cosine_similarity(input_scaled, recommendations_scaled)

    return recommendations, similarities

def compute_text_similarity(dataframe, input_text, tfidf):
    recommendations_tfidf = tfidf.transform(dataframe['Ingredient_units'])
    text_similarities = cosine_similarity(recommendations_tfidf, input_text).flatten()
    return text_similarities

def recommend(dataframe, _input, max_nutritional_values, ingredient_filter=None, params={'radius': 0.5}):
    extracted_data = extract_data(dataframe, max_nutritional_values)
    prep_data, scaler = scaling(extracted_data)
    radius_neighbors = radius_neighbors_fit(prep_data, params)

    recommendations, numeric_similarities = apply_radius_neighbors(radius_neighbors, _input, extracted_data, scaler, params)

    if ingredient_filter is not None:
        tfidf = TfidfVectorizer()
        tfidf.fit(extracted_data['Ingredient_units'])

        transformed_input = tfidf.transform([' '.join(ingredient_filter)])

        text_similarities = compute_text_similarity(recommendations, transformed_input, tfidf)

        recommendations['Numeric Similarity'] = numeric_similarities.flatten()
        recommendations['Text Similarity'] = text_similarities

        recommendations_sorted = recommendations.sort_values(by='Numeric Similarity', ascending=False)

        top_recommendations = recommendations_sorted.head(5)

        print("Top Recommended Recipes:")
        print(top_recommendations[['Recipes ID', 'Numeric Similarity', 'Text Similarity']])
    else:
        print("Ingredient filter is not provided. Please provide an ingredient filter.")
        return None

    relevance_scores = np.zeros(len(top_recommendations))
    relevance_scores[:2] = 1

    # Calculate NDCG
    ndcg_score = calculate_ndcg(top_recommendations, relevance_scores, k=5)
    # print(f"NDCG Score: {ndcg_score}")

    # Calculate Coverage
    coverage_score = calculate_coverage(recommendations, len(dataset))
    # print(f"Coverage Score: {coverage_score}")

    return top_recommendations, ndcg_score, coverage_score

# Run model
test_input = dataset.iloc[0:1, 3:11].to_numpy()
ingredient_filter = ['garlic']
recommendations, ndcg_score, coverage_score = recommend(dataset, test_input, max_list, ingredient_filter)

Top Recommended Recipes:
      Recipes ID  Numeric Similarity  Text Similarity
6073        6074            0.924459         0.188852
6732        6733            0.906683         0.000000
3479        3480            0.894694         0.151270
320          321            0.862496         0.166101
2030        2031            0.861416         0.000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['Numeric Similarity'] = numeric_similarities.flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['Text Similarity'] = text_similarities


In [None]:
print("NDCG Score RNC:", ndcg_score)
print("Coverage Score RNC:", coverage_score)

NDCG Score RNC: 0.5531464700081437
Coverage Score RNC: 0.03807576200214237


#K Means

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the data
data = pd.read_csv('/content/drive/MyDrive/DS108/02.preprocess/dataset_final/dataset.csv')

# Drop duplicate rows
data = data.drop_duplicates()

# Insert 'Recipes ID' column
data.insert(0, 'Recipes ID', range(1, len(data) + 1))

# Select relevant columns
columns = ['Recipes ID', 'Time (mins)', 'Calories', 'Total Fat (g)', 'Saturated Fat (g)', 'Cholesterol (mg)',
           'Sodium (mg)', 'Total Carbohydrate (g)', 'Dietary Fiber (g)', 'Sugars (g)', 'Protein (g)',
           'Diet Label', 'Ingredient', 'Ingredient_units', 'Direction']
dataset = data[columns]

# Define maximum values for filtering
max_Calories = 2000  # kcal
max_daily_fat = 78  # grams
max_daily_Saturatedfat = 20  # grams
max_daily_Cholesterol = 300  # mg
max_daily_Sodium = 2300  # mg
max_daily_Carbohydrate = 275  # grams
max_daily_Fiber = 28  # grams
max_daily_Sugar = 50  # grams (added sugars)
max_daily_Protein = 50  # grams

max_list = [max_Calories, max_daily_fat, max_daily_Saturatedfat, max_daily_Cholesterol, max_daily_Sodium, max_daily_Carbohydrate, max_daily_Fiber, max_daily_Sugar, max_daily_Protein]

# Define functions
def scaling(dataframe):
    scaler = StandardScaler()
    prep_data = scaler.fit_transform(dataframe.iloc[:, 3:11])
    return prep_data, scaler

def kmeans_fit(prep_data, params):
    kmeans = KMeans(n_clusters=params['n_clusters'])
    kmeans.fit(prep_data)
    return kmeans

def build_pipeline(kmeans, scaler):
    pipeline = Pipeline([
        ('scaler', scaler),
        ('kmeans', kmeans)
    ])
    return pipeline

def extract_data(dataframe, max_nutritional_values):
    extracted_data = dataframe.copy()
    for column, maximum in zip(extracted_data.columns[3:11], max_nutritional_values):
        extracted_data = extracted_data[extracted_data[column] < maximum]
    return extracted_data

def apply_pipeline(pipeline, _input, extracted_data, scaler):
    input_scaled = scaler.transform(_input)
    cluster_labels = pipeline.predict(input_scaled)
    indices = np.where(pipeline.named_steps['kmeans'].labels_ == cluster_labels[0])[0]
    recommendations = extracted_data.iloc[indices]

    input_scaled = scaler.transform(_input)
    recommendations_scaled = scaler.transform(recommendations.iloc[:, 3:11])

    return recommendations

def recommend(dataframe, _input, max_nutritional_values, params={'n_clusters': 5}):
    extracted_data = extract_data(dataframe, max_nutritional_values)
    prep_data, scaler = scaling(extracted_data)
    kmeans = kmeans_fit(prep_data, params)
    pipeline = build_pipeline(kmeans, scaler)

    recommendations = apply_pipeline(pipeline, _input, extracted_data, scaler)
    # Lấy chỉ 5 công thức hàng đầu
    top_recommendations = recommendations.head(5)

    print("Top Recommended Recipes:")
    print(top_recommendations)

    relevance_scores = np.zeros(len(top_recommendations))

    # Calculate NDCG
    ndcg_score = calculate_ndcg(top_recommendations, relevance_scores, k=5)
    # print(f"NDCG Score: {ndcg_score}")

    # Calculate Coverage
    coverage_score = calculate_coverage(recommendations, len(dataset))
    # print(f"Coverage Score: {coverage_score}")

    return top_recommendations, ndcg_score, coverage_score

# Run model
test_input = dataset.iloc[0:1, 3:11].to_numpy()
recommendations, ndcg_score, coverage_score = recommend(dataset, test_input, max_list)



Top Recommended Recipes:
    Recipes ID  Time (mins)  Calories  Total Fat (g)  Saturated Fat (g)  \
26          27         25.0      17.7            1.7                1.1   
50          51         30.0      45.2            1.6                0.5   
56          57         45.0     114.3            8.6                2.0   
59          60         22.0      65.3            3.8                0.5   
75          76         32.0      22.0            0.1                0.0   

    Cholesterol (mg)  Sodium (mg)  Total Carbohydrate (g)  Dietary Fiber (g)  \
26               4.4        101.1                     0.7                0.2   
50               1.8        145.9                     6.5                2.2   
56               4.9        115.1                     6.5                2.0   
59               0.0         22.8                     7.4                1.4   
75               0.0         34.6                     5.0                1.5   

    Sugars (g)  Protein (g)      Diet Label



In [None]:
print("NDCG Score K Means:", ndcg_score)
print("Coverage Score K Means:", coverage_score)

NDCG Score K Means: 0.0
Coverage Score K Means: 0.09845165059888986


#DBSCAN

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
data = pd.read_csv('/content/drive/MyDrive/DS108/02.preprocess/dataset_final/dataset.csv')

# Drop duplicate rows
data = data.drop_duplicates()

# Insert 'Recipes ID' column
data.insert(0, 'Recipes ID', range(1, len(data) + 1))

# Select relevant columns
columns = ['Recipes ID', 'Time (mins)', 'Calories', 'Total Fat (g)', 'Saturated Fat (g)', 'Cholesterol (mg)',
           'Sodium (mg)', 'Total Carbohydrate (g)', 'Dietary Fiber (g)', 'Sugars (g)', 'Protein (g)',
           'Diet Label', 'Ingredient', 'Ingredient_units', 'Direction']
dataset = data[columns]

# Define maximum values for filtering
max_Calories = 2000  # kcal
max_daily_fat = 78  # grams
max_daily_Saturatedfat = 20  # grams
max_daily_Cholesterol = 300  # mg
max_daily_Sodium = 2300  # mg
max_daily_Carbohydrate = 275  # grams
max_daily_Fiber = 28  # grams
max_daily_Sugar = 50  # grams (added sugars)
max_daily_Protein = 50  # grams

max_list = [max_Calories, max_daily_fat, max_daily_Saturatedfat, max_daily_Cholesterol, max_daily_Sodium, max_daily_Carbohydrate, max_daily_Fiber, max_daily_Sugar, max_daily_Protein]

# Define functions
def scaling(dataframe):
    if len(dataframe) == 0:
        return None, None
    scaler = StandardScaler()
    prep_data = scaler.fit_transform(dataframe.iloc[:, 3:11])
    return prep_data, scaler

def tfidf_transform(ingredient_units):
    tfidf = TfidfVectorizer()
    ingredient_units_tfidf = tfidf.fit_transform(ingredient_units)
    return ingredient_units_tfidf, tfidf

def dbscan_predictor(prep_data, eps, min_samples):
    dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
    dbscan.fit(prep_data)
    return dbscan

def build_pipeline(dbscan, scaler, params):
    cluster_labels = dbscan.labels_
    transformer = FunctionTransformer(lambda X: [cluster_labels], validate=False)
    pipeline = Pipeline([
        ('scaler', scaler),
        ('DBSCAN', transformer)
    ])
    return pipeline

def extract_data(dataframe, ingredient_filter, max_nutritional_values):
    extracted_data = dataframe.copy()
    for column, maximum in zip(extracted_data.columns[3:11], max_nutritional_values):
        extracted_data = extracted_data[extracted_data[column] < maximum]
    return extracted_data

def apply_pipeline(pipeline, _input, extracted_data, scaler, dbscan):
    cluster_labels = dbscan.labels_
    input_scaled = scaler.transform(_input)
    input_label = dbscan.fit_predict(input_scaled)

    # Find the cluster of the input data point
    cluster_indices = np.where(cluster_labels == input_label[0])[0]
    if len(cluster_indices) == 0:
        return None, None

    recommendations = extracted_data.iloc[cluster_indices]
    recommendations_scaled = scaler.transform(recommendations.iloc[:, 3:11])
    similarities = cosine_similarity(input_scaled, recommendations_scaled)

    return recommendations, similarities

def recommand(dataframe, _input, max_nutritional_values, ingredient_filter=None, params={'eps': 0.5, 'min_samples': 5}):
    extracted_data = extract_data(dataframe, ingredient_filter, max_nutritional_values)
    if len(extracted_data) == 0:
        print("No data available after filtering")
        return None, None, None

    prep_data, scaler = scaling(extracted_data)
    if prep_data is None or scaler is None:
        print("No data available for scaling")
        return None, None, None

    ingredient_units_tfidf, tfidf = tfidf_transform(extracted_data['Ingredient_units'])
    dbscan = dbscan_predictor(prep_data, params['eps'], params['min_samples'])
    pipeline = build_pipeline(dbscan, scaler, params)

    _input_scaled = scaler.transform(_input)

    recommendations, similarities = apply_pipeline(pipeline, _input_scaled, extracted_data, scaler, dbscan)

    if recommendations is None or similarities is None:
        print("No recommendations found")
        return None, None, None

    if ingredient_filter is not None:
        filter_vector = tfidf.transform([' '.join(ingredient_filter)])
        recommendations_tfidf = tfidf.transform(recommendations['Ingredient_units'])
        if recommendations_tfidf.shape[0] == 0:
            print("No recommendations found after filtering by ingredients")
            return recommendations, similarities, None

        text_similarities = cosine_similarity(recommendations_tfidf, filter_vector).flatten()

        # Ensure the length matches before filtering
        if len(recommendations) == len(text_similarities):
            recommendations = recommendations[text_similarities > 0.0]
            text_similarities = text_similarities[text_similarities > 0.0]
        else:
            print("Warning: Length mismatch between recommendations and text similarities")
            text_similarities = None
    else:
        text_similarities = None

    print("Recommended Recipes:")
    print(recommendations)
    print("\nCosine Similarities (Numeric Features):")
    print(similarities)
    if text_similarities is not None:
        print("\nCosine Similarities (Text Features):")
        print(text_similarities)

    return recommendations, similarities, text_similarities

# Run model
test_input = dataset.iloc[0:1, 3:11].to_numpy()
ingredient_filter = ['garlic']
recommand(dataset, test_input, max_list, ingredient_filter=ingredient_filter, params={'eps': 0.5, 'min_samples': 5})


No recommendations found




(None, None, None)