# 4-dimensional multi-vector test search

In this file, different food search methods were tested in order to find the best strategy for finding recommendation results in the system.

## Macronutrientes

In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
ruta_csv = "../datasets/foundation-food.csv"
df = pd.read_csv(ruta_csv, sep=',')

In [3]:
selected_cols_with_id_and_name = ["id", "name", "calories", "protein", "total_fat", "carbs"]

In [4]:
tmp = df[selected_cols_with_id_and_name]

In [5]:
tmp

Unnamed: 0,id,name,calories,protein,total_fat,carbs
0,1073,"CREMA, ÁCIDA O AGRIA, BAJA GRASA",181.0,7.00,14.10,7.00
1,15083,"AZUCAR BLANCA, GRANULADA, FORTIF.C/VIT A Y HIE...",387.0,0.00,0.00,99.80
2,1074,"CREMA, ÁCIDA O AGRIA, CULTIVADA",198.0,2.44,19.35,4.63
3,1070,"CREMA, ACIDA O AGRIA, CULTIVADA, BAJA GRASA",135.0,2.94,12.00,4.26
4,1071,"CREMA, ACIDA O AGRIA, LIVIANA",136.0,3.50,10.60,7.10
...,...,...,...,...,...,...
2652,25044,"PASTA, SIN GLUTEN (MAIZ) COCIDA",126.0,2.63,0.73,27.91
2653,25045,"PASTA, SIN GLUTEN (MAIZ) CRUDA",357.0,7.46,2.08,79.26
2654,25046,"PASTA, SIN GLUTEN (MAIZ, ARROZ), COCIDO",179.0,3.20,1.00,38.05
2655,25047,"PRETZELS, SIN GLUTEN (MAICENA, HARINA PAPA)",389.0,3.52,6.67,78.62


In [6]:
selected_cols = ["calories", "protein", "total_fat", "carbs"]

In [7]:
foods = df[selected_cols]

In [8]:
foods.head()

Unnamed: 0,calories,protein,total_fat,carbs
0,181.0,7.0,14.1,7.0
1,387.0,0.0,0.0,99.8
2,198.0,2.44,19.35,4.63
3,135.0,2.94,12.0,4.26
4,136.0,3.5,10.6,7.1


In [9]:
# Eliminar los registros con valores nulos en 'sugar' o 'total_fat'
foods = foods.dropna(subset=selected_cols)

# Verificar el número de registros eliminados
print(f"Registros eliminados: {len(df) - len(foods)}")

foodsBK = foods.copy()

Registros eliminados: 5


# Search algorithm

In [10]:
def calculate_similarity(df, target):
    # Keep only the columns that are not None in the target
    used_columns = [col for col in selected_cols if target.get(col) is not None]

    if not used_columns:
        raise ValueError("You must specify at least one variable to compare.")

    # Filter the DataFrame to use only the relevant columns
    filtered_df = df[used_columns].copy()

    # Normalize the data (values between 0 and 1)
    df_normalized = (filtered_df - filtered_df.min()) / (filtered_df.max() - filtered_df.min())

    # Normalize the target values as well
    target_normalized = {
        col: (target[col] - filtered_df[col].min()) / (filtered_df[col].max() - filtered_df[col].min())
        for col in used_columns
    }

    # Calculate Euclidean distance between each row and the target
    distances = np.linalg.norm(df_normalized.values - np.array([target_normalized[col] for col in used_columns]), axis=1)

    # Convert to similarity percentage (100% = perfect match)
    similarity = 1 - distances / np.sqrt(len(used_columns))
    df["similarity_%"] = (similarity * 100).round(2)

    # Sort and return top 5 matches
    return df.sort_values(by="similarity_%", ascending=False).head(5)

In [11]:
# Example
target_input = {"calories": 389, "protein": 3.52, "total_fat": None, "carbs": 78}

start_time = time.time()
top_matches = calculate_similarity(df, target_input)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Execution time: {elapsed_time:.4f} seconds")
top_matches[["id", "food_category", "name", "similarity_%"]]

Execution time: 0.0077 seconds


Unnamed: 0,id,food_category,name,similarity_%
2655,25047,25,"PRETZELS, SIN GLUTEN (MAICENA, HARINA PAPA)",99.7
1353,15040,15,"CARAMELOS, TIPO TOFFEES",98.99
1678,18024,18,"PASTEL, CHOCOLATE, HARINA COMERCIAL",98.7
539,10038,10,"CASTANA, EUROPEA, SECA, PELADA",98.32
1285,14014,14,PAN DE ARROZ O QUESADILLA,98.01


# Search algorithm scaled

In [12]:
def calculate_scaled_similarity(df: pd.DataFrame, target: dict, selected_cols=None, portion_range=(0.1, 5.0), portion_step=0.1):    
    if selected_cols is None:
        selected_cols = ["calories", "protein", "total_fat", "carbs"]

    # Filter only valid input columns
    valid_cols = [col for col in selected_cols if target.get(col) is not None]
    if not valid_cols:
        raise ValueError("No valid target values provided.")

    # Normalize the data
    normalized_df = df[valid_cols].copy()
    col_mins = normalized_df.min()
    col_maxs = normalized_df.max()
    normalized_df = (normalized_df - col_mins) / (col_maxs - col_mins)

    # Normalize target
    normalized_target = []
    for col in valid_cols:
        value = target[col]
        norm_value = (value - col_mins[col]) / (col_maxs[col] - col_mins[col])
        normalized_target.append(norm_value)
    normalized_target = np.array(normalized_target)

    results = []

    for idx, row in normalized_df.iterrows():
        best_similarity = -1
        best_portion = 0

        # Test different portion factors (0.1 to 5.0)
        for factor in np.arange(portion_range[0], portion_range[1] + portion_step, portion_step):
            scaled_row = row * factor
            distance = np.linalg.norm(scaled_row - normalized_target)
            max_possible_distance = np.sqrt(len(valid_cols))
            similarity = 1 - (distance / max_possible_distance)
            
            if similarity > best_similarity:
                best_similarity = similarity
                best_portion = factor

        # Save original row with similarity and best portion
        result_row = df.loc[idx].copy()
        result_row["similarity_%"] = round(best_similarity * 100, 2)
        result_row["ideal_portions"] = round(best_portion, 2)
        results.append(result_row)

    # Create DataFrame of results
    result_df = pd.DataFrame(results)
    result_df = result_df.sort_values(by="similarity_%", ascending=False)

    return result_df.head(5)

In [13]:
# Example
target_input_2 = {"calories": 389*2, "protein": 3.52*2, "total_fat": None, "carbs": 78*2}

start_time = time.time()
top_matches_2 = calculate_scaled_similarity(df, target_input_2)
# top_matches_2 = calculate_scaled_similarity(df, target_input_2, None, (0.1, 5.0), 0.5)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Execution time: {elapsed_time:.4f} seconds")
top_matches_2[["id", "food_category", "name", "similarity_%", "ideal_portions"]]

Execution time: 19.0827 seconds


Unnamed: 0,id,food_category,name,similarity_%,ideal_portions
2655,25047,25,"PRETZELS, SIN GLUTEN (MAICENA, HARINA PAPA)",99.4,2.0
1285,14014,14,PAN DE ARROZ O QUESADILLA,98.53,2.1
2373,24182,24,EMPANADA DE PLATANO C/FRIJOL (EL SALVADOR),98.49,3.4
1698,18094,18,"PASTELITOS RELLENOS, AZUCAR Y CANELA",98.02,2.1
1353,15040,15,"CARAMELOS, TIPO TOFFEES",97.98,2.0


# Search algorithm scaled using binary search

In [14]:
def binary_search_best_portion(row, normalized_target, left=0.1, right=5.0, epsilon=1e-3):
    """
    Perform binary search to find the best portion factor that maximizes similarity.
    """
    while right - left > epsilon:
        mid = (left + right) / 2
        
        # Calculate the scaled row for the current middle point
        scaled_row = row * mid
        
        # Calculate the Euclidean distance
        distance = np.linalg.norm(scaled_row - normalized_target)
        max_possible_distance = np.sqrt(len(scaled_row))
        similarity = 1 - (distance / max_possible_distance)
        
        # Adjust the search range based on similarity
        left_scaled_row = row * (mid - epsilon)
        left_distance = np.linalg.norm(left_scaled_row - normalized_target)
        left_similarity = 1 - (left_distance / max_possible_distance)

        right_scaled_row = row * (mid + epsilon)
        right_distance = np.linalg.norm(right_scaled_row - normalized_target)
        right_similarity = 1 - (right_distance / max_possible_distance)
        
        # If the middle value is the best, we found the max similarity point
        if similarity >= left_similarity and similarity >= right_similarity:
            return mid, similarity
        
        # Narrow the search range to the best side
        if left_similarity > similarity:
            right = mid
        else:
            left = mid
    
    return mid, similarity



In [15]:
def calculate_scaled_similarity_with_binary_search(df: pd.DataFrame, target: dict, selected_cols=None, portion_range=(0.1, 5.0), portion_step=0.1):
    if selected_cols is None:
        selected_cols = ["calories", "protein", "total_fat", "carbs"]

    # Filter only valid input columns
    valid_cols = [col for col in selected_cols if target.get(col) is not None]
    if not valid_cols:
        raise ValueError("No valid target values provided.")

    # Normalize the data
    normalized_df = df[valid_cols].copy()
    col_mins = normalized_df.min()
    col_maxs = normalized_df.max()
    normalized_df = (normalized_df - col_mins) / (col_maxs - col_mins)

    # Normalize target
    normalized_target = []
    for col in valid_cols:
        value = target[col]
        norm_value = (value - col_mins[col]) / (col_maxs[col] - col_mins[col])
        normalized_target.append(norm_value)
    normalized_target = np.array(normalized_target)

    results = []

    for idx, row in normalized_df.iterrows():
        # Find the best portion and similarity using binary search
        best_portion, best_similarity = binary_search_best_portion(row, normalized_target, left=portion_range[0], right=portion_range[1])

        # Save original row with similarity and best portion
        result_row = df.loc[idx].copy()
        result_row["similarity_%"] = round(best_similarity * 100, 2)
        result_row["ideal_portions"] = round(best_portion, 2)
        results.append(result_row)

    # Create DataFrame of results
    result_df = pd.DataFrame(results)
    result_df = result_df.sort_values(by="similarity_%", ascending=False)

    return result_df.head(5)

In [16]:
# Example 3
target_input_3 = {"calories": 389*2, "protein": 3.52*2, "total_fat": None, "carbs": 78*2}

start_time = time.time()
top_matches_3 = calculate_scaled_similarity_with_binary_search(df, target_input_3)
# top_matches_3 = calculate_scaled_similarity_with_binary_search(df, target_input_2, None, (0.1, 5.0), 0.5)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Execution time: {elapsed_time:.4f} seconds")
top_matches_3[["id", "food_category", "name", "similarity_%", "ideal_portions"]]

Execution time: 14.4979 seconds


Unnamed: 0,id,food_category,name,similarity_%,ideal_portions
2655,25047,25,"PRETZELS, SIN GLUTEN (MAICENA, HARINA PAPA)",99.67,1.99
1285,14014,14,PAN DE ARROZ O QUESADILLA,98.65,2.09
2373,24182,24,EMPANADA DE PLATANO C/FRIJOL (EL SALVADOR),98.62,3.42
1698,18094,18,"PASTELITOS RELLENOS, AZUCAR Y CANELA",98.55,2.13
1353,15040,15,"CARAMELOS, TIPO TOFFEES",98.36,2.03


# Search algorithm using pre compute

In [17]:
def precompute_projections_and_create_df(df, selected_cols, portion_range=(0.1, 5.0), portion_step=0.1):
    # Precomputar las proyecciones para todos los alimentos y factores de porción
    results = []
    
    # Generar todas las combinaciones de porciones para los alimentos
    for idx, row in df[selected_cols].iterrows():
        for factor in np.arange(portion_range[0], portion_range[1] + portion_step, portion_step):
            # Crear la proyección escalada pero sin normalizar
            scaled_row = row * factor
            
            # Añadir el food_idx y el factor de porción a la fila
            results.append({
                'food_idx': idx,
                'portion_factor': factor,
                **dict(zip(selected_cols, scaled_row))  # Añadir las columnas seleccionadas con sus valores escalados
            })
    
    # Crear DataFrame con todas las combinaciones
    return pd.DataFrame(results)

In [18]:
def filter_by_food_idx(df, food_idx_list):
    filtered_df = df[df['food_idx'].isin(food_idx_list)]
    
    return filtered_df

In [19]:
foods_and_projections = precompute_projections_and_create_df(df, selected_cols)

In [20]:
foods_and_projections

Unnamed: 0,food_idx,portion_factor,calories,protein,total_fat,carbs
0,0,0.1,18.1,0.700,1.410,0.700
1,0,0.2,36.2,1.400,2.820,1.400
2,0,0.3,54.3,2.100,4.230,2.100
3,0,0.4,72.4,2.800,5.640,2.800
4,0,0.5,90.5,3.500,7.050,3.500
...,...,...,...,...,...,...
132845,2656,4.6,1209.8,12.512,40.664,198.030
132846,2656,4.7,1236.1,12.784,41.548,202.335
132847,2656,4.8,1262.4,13.056,42.432,206.640
132848,2656,4.9,1288.7,13.328,43.316,210.945


In [21]:
# Example
target_input_4 = {"calories": 389, "protein": 3.52, "total_fat": None, "carbs": 78}

start_time = time.time()
top_matches_4 = calculate_similarity(foods_and_projections, target_input_4)
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Execution time: {elapsed_time:.4f} seconds")
food_idx_array = top_matches_4["food_idx"].to_numpy()
top_matches_4

Execution time: 0.0289 seconds


Unnamed: 0,food_idx,portion_factor,calories,protein,total_fat,carbs,similarity_%
132759,2655,1.0,389.0,3.52,6.67,78.62,99.94
91289,1825,4.0,384.0,3.2,6.4,78.0,99.92
118666,2373,1.7,377.4,3.672,19.006,78.2,99.85
67659,1353,1.0,382.0,4.6,8.1,77.0,99.8
91290,1825,4.1,393.6,3.28,6.56,79.95,99.8


In [22]:
filtered_results = df.loc[df.index.isin(food_idx_array)]
filtered_results = filtered_results.sort_values(by="similarity_%", ascending=False)

In [23]:
filtered_results[["id", "food_category", "name"]]

Unnamed: 0,id,food_category,name
2655,25047,25,"PRETZELS, SIN GLUTEN (MAICENA, HARINA PAPA)"
1353,15040,15,"CARAMELOS, TIPO TOFFEES"
2373,24182,24,EMPANADA DE PLATANO C/FRIJOL (EL SALVADOR)
1825,19076,19,"COMIDA-BEBE, POSTRE, MANZANA C/YOGUR"
