In [1]:
import pandas as pd
from rapidfuzz import process, fuzz
import os

In [65]:
my_grocery_list_df = pd.read_csv('sample_list.csv', index_col= False, header=0)
my_grocery_list_df = my_grocery_list_df.drop(columns=['Unnamed: 0'])
my_grocery_list_df.columns = ['Ingredients', 'Amount', 'Quantity']

In [66]:
my_grocery_list_df.head()

Unnamed: 0,Ingredients,Amount,Quantity
0,Rijst,300,gram
1,Kip,300,gram
2,Yoghurt,100,ml
3,Uien,2,stuks
4,Tomaten,2,stuks


In [67]:
df_all_products = pd.read_csv("df_all_products.csv")
df_all_products.head(2)

Unnamed: 0.1,Unnamed: 0,product_id,product_name,product_category,product_link,product_price,product_quantity,store_name
0,0,albert_heijn_1,Zespri kiwi sungoldAdvertentie,"Aardappel, groente, fruit",https://www.ah.nl/producten/product/wi367212/z...,5.99,750 g,albert_heijn
1,1,albert_heijn_2,Zespri Kiwi goldAdvertentie,"Aardappel, groente, fruit",https://www.ah.nl/producten/product/wi523724/z...,2.99,3 stuks,albert_heijn


In [68]:
# Step 2: Define a function for fuzzy matching with a confidence threshold
def fuzzy_match(item_name, choices, scorer=fuzz.WRatio, threshold=90):
    match = process.extractOne(item_name, choices, scorer=scorer)
    if match and match[1] >= threshold:
        return match[0]
    return None

# Step 3: Create a list to hold the matches
matches = []

# Step 4: Perform fuzzy matching for each item in my_grocery_list_df with a high confidence threshold
for item in my_grocery_list_df['Ingredients']:
    match = fuzzy_match(item, df_all_products['product_name'], threshold=90)
    matches.append(match)

# Step 5: Add the matches to the my_grocery_list_df
my_grocery_list_df['Matched Product'] = matches

# Step 6: Merge the DataFrames on the matched product name to include 'Price' and 'Image URL'
merged_df = pd.merge(
    my_grocery_list_df, 
    df_all_products[['product_name', 'product_price', 'product_link', 'product_quantity', 'store_name']], 
    how='left', 
    left_on='Matched Product', 
    right_on='product_name'
)

# Drop the duplicate 'Product Name' column
merged_df.drop(columns=['product_name'], inplace=True)

# Display the merged DataFrame
merged_df.head()

Unnamed: 0,Ingredients,Amount,Quantity,Matched Product,product_price,product_link,product_quantity,store_name
0,Rijst,300,gram,AH Biologisch Rijstwafels naturel,0.75,https://www.ah.nl/producten/product/wi58492/ah...,13 stuks,albert_heijn
1,Rijst,300,gram,AH Biologisch Rijstwafels naturel,0.75,https://www.ah.nl/producten/product/wi58492/ah...,13 stuks,albert_heijn
2,Kip,300,gram,Wahid Kipfilet naturel,2.49,https://www.ah.nl/producten/product/wi162009/w...,125 g,albert_heijn
3,Kip,300,gram,Wahid Kipfilet naturel,2.49,https://www.ah.nl/producten/product/wi162009/w...,125 g,albert_heijn
4,Kip,300,gram,Wahid Kipfilet naturel,2.49,https://www.ah.nl/producten/product/wi162009/w...,125 g,albert_heijn


In [69]:
print(merged_df.to_string())

          Ingredients      Amount    Quantity                                Matched Product product_price                                                                                                                                                                                      product_link product_quantity    store_name
0               Rijst         300        gram              AH Biologisch Rijstwafels naturel          0.75                                                                                                                     https://www.ah.nl/producten/product/wi58492/ah-biologisch-rijstwafels-naturel         13 stuks  albert_heijn
1               Rijst         300        gram              AH Biologisch Rijstwafels naturel          0.75                                                                                                                     https://www.ah.nl/producten/product/wi58492/ah-biologisch-rijstwafels-naturel         13 stuks  albert_heijn
2   