## Flavor Graph - data preprocessing

In [1]:
import sys
sys.path.append('../')

import pandas as pd

from gensim.models.phrases import Phraser
from src.data_preprocessing.text_preprocessing import (
    preprocess_food_item,
    preprocess_food_list,
)

from src.data_preprocessing.flavor_graph_preprocessing import (
    create_wine_nodes,
    create_food_wine_edges
)

In [2]:
# Input locations. These are relative paths, so they resolve against the
# kernel's working directory.
# Flavor-graph node table; read into nodes_df further down.
NODES_PATH = '../data/flavor_graph/nodes_191120.csv'
# Flavor-graph edge table; read into edges_df further down.
EDGES_PATH = '../data/flavor_graph/edges_191120.csv'
# CSV whose "Food" column supplies the raw food names.
FOOD_LIST_PATH = '../data/list_of_foods.csv'
# Saved gensim Phraser that is passed to the text-preprocessing helpers.
FOOD_TRIGRAMS_PATH = '../data/food_trigrams.pkl'
# Food -> top1/top2/top3 wine pairing table (first CSV column is used as index).
FOOD_WINE_PAIRING_PATH = '../data/pairing_top3.csv'

# NOTE(review): Phraser.load presumably unpickles the file (".pkl"), so only
# load model files from a trusted source -- confirm against the gensim docs.
food_trigram_model = Phraser.load(FOOD_TRIGRAMS_PATH)

In [3]:
# Load the flavor-graph node table and preview its first five rows.
nodes_df = pd.read_csv(filepath_or_buffer=NODES_PATH)
nodes_df.head(n=5)

Unnamed: 0,node_id,name,id,node_type,is_hub
0,0,1%_fat_buttermilk,,ingredient,no_hub
1,1,1%_fat_cottage_cheese,,ingredient,no_hub
2,3,10%_cream,,ingredient,no_hub
3,4,100%_bran,,ingredient,no_hub
4,5,10_inch_flour_tortilla,,ingredient,no_hub


In [4]:
# Select the ingredient nodes that are flagged as hubs.
mask = (nodes_df['node_type'] == 'ingredient') & (nodes_df['is_hub'] == 'hub')
# Take an explicit copy: a later cell adds a column to hub_df, and writing
# into a filtered selection of nodes_df raises SettingWithCopyWarning and
# leaves it unclear which frame is being modified.
hub_df = nodes_df.loc[mask].copy()
print(len(hub_df))
hub_df.head()

416


Unnamed: 0,node_id,name,id,node_type,is_hub
33,38,abalone,,ingredient,hub
41,50,acorn,,ingredient,hub
48,57,adzuki_bean,,ingredient,hub
49,58,agar,,ingredient,hub
52,61,agave,,ingredient,hub


In [5]:
# Read the raw food names from the "Food" column and run them through
# preprocess_food_list together with the trigram phraser.
food_df = pd.read_csv(filepath_or_buffer=FOOD_LIST_PATH)
food_list = food_df["Food"].tolist()

food_list_preprocessed = preprocess_food_list(food_list, food_trigram_model)
food_list_preprocessed[:5]

['alfalfa', 'chichi', 'bamboo', 'lorrain', 'barbacoa']

In [16]:
# Add a name_norm column: each hub name passed through preprocess_food_item
# with the trigram phraser. assign() returns a new frame, so nothing is written
# into a filtered selection of nodes_df (no SettingWithCopyWarning) and the
# cell can be re-run safely.
hub_df = hub_df.assign(
    name_norm=hub_df['name'].apply(lambda x: preprocess_food_item(x, food_trigram_model))
)
hub_df.head()

Unnamed: 0,node_id,name,id,node_type,is_hub,name_norm
33,38,abalone,,ingredient,hub,abalon
41,50,acorn,,ingredient,hub,acorn
48,57,adzuki_bean,,ingredient,hub,adzukibean
49,58,agar,,ingredient,hub,agar
52,61,agave,,ingredient,hub,agav


In [7]:
# Normalized names that occur both among the hub nodes and in the food list.
intersection_hub_food = list(
    set(hub_df['name_norm']) & set(food_list_preprocessed)
)
print(len(intersection_hub_food))

intersection_hub_food[:5]

193


['alfalfa', 'sage', 'strawberri', 'kumquat', 'endiv']

In [8]:
# Keep only the hub rows whose normalized name is in the hub/food intersection.
hub_df_filtered = hub_df.loc[hub_df['name_norm'].isin(intersection_hub_food)]

In [9]:
# Preview the first five filtered hub rows.
hub_df_filtered.head(n=5)

Unnamed: 0,node_id,name,id,node_type,is_hub,name_norm
41,50,acorn,,ingredient,hub,acorn
68,77,alfalfa,,ingredient,hub,alfalfa
79,88,almond,,ingredient,hub,almond
125,135,anchovy,,ingredient,hub,anchovi
138,148,anise,,ingredient,hub,anis


In [10]:
# Load the food -> wine pairing table; the first CSV column becomes the index.
food_wine_pairing_df = pd.read_csv(
    FOOD_WINE_PAIRING_PATH,
    index_col=0,
)
food_wine_pairing_df.head()

Unnamed: 0,top1,top2,top3
cauliflow,"Pinot Noir, Pernand-Vergelesses, Burgundy, F...","Bordeaux-style Red Blend, Long Island, New Y...","Bordeaux-style Red Blend, , Stellenbosch, Sou..."
scallop,"Bordeaux-style Red Blend, , Stellenbosch, Sou...","Bordeaux-style Red Blend, Washington Other, ...","Malbec-Cabernet Sauvignon, Bordeaux-style Red ..."
soup,"Pinot Noir, , Niederösterreich, Austria","Gamay, Morgon, Beaujolais, France","Pinot Noir, Yarra Valley, Victoria, Australia"
hungri,"Pinot Noir, Crémant dAlsace, Alsace, France","Pinot Nero, Pinot Noir, Alto Adige, Northeast...","Red Blends, Red Blends, Terra Alta, Catalonia..."
prune,"Red Blends, Red Blends, Fronton, Southwest Fr...","Red Blends, Red Blends, Jumilla, Levante, Spain","Chardonnay, Franciacorta, Lombardy, Italy"


In [11]:
# Add a food_name column holding the original (non-normalized) hub name for
# each pairing row. The pairing index is matched against name_norm in
# hub_df_filtered; if several hub rows share a name_norm, the first one wins.
norm_to_name = (
    hub_df_filtered
    .drop_duplicates(subset='name_norm', keep='first')  # unique keys are required by Index.map
    .set_index('name_norm')['name']
)
# One vectorized lookup instead of re-filtering hub_df_filtered for every row.
# Index values without a matching hub row come back as NaN.
food_wine_pairing_df['food_name'] = food_wine_pairing_df.index.map(norm_to_name)

# Drop rows with no food_name i.e. outside of the intersection of food_list and hub_list
print(f"Size before droppng nans: {len(food_wine_pairing_df)}")
food_wine_pairing_df = food_wine_pairing_df.dropna(subset=['food_name'])
print(f"Size after droppng nans: {len(food_wine_pairing_df)}")
food_wine_pairing_df.head()

Size before droppng nans: 527
Size after droppng nans: 152


Unnamed: 0,top1,top2,top3,food_name
cauliflow,"Pinot Noir, Pernand-Vergelesses, Burgundy, F...","Bordeaux-style Red Blend, Long Island, New Y...","Bordeaux-style Red Blend, , Stellenbosch, Sou...",cauliflower
scallop,"Bordeaux-style Red Blend, , Stellenbosch, Sou...","Bordeaux-style Red Blend, Washington Other, ...","Malbec-Cabernet Sauvignon, Bordeaux-style Red ...",scallop
soup,"Pinot Noir, , Niederösterreich, Austria","Gamay, Morgon, Beaujolais, France","Pinot Noir, Yarra Valley, Victoria, Australia",soup
thyme,"Pinot Noir, Pernand-Vergelesses, Burgundy, F...","Zinfandel, California Other, California, US","Zinfandel, Napa, California, US",thyme
potato,"Bordeaux-style Red Blend, Washington Other, ...","Bordeaux-style Red Blend, , Stellenbosch, Sou...","Malbec-Cabernet Sauvignon, Bordeaux-style Red ...",potato


In [12]:
# Largest node id currently present in the node table.
MAX_ID = nodes_df['node_id'].max()
print(f"Max node id: {MAX_ID}")

# Distinct wine descriptions appearing in any of the three pairing columns.
wine_items = (
    set(food_wine_pairing_df['top1'].unique())
    .union(set(food_wine_pairing_df['top2'].unique()))
    .union(set(food_wine_pairing_df['top3'].unique()))
)
print(f"Number of wine items: {len(wine_items)}")

Max node id: 8747
Number of wine items: 107


In [13]:
# Build node rows for the wine items, append them to the existing node table
# and write the combined table to disk.
# NOTE(review): wine_items is a set; if create_wine_nodes numbers wines in
# iteration order, the ids may differ between kernel sessions -- confirm, and
# consider passing a sorted sequence if stable ids matter.
wine_nodes_df = create_wine_nodes(wine_items, MAX_ID)
display(wine_nodes_df.head())

nodes_with_wine_df = pd.concat((nodes_df, wine_nodes_df), ignore_index=True)
nodes_with_wine_df.to_csv(
    '../data/flavor_graph/nodes_with_wine.csv',
    index=False,
)

Unnamed: 0,node_id,name,node_type,is_hub
0,8748,"Bordeaux-style Red Blend, Francs Côtes de Bord...",wine,wine
1,8749,"Red Blends, Red Blends, Bolgheri, Tuscany, I...",wine,wine
2,8750,"Turbiana, Italian White, Lugana, Lombardy, I...",wine,wine
3,8751,"Bordeaux-style Red Blend, Saint-Émilion, Bord...",wine,wine
4,8752,"Cabernet Sauvignon, Coonawarra, South Austral...",wine,wine


In [14]:
# Load the flavor-graph edge table and preview its first three rows.
edges_df = pd.read_csv(filepath_or_buffer=EDGES_PATH)
edges_df.head(n=3)

Unnamed: 0,id_1,id_2,score,edge_type
0,5063,6083,0.337742,ingr-ingr
1,244,4620,0.093855,ingr-ingr
2,2253,6753,0.10787,ingr-ingr


In [15]:
# Create the food-to-wine edges, append them to the existing edge table and
# write the combined table to disk.
food_wine_edges_df = create_food_wine_edges(food_wine_pairing_df, nodes_with_wine_df)
print(f"Number of food-wine edges: {len(food_wine_edges_df)}")

edges_with_wine_df = pd.concat((edges_df, food_wine_edges_df), ignore_index=True)
display(edges_with_wine_df.tail(n=10))
edges_with_wine_df.to_csv(
    '../data/flavor_graph/edges_with_wine.csv',
    index=False,
)

Number of food-wine edges: 456


Unnamed: 0,id_1,id_2,score,edge_type
147625,3370,8781,0.333333,ingr-wine
147626,1005,8830,1.0,ingr-wine
147627,1005,8843,0.5,ingr-wine
147628,1005,8806,0.333333,ingr-wine
147629,3565,8814,1.0,ingr-wine
147630,3565,8843,0.5,ingr-wine
147631,3565,8826,0.333333,ingr-wine
147632,302,8819,1.0,ingr-wine
147633,302,8835,0.5,ingr-wine
147634,302,8756,0.333333,ingr-wine
