In [16]:
import sys
sys.path.append('../')

import pandas as pd

from gensim.models.phrases import Phraser
from src.data_preprocessing.text_preprocessing import (
    preprocess_food_item,
    preprocess_food_list,
)

from src.data_preprocessing.flavor_graph_preprocessing import (
    create_wine_nodes,
    create_food_wine_edges
)

In [17]:
NODES_PATH = '../data/flavor_graph/nodes_191120.csv'
EDGES_PATH = '../data/flavor_graph/edges_191120.csv'
FOOD_LIST_PATH = '../data/list_of_foods.csv'
FOOD_TRIGRAMS_PATH = '../data/food_trigrams.pkl'
# FOOD_WINE_SIMILARITY_PATH = '../data/food_wine_similarity.csv' # TODO

food_trigram_model = Phraser.load(FOOD_TRIGRAMS_PATH)

In [18]:
nodes_df = pd.read_csv(NODES_PATH)
nodes_df.head()

Unnamed: 0,node_id,name,id,node_type,is_hub
0,0,1%_fat_buttermilk,,ingredient,no_hub
1,1,1%_fat_cottage_cheese,,ingredient,no_hub
2,3,10%_cream,,ingredient,no_hub
3,4,100%_bran,,ingredient,no_hub
4,5,10_inch_flour_tortilla,,ingredient,no_hub


In [19]:
mask = (nodes_df['node_type'] == 'ingredient') & (nodes_df['is_hub'] == 'hub')
hub_df= nodes_df.loc[mask]
print(len(hub_df))
hub_df.head()

416


Unnamed: 0,node_id,name,id,node_type,is_hub
33,38,abalone,,ingredient,hub
41,50,acorn,,ingredient,hub
48,57,adzuki_bean,,ingredient,hub
49,58,agar,,ingredient,hub
52,61,agave,,ingredient,hub


In [20]:
food_df = pd.read_csv(FOOD_LIST_PATH)
food_list = list(food_df["Food"])

food_list_preprocessed = preprocess_food_list(food_list, food_trigram_model)
food_list_preprocessed[:5]

['beef', 'liver', 'french', 'tlayuda', 'dairi']

In [22]:
hub_df.loc[:, 'name_norm'] = hub_df['name'].apply(lambda x: preprocess_food_item(x, food_trigram_model))
hub_df.head()

Unnamed: 0,node_id,name,id,node_type,is_hub,name_norm
33,38,abalone,,ingredient,hub,abalon
41,50,acorn,,ingredient,hub,acorn
48,57,adzuki_bean,,ingredient,hub,adzukibean
49,58,agar,,ingredient,hub,agar
52,61,agave,,ingredient,hub,agav


In [23]:
# take intersection of food list and hub list
intersection_hub_food = list(set(hub_df['name_norm']).intersection(set(food_list_preprocessed)))
print(len(intersection_hub_food))

intersection_hub_food[:5]

193


['beef', 'caper', 'pate', 'alfalfa', 'oliv']

In [24]:
hub_df_filtered = hub_df[hub_df['name_norm'].isin(intersection_hub_food)]

In [25]:
hub_df_filtered.head()

Unnamed: 0,node_id,name,id,node_type,is_hub,name_norm
41,50,acorn,,ingredient,hub,acorn
68,77,alfalfa,,ingredient,hub,alfalfa
79,88,almond,,ingredient,hub,almond
125,135,anchovy,,ingredient,hub,anchovi
138,148,anise,,ingredient,hub,anis


In [26]:
# mock food_item - wine_item similarity dataframe
mock_data = {
    'food_item': ['anchovi', 'gelatin', 'ginger', 'gooseberri', 'grape', 'grapefruit', 'guava'],
    'wine_item': ['wine_name1', 'wine_name2', 'wine_name3', 'wine_name4', 'wine_name5', 'wine_name6', 'wine_name7'],
    'similarity': [0.54, 0.73, 0.23, 0.12, 0.65, 0.87, 0.34]
}
food_wine_similarity_df = pd.DataFrame(mock_data)
food_wine_similarity_df.head()

Unnamed: 0,food_item,wine_item,similarity
0,anchovi,wine_name1,0.54
1,gelatin,wine_name2,0.73
2,ginger,wine_name3,0.23
3,gooseberri,wine_name4,0.12
4,grape,wine_name5,0.65


In [27]:
# Add food_name column to food_wine_similarity_df - non-normalized food name
food_wine_similarity_df['food_name'] = None
for index, row in food_wine_similarity_df.iterrows():
    food_wine_similarity_df.loc[index, 'food_name'] = hub_df_filtered[hub_df_filtered['name_norm'] == row['food_item']]['name'].values[0]
food_wine_similarity_df.head()

Unnamed: 0,food_item,wine_item,similarity,food_name
0,anchovi,wine_name1,0.54,anchovy
1,gelatin,wine_name2,0.73,gelatin
2,ginger,wine_name3,0.23,ginger
3,gooseberri,wine_name4,0.12,gooseberry
4,grape,wine_name5,0.65,grape


In [28]:
MAX_ID = nodes_df['node_id'].max()
wine_items = list(food_wine_similarity_df['wine_item'].unique())

In [29]:
# ADD WINE ITEMS TO NODES
wine_nodes_df = create_wine_nodes(wine_items, MAX_ID)
display(wine_nodes_df.head())

nodes_with_wine_df = pd.concat([nodes_df, wine_nodes_df], ignore_index=True)
nodes_with_wine_df.to_csv('../data/flavor_graph/nodes_with_wine.csv', index=False)

Unnamed: 0,node_id,name,node_type,is_hub
0,8748,wine_name1,wine,wine
1,8749,wine_name2,wine,wine
2,8750,wine_name3,wine,wine
3,8751,wine_name4,wine,wine
4,8752,wine_name5,wine,wine


In [30]:
edges_df = pd.read_csv(EDGES_PATH)
edges_df.head(3)

Unnamed: 0,id_1,id_2,score,edge_type
0,5063,6083,0.337742,ingr-ingr
1,244,4620,0.093855,ingr-ingr
2,2253,6753,0.10787,ingr-ingr


In [31]:
# ADD EDGES BETWEEN HUB ITEMS AND WINE ITEMS
food_wine_edges_df = create_food_wine_edges(food_wine_similarity_df, nodes_with_wine_df)

edges_with_wine_df = pd.concat([edges_df, food_wine_edges_df], ignore_index=True)
display(edges_with_wine_df.tail(10))
edges_with_wine_df.to_csv('../data/flavor_graph/edges_with_wine.csv', index=False)

Unnamed: 0,id_1,id_2,score,edge_type
147176,1845,8715,,ingr-dcomp
147177,1845,8724,,ingr-dcomp
147178,1845,8723,,ingr-dcomp
147179,135,8748,0.54,ingr-wine
147180,2821,8749,0.73,ingr-wine
147181,2837,8750,0.23,ingr-wine
147182,2914,8751,0.12,ingr-wine
147183,2947,8752,0.65,ingr-wine
147184,2955,8753,0.87,ingr-wine
147185,3133,8754,0.34,ingr-wine
