In [1]:
from utils import *
import pandas as pd
from transformers import BertTokenizer, BertModel
from sklearn.decomposition import PCA
import torch
from torch_geometric.data import HeteroData
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Benchmark Construction

In [2]:
# Load in necessary data.
df_fndds = pd.read_csv('../processed_data/fndds.csv')
df_nutrition = pd.read_csv('../processed_data/food_tagging.csv')
df_user = pd.read_csv('../processed_data/user_tagging.csv')
df_user_food = pd.read_csv('../processed_data/food_user.csv')

In [3]:
# Step 1: Extract relevant columns
columns_to_extract = ['SEQN', 'gender', 'race', 'household_income', 'education', 'age_group']
df_user_extracted = df_user[columns_to_extract]

# Step 2: One-Hot Encode the categorical columns
df_user_vector = pd.get_dummies(df_user_extracted, columns=['gender', 'race', 'household_income', 'education', 'age_group'])

In [4]:
# Step 1: Concatenate Ingredient Descriptions by Food
ingredient_concat = df_fndds.groupby('food_id')['ingredient_desc'].apply(lambda x: ' '.join(x)).reset_index()
df_fndds = df_fndds.drop(columns=['ingredient_desc']).drop_duplicates(subset='food_id')
food_fndds = pd.merge(df_fndds, ingredient_concat, on='food_id')

# Step 2: Concatenate Food Description, WWEIA Description, and Ingredient Description
food_fndds['combined_desc'] = food_fndds[['food_desc', 'WWEIA_desc', 'ingredient_desc']].agg(' '.join, axis=1)

# Step 3: Convert Combined Descriptions into BERT Embeddings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

food_fndds['bert_embedding'] = food_fndds['combined_desc'].apply(get_bert_embedding)

# Convert the embeddings to a DataFrame
embeddings = np.vstack(food_fndds['bert_embedding'].values)
embeddings_df = pd.DataFrame(embeddings, index=food_fndds['food_id'])

# Step 4: Apply PCA to Reduce Dimensionality to 50
pca = PCA(n_components=50)
reduced_embeddings = pca.fit_transform(embeddings)
reduced_embeddings_df = pd.DataFrame(reduced_embeddings, index=food_fndds['food_id'])

# Result: Table with food_id as index and PCA-reduced embeddings as columns
reduced_embeddings_df.columns = [f'PC{i+1}' for i in range(50)]

In [5]:
# Step 1: Standardize and Normalize Nutrition Vectors
nutrition_vectors = df_nutrition.iloc[:, 1:17].values  # Assuming the first column is food_id

# Standardize (z-score normalization)
scaler = StandardScaler()
standardized_nutrition = scaler.fit_transform(nutrition_vectors)

# Normalize (min-max scaling)
min_max_scaler = MinMaxScaler()
normalized_nutrition = min_max_scaler.fit_transform(standardized_nutrition)

# Convert to DataFrame
nutrition_df = pd.DataFrame(normalized_nutrition, columns=df_nutrition.columns[1:17])
nutrition_df.insert(0, 'food_id', df_nutrition['food_id'])

# Step 2: Merge with Reduced Embeddings
df_nutrition_vector = pd.merge(nutrition_df, reduced_embeddings_df, on='food_id', how='left')
df_nutrition_vector.fillna(0, inplace=True)

In [6]:
# Here we created two tables with only health tags. This will be used later for node features for health loss downstream. 
df_user_copy = df_user.copy()
nutrition_columns = df_nutrition.columns[17:].tolist()
# Create new columns in df_user_copy if they are missing
for nutrition in nutrition_columns:
    user_col = 'user_' + nutrition
    if user_col not in df_user_copy.columns:
        df_user_copy[user_col] = 0

# Reorder columns
ordered_columns = ['SEQN'] + ['user_' + col for col in nutrition_columns]
df_user_tags = df_user_copy[ordered_columns]
df_nutrition_tags = df_nutrition[['food_id'] + nutrition_columns]

In [8]:
df_main = pd.merge(df_user_food, df_user, on='SEQN', how='left')
df_main = pd.merge(df_main, df_nutrition, on='food_id', how='left')

In [9]:
"""
Filter 1: We only leave the adults here. This becuase nutrition suggestions differ for children and adults.
"""
df_main = df_main[df_main['age'] >= 18]

In [10]:
nutrition_list = ['calorie', 'carb', 'protein', 'sugar', 'fiber', 'saturated_fat', 'cholesterol',
                  'sodium', 'folic_acid', 'calcium', 'iron', 'potassium', 'vitamin_b12', 'vitamin_c', 
                  'phosphorus', 'vitamin_d']
macro_nutrition_list = ['calorie', 'carb', 'protein', 'fiber', 'saturated_fat', 'cholesterol', 'sugar']

In [11]:
def benchmark_construction(df_main, nutrition_list):
    # Here we match the nutrition tags with the user tags for statistical analysis.
    df_anal = pd.DataFrame(df_main[['SEQN', 'food_id']])
    df_anal['total_match'] = 0
    df_anal['total_opposite'] = 0
    for nutrition in nutrition_list:
        user_high = 'user_high_' + nutrition
        user_low = 'user_low_' + nutrition
        food_high = 'high_' + nutrition
        food_low = 'low_' + nutrition

        if user_high in df_main.columns:
            df_anal[nutrition+'_high_matching'] = (df_main[user_high] & df_main[food_high]).astype(int)
            df_anal[nutrition+'_high_opposite'] = (df_main[user_high] & df_main[food_low]).astype(int)
            df_anal['total_match'] += df_anal[nutrition+'_high_matching']
            df_anal['total_opposite'] += df_anal[nutrition+'_high_opposite']
        if user_low in df_main.columns:
            df_anal[nutrition+'_low_matching'] = (df_main[user_low] & df_main[food_low]).astype(int)
            df_anal[nutrition+'_low_opposite'] = (df_main[user_low] & df_main[food_high]).astype(int)
            df_anal['total_match'] += df_anal[nutrition+'_low_opposite']
            df_anal['total_opposite'] += df_anal[nutrition+'_low_matching']

    df_anal['clean_score'] = df_anal['total_match'] - df_anal['total_opposite']

    # we only keep users who have consumed more than 10 healthy dishes.
    df_summary = df_anal[df_anal['clean_score'] > 0].groupby('SEQN')[['total_match', 'total_opposite', 'clean_score']].sum()
    df_summary['dish_count'] = df_anal[df_anal['clean_score'] > 0].groupby('SEQN')['food_id'].count()
    df_summary = df_summary.loc[(df_summary['dish_count'] > 10)].sort_values('clean_score', ascending=False)

    # Retrieve the valid user list. 
    valid_SEQN_set = df_summary.index
    df_anal = df_anal[df_anal['SEQN'].isin(valid_SEQN_set)]

    # Remap the rest features back.
    df_main = pd.merge(df_main[['SEQN', 'food_id']].drop_duplicates(), df_anal, on=['SEQN', 'food_id'], how='right')

    return df_main, df_summary

### Benchmark 

In [12]:
df_macro, df_macro_summary = benchmark_construction(df_main, macro_nutrition_list)
df_all, df_all_summary = benchmark_construction(df_main, nutrition_list)

In [13]:
# The number of healthy user-food interactions
print(df_macro_summary['dish_count'].sum())
# The unique food items
print(df_macro['food_id'].nunique())
# The total number of user-food interactions
print(len(df_macro))
# The number of unique users
print(df_macro['SEQN'].nunique())

122009
6769
314224
8170


In [14]:
# The number of healthy user-food interactions
print(df_all_summary['dish_count'].sum())
# The unique food items
print(df_all['food_id'].nunique())
# The total number of user-food interactions
print(len(df_all))
# The number of unique users
print(df_all['SEQN'].nunique())

207949
7516
488223
13282


In [15]:
df_all.to_csv('../processed_data/raw_benchmark.csv', index=False)
df_macro.to_csv('../processed_data/raw_benchmark_macro.csv', index=False)

In [19]:
def graph_construction(df_main, df_user_vector, df_nutrition_vector, df_user_tags, df_nutrition_tags, output_path, macro_only=False):

    # Get unique user and food IDs
    user_ids = df_main['SEQN'].unique()
    food_ids = df_main['food_id'].unique()

    # Create node features
    user_features = torch.tensor(user_ids, dtype=torch.long)
    food_features = torch.tensor(food_ids, dtype=torch.long)

    # Create edge indices
    user_indices = torch.tensor(df_main['SEQN'].map(lambda x: np.where(user_ids == x)[0][0]), dtype=torch.long)
    food_indices = torch.tensor(df_main['food_id'].map(lambda x: np.where(food_ids == x)[0][0]), dtype=torch.long)
    # Create edge indices
    edge_index = torch.stack([user_indices, food_indices], dim=0)

    # Create edge labels
    clean_scores = df_main['clean_score']
    clean_scores = df_main[df_main['clean_score'] > 0]['clean_score']
    
    # Create edge labels
    edge_labels = torch.stack([user_indices[clean_scores.index], food_indices[clean_scores.index]], dim=0)
    
    graph = HeteroData()
    user_feature_vectors = df_user_vector.drop_duplicates().set_index('SEQN').loc[user_ids].values
    food_feature_vectors = df_nutrition_vector.drop_duplicates().set_index('food_id').loc[food_ids].values
    print(user_feature_vectors.shape)
    graph['user'].x = torch.tensor(user_feature_vectors, dtype=torch.float)
    graph['food'].x = torch.tensor(food_feature_vectors, dtype=torch.float)

    graph['user'].node_id = user_features
    graph['food'].node_id = food_features
    # Add num_nodes attribute to every node type
    graph['user'].num_nodes = len(user_ids)
    graph['food'].num_nodes = len(food_ids)
    graph[('user', 'eats', 'food')].edge_index = edge_index
    graph[('user', 'eats', 'food')].edge_label_index = edge_labels

    macro_nutrition_length = 14  # 7 macro nutrition types, each with 'low' and 'high'
    user_tag_features = []
    for user_id in user_ids:
        if user_id in df_user_tags['SEQN'].values:
            user_tag_vector = torch.tensor(df_user_tags[df_user_tags['SEQN'] == user_id].iloc[0, 1:].values, dtype=torch.float)
            if macro_only:
                user_tag_vector = user_tag_vector[:macro_nutrition_length]
            user_tag_features.append(user_tag_vector)
        else:
            raise ValueError(f'User {user_id} does not have any health tags.')
    graph['user'].tags = torch.stack(user_tag_features)

    food_tag_features = []
    for food_id in food_ids:
        if food_id in df_nutrition_tags['food_id'].values:
            food_tag_vector = torch.tensor(df_nutrition_tags[df_nutrition_tags['food_id'] == food_id].iloc[0, 1:].values, dtype=torch.float)
            if macro_only:
                food_tag_vector = food_tag_vector[:macro_nutrition_length]
            food_tag_features.append(food_tag_vector)
        else:
            raise ValueError(f'Food {food_id} does not have any nutrition tags.')
    graph['food'].tags = torch.stack(food_tag_features)

    torch.save(graph, output_path)
    return graph

In [20]:
graph_all = graph_construction(df_all, df_user_vector, df_nutrition_vector, df_user_tags, df_nutrition_tags, '../processed_data/benchmark_all.pt')
graph_macro = graph_construction(df_macro, df_user_vector, df_nutrition_vector, df_user_tags, df_nutrition_tags, '../processed_data/benchmark_macro.pt', macro_only=True)

(13282, 38)
(8170, 38)
