In [1]:
import os 
import json
import pandas as pd

In [14]:
user_items = pd.read_csv('/Users/vince/Salk/mCC_Analysis/data/compliant_user_items.csv')

# Load JSON file containing food corrections
with open('/Users/vince/Salk/mCC_Analysis/food_corrections.json', 'r') as f:
    food_corrections = json.load(f)

# Create a new column with corrected food names
user_items['corrected_food'] = user_items['parsing_result'].map(food_corrections)
user_items['corrected_food'] = user_items['corrected_food'].str.lower()

In [12]:
import pickle 

with open('food_embeddings.pkl', 'rb') as f:
    food_embeddings = pickle.load(f)

food_embeddings

Unnamed: 0,food_item,embedding
0,nespresso,"[-0.0067176544, -0.02788784, -0.012478107, 0.0..."
1,oatmeal,"[0.02143874, 0.021305107, 0.022809047, -0.0474..."
2,milk,"[0.034871925, 0.0069323634, -0.035317637, -0.0..."
3,eggplant,"[-0.0074746986, -0.072444454, -0.004489695, 0...."
4,lasagna,"[0.020851888, -0.050679404, 0.03552969, 0.0090..."
...,...,...
2658,asahi vegan beer,"[0.005335356, 0.0065996544, -0.025653338, 0.00..."
2659,eye of round roast,"[0.0687832, -0.020699274, 0.03731008, -0.00327..."
2660,whey hydrolysate isolate zero,"[0.042257965, -0.020719532, -0.069381446, -0.0..."
2661,barukas,"[-0.0714513, 0.007650819, 0.023063982, -0.0228..."


In [15]:
user_items['embedding'] = user_items['corrected_food'].map(food_embeddings.set_index('food_item')['embedding'].to_dict())

In [16]:
user_items

Unnamed: 0,pid,compliance_days_passed,food_type,original_logtime,log_date,time,compliance_date,parsing_result,corrected_food,embedding
0,alqt150211047,0,b,2021-10-28 09:45:59,2021-10-28,9.766389,2021-10-28,nespresso,nespresso,"[-0.0067176544, -0.02788784, -0.012478107, 0.0..."
1,alqt150211047,0,f,2021-10-28 09:45:59,2021-10-28,9.766389,2021-10-28,oatmeal,oatmeal,"[0.02143874, 0.021305107, 0.022809047, -0.0474..."
2,alqt150211047,0,b,2021-10-28 09:45:59,2021-10-28,9.766389,2021-10-28,milk,milk,"[0.034871925, 0.0069323634, -0.035317637, -0.0..."
3,alqt150211047,0,f,2021-10-28 11:57:00,2021-10-28,11.950000,2021-10-28,eggplant,eggplant,"[-0.0074746986, -0.072444454, -0.004489695, 0...."
4,alqt150211047,0,f,2021-10-28 11:57:00,2021-10-28,11.950000,2021-10-28,lasagna,lasagna,"[0.020851888, -0.050679404, 0.03552969, 0.0090..."
...,...,...,...,...,...,...,...,...,...,...
3184274,alqt230941256543,13,b,2023-09-21 16:32:00,2023-09-21,16.533333,2023-09-21,hibiscus tea,hibiscus tea,"[0.020282893, -0.030562764, -0.028167933, 0.01..."
3184275,alqt230941256543,13,f,2023-09-21 16:32:00,2023-09-21,16.533333,2023-09-21,blueberry,blueberry,"[0.015543392, -0.052315928, 0.0055276155, -0.0..."
3184276,alqt230941256543,13,f,2023-09-21 19:25:00,2023-09-21,19.416667,2023-09-21,baked chicken,baked chicken,"[-0.0056958185, -0.014435805, 0.038879912, -0...."
3184277,alqt230941256543,13,f,2023-09-21 19:25:00,2023-09-21,19.416667,2023-09-21,mashed potato,mashed potato,"[-0.0072039505, -0.0070065106, -0.008313367, -..."


In [6]:
user_sleep = pd.read_csv('/Users/vince/Salk/mCC_Analysis/data/compliant_user_sleep.csv', index_col=0)

In [1]:
user_sleep['enough_sleep'].value_counts()

NameError: name 'user_sleep' is not defined

# Matrix Factorization

In [52]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from typing import Tuple, List, Dict

def prepare_user_food_matrix(user_items_df):
    """
    Create a user-food matrix with frequency counts.
    
    Assumptions:
    - Each food item is treated as a distinct category
    - The value in the matrix represents how many times a user consumed that food
    - No semantic similarity between foods is considered at this stage
    """
    # Select relevant columns
    df = user_items_df[['pid', 'corrected_food']]
    
    # Get unique users and foods
    users = df['pid'].unique().tolist()
    foods = df['corrected_food'].unique().tolist()
    
    print(f"Matrix will have {len(users)} users and {len(foods)} food items")
    
    # Create mappings
    user_map = {user: i for i, user in enumerate(users)}
    food_map = {food: i for i, food in enumerate(foods)}
    
    # Create matrix coordinates
    rows = df['pid'].map(user_map).values
    cols = df['corrected_food'].map(food_map).values
    values = np.ones(len(df))  # Count occurrences
    
    # Create a sparse matrix
    from scipy.sparse import csr_matrix
    user_food_matrix = csr_matrix((values, (rows, cols)), 
                                 shape=(len(users), len(foods)))
    
    return user_food_matrix, users, foods
 
# Intersection of users
food_users = user_items['pid'].unique()
sleep_users = user_sleep['pid'].unique()

intersection = set(food_users).intersection(set(sleep_users))

user_items_df = user_items[user_items['pid'].isin(intersection)]

user_food_matrix, users, foods = prepare_user_food_matrix(user_items_df)

Matrix will have 20201 users and 2661 food items


In [53]:
def apply_matrix_factorization(matrix, n_components=20):
    """
    Apply Non-Negative Matrix Factorization to extract latent factors.
    
    Assumptions:
    - The number of components (n_components) represents underlying patterns
    - Non-negativity constraint makes sense for food consumption data
    - These latent factors will represent patterns like "breakfast foods", "protein foods", etc.
    """
    from sklearn.decomposition import NMF
    
    print(f"Applying NMF with {n_components} components")
    model = NMF(
        n_components=n_components,
        init='random',
        random_state=42,
        max_iter=200
    )
    
    user_factors = model.fit_transform(matrix)
    food_factors = model.components_
    
    print(f"User factors shape: {user_factors.shape}")
    print(f"Food factors shape: {food_factors.shape}")
    
    return user_factors, food_factors, model

user_factors, food_factors, model = apply_matrix_factorization(user_food_matrix)

Applying NMF with 20 components
User factors shape: (20201, 20)
Food factors shape: (20, 2661)


In [54]:
def extract_temporal_features(user_items_df):
    """
    Extract features about when users eat.
    
    Assumptions:
    - Time of day is divided into 4 bins (night, morning, afternoon, evening)
    - The percentage of meals in each time bin is a meaningful feature
    - Variance in eating times represents regularity/irregularity
    - Average time between meals captures eating frequency
    """
    # Select relevant columns
    df = user_items_df[['pid', 'time']]
    
    # Create time bins
    time_bins = [0, 6, 12, 18, 24]
    bin_labels = ['night', 'morning', 'afternoon', 'evening']
    df['time_bin'] = pd.cut(df['time'], bins=time_bins, labels=bin_labels)
    
    # Get unique users
    users = df['pid'].unique().tolist()
    
    # Initialize features DataFrame
    temporal_features = pd.DataFrame({'pid': users})
    
    # Calculate percentage of meals in each time bin
    for bin_label in bin_labels:
        bin_counts = df[df['time_bin'] == bin_label].groupby('pid').size()
        bin_counts = bin_counts.reindex(users).fillna(0)
        
        total_counts = df.groupby('pid').size()
        total_counts = total_counts.reindex(users).fillna(1)  # Avoid division by zero
        
        percentage = (bin_counts / total_counts * 100).values
        temporal_features[f'pct_{bin_label}_meals'] = percentage
    
    # Calculate variance in eating times
    time_variance = df.groupby('pid')['time'].var().reindex(users).fillna(0)
    temporal_features['time_variance'] = time_variance.values
    
    # Calculate average time between meals per day
    df_sorted = df.sort_values(['pid', 'time'])
    df_sorted['time_diff'] = df_sorted.groupby('pid')['time'].diff()
    avg_time_diff = df_sorted.groupby('pid')['time_diff'].mean().reindex(users).fillna(0)
    temporal_features['avg_time_between_meals'] = avg_time_diff.values
    
    print(f"Temporal features created for {len(users)} users")
    return temporal_features, users

temporal_features, users = extract_temporal_features(user_items_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_bin'] = pd.cut(df['time'], bins=time_bins, labels=bin_labels)


Temporal features created for 20201 users


In [55]:
def prepare_sleep_features(user_sleep_df):
    """
    Aggregate sleep data into user features.
    
    Assumptions:
    - Mean and standard deviation capture sleep patterns
    - Users with consistent sleep patterns will have low standard deviation
    - Missing values (e.g., for users with only one record) are filled with 0
    """
    # Select relevant columns
    df = user_sleep_df[['pid', 'sleep_time_decimal', 'sleep_duration_decimal', 'wakeup_time_decimal']]
    
    # Group by user and aggregate
    sleep_features = df.groupby('pid').agg({
        'sleep_time_decimal': ['mean', 'std'],
        'sleep_duration_decimal': ['mean', 'std'],
        'wakeup_time_decimal': ['mean', 'std']
    }).reset_index()
    
    # Flatten column names
    sleep_features.columns = ['pid'] + [
        f'{col[0]}_{col[1]}' for col in sleep_features.columns[1:]
    ]
    
    # Handle missing values
    sleep_features = sleep_features.fillna(0)
    
    users = sleep_features['pid'].tolist()
    print(f"Sleep features created for {len(users)} users")
    
    return sleep_features, users

sleep_features, users = prepare_sleep_features(user_sleep)

Sleep features created for 20201 users


In [56]:
def combine_features(user_factors, users_mf, temporal_features, sleep_features):
    """
    Combine all feature sets into a unified representation.
    """
    # Convert user factors to DataFrame
    user_factors_df = pd.DataFrame(
        user_factors, 
        columns=[f'factor_{i}' for i in range(user_factors.shape[1])]
    )
    user_factors_df['pid'] = users_mf  # This line is causing the error
    
    # Ensure users match between dataframes
    print(f"Users with food factors: {len(users_mf)}")
    print(f"Users with temporal features: {temporal_features['pid'].nunique()}")
    print(f"Users with sleep features: {sleep_features['pid'].nunique()}")
    
    # Merge with temporal features
    print("Combining features...")
    combined_df = user_factors_df.merge(temporal_features, on='pid', how='inner')
    print(f"After merging temporal: {combined_df.shape[0]} users")
    
    # Merge with sleep features
    combined_df = combined_df.merge(sleep_features, on='pid', how='inner')
    print(f"After merging sleep: {combined_df.shape[0]} users")
    
    print(f"Combined features for {combined_df.shape[0]} users with {combined_df.shape[1]-1} features each")
    
    return combined_df

combined_df = combine_features(user_factors, users, temporal_features, sleep_features)

Users with food factors: 20201
Users with temporal features: 20201
Users with sleep features: 20201
Combining features...
After merging temporal: 20201 users
After merging sleep: 20201 users
Combined features for 20201 users with 32 features each


In [57]:
def cluster_users(combined_df, n_clusters=5):
    """
    Cluster users based on combined features.
    
    Assumptions:
    - Features should be standardized before clustering
    - K-means is an appropriate algorithm for this data
    - The specified number of clusters will yield meaningful patterns
    - Silhouette score is a good metric for cluster quality
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score
    
    # Extract features (exclude pid)
    features = combined_df.drop('pid', axis=1).values
    
    # Standardize features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    
    # Apply K-means clustering
    print(f"Clustering into {n_clusters} clusters...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(features_scaled)
    
    # Evaluate clustering
    score = silhouette_score(features_scaled, labels)
    print(f"Silhouette score: {score:.4f}")
    
    return labels, score

labels, score = cluster_users(combined_df)

Clustering into 5 clusters...
Silhouette score: 0.0442


In [58]:
def find_optimal_clusters(combined_df, max_clusters=10):
    """
    Find optimal number of clusters using silhouette score.
    
    Assumptions:
    - The optimal number of clusters is between 2 and max_clusters
    - Silhouette score is a reliable metric for cluster quality
    - Higher silhouette score indicates better clustering
    """
    results = {}
    for n_clusters in range(2, max_clusters + 1):
        labels, score = cluster_users(combined_df, n_clusters=n_clusters)
        results[n_clusters] = score
        print(f"Clusters: {n_clusters}, Silhouette Score: {score:.4f}")
    
    optimal_clusters = max(results, key=results.get)
    print(f"Optimal number of clusters: {optimal_clusters}")
    
    return results, optimal_clusters

results, optimal_clusters = find_optimal_clusters(combined_df)

Clustering into 2 clusters...
Silhouette score: 0.0711
Clusters: 2, Silhouette Score: 0.0711
Clustering into 3 clusters...
Silhouette score: 0.0586
Clusters: 3, Silhouette Score: 0.0586
Clustering into 4 clusters...
Silhouette score: 0.0521
Clusters: 4, Silhouette Score: 0.0521
Clustering into 5 clusters...
Silhouette score: 0.0442
Clusters: 5, Silhouette Score: 0.0442
Clustering into 6 clusters...
Silhouette score: 0.0471
Clusters: 6, Silhouette Score: 0.0471
Clustering into 7 clusters...
Silhouette score: 0.0487
Clusters: 7, Silhouette Score: 0.0487
Clustering into 8 clusters...
Silhouette score: 0.0497
Clusters: 8, Silhouette Score: 0.0497
Clustering into 9 clusters...
Silhouette score: 0.0431
Clusters: 9, Silhouette Score: 0.0431
Clustering into 10 clusters...
Silhouette score: 0.0463
Clusters: 10, Silhouette Score: 0.0463
Optimal number of clusters: 2


In [61]:
optimal_clusters

2