# Clustering on Open Food Facts dataset

### Load data

In [None]:
# Imports libraries
from scripts import analyse_columns, clean
from scripts.kmeans import kmeans_clustering, plot_elbow_method
from scripts.plots import plot_clusters_2d, plot_cluster_sizes, plot_feature_relationships
from scripts.model_utils import load_model
from scripts.dimension_reduction import reduce_dimensions

import pandas as pd
import numpy as np
# import mlflow
from sklearn.preprocessing import StandardScaler
import umap
import matplotlib.pyplot as plt

# mlflow.autolog()

In [None]:
# Load the Open Food Facts export; the file is parsed as tab-separated text
path = "data/en.openfoodfacts.org.products.csv"

# A real tab character is used as the separator. The two-character string
# backslash + "t" is longer than one character, so pandas would interpret it
# as a regular expression and fall back to the slower Python parsing engine.
df = pd.read_csv(
    path,
    sep="\t",
    encoding="utf-8",
    on_bad_lines="skip",  # skip malformed rows instead of raising
    nrows=300000,  # read at most the first 300,000 data rows
)

In [None]:
# Preview the first rows of the raw dataset
head_rows = df.head()
print("\nFirst few rows:")
display(head_rows)

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
n_rows, n_cols = df.shape
print((n_rows, n_cols))

In [None]:
# Detailed summary statistics of the numeric columns (with 5th/95th percentiles)
numeric_dtypes = [np.number]
print("\nSummary statistics of numeric columns:")
print(df.describe(include=numeric_dtypes, percentiles=[.05, .25, .5, .75, .95]))

# Additional per-column statistics, all computed on the same numeric subset:
# skewness, kurtosis and the number of non-null values
numeric_df = df.select_dtypes(include=numeric_dtypes)
for heading, stat_name in (
    ("\nSkewness of numeric columns:", "skew"),
    ("\nKurtosis of numeric columns:", "kurtosis"),
    ("\nNumber of non-null values in numeric columns:", "count"),
):
    print(heading)
    print(getattr(numeric_df, stat_name)())

### Data Processing

In [None]:
# Keep only relevant columns (nutritional columns, i.e. names ending in '_100g')

# Every column that is not a per-100g value is flagged as irrelevant
irrelevant_columns = [col for col in df.columns if not col.endswith('_100g')]

# Additionally flag energy-kcal_100g and fat_100g as irrelevant by adding
# them to the list (they end in '_100g', so the filter above keeps them)
irrelevant_columns.extend(['energy-kcal_100g', 'fat_100g'])


# Clean dataset using the project's clean() helper
df_cleaned = clean.clean(df, irrelevant_columns=irrelevant_columns, missing_threshold=0.5)

# Display cleaned dataset info (the shape of the cleaned frame, not the raw one)
print("\nCleaned dataset shape:", df_cleaned.shape)
print("\nRemaining columns:")
print(df_cleaned.columns.tolist())


In [None]:
# Preview the first rows of the cleaned dataset
display(df_cleaned.head())

In [None]:
# Impute missing values with each column's median
# (the median is preferred over the mean because it is more robust to outliers)
column_medians = df_cleaned.median()
df_cleaned = df_cleaned.fillna(column_medians)

# Sanity check: total number of NaN cells left across the whole frame
print("\nNumber of NaN values remaining:")
remaining_nans = df_cleaned.isna().sum().sum()
print(remaining_nans)

# Show the first rows so the imputed values can be inspected
print("\nFirst few rows after handling missing values:")
imputed_head = df_cleaned.head()
display(imputed_head)

print("\nShape")
print(df_cleaned.shape)


In [None]:
# Scale features using StandardScaler
scaler = StandardScaler()
# fit_transform returns a bare array, so both the column names and the row
# index are restored from df_cleaned. Keeping the index matters: rows of
# df_scaled are later matched back to df_cleaned by index label, which only
# works if both frames carry the same labels.
df_scaled = pd.DataFrame(
    scaler.fit_transform(df_cleaned),
    columns=df_cleaned.columns,
    index=df_cleaned.index,
)

# Display first few rows of scaled data
print("First few rows of scaled data:")
display(df_scaled.head())

# Verify scaling - mean should be ~0 and std should be ~1
print("\nMean of scaled features:")
print(df_scaled.mean().round(2))
print("\nStandard deviation of scaled features:")
print(df_scaled.std().round(2))



In [None]:
# Remove outliers using IQR method
def remove_outliers(df: pd.DataFrame, factor: float = 1.5) -> pd.DataFrame:
    """Drop every row that has at least one value outside the IQR fences.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th percentiles of that
    column and ``IQR = Q3 - Q1``. A row is removed as soon as any one of its
    values lies strictly outside its column's fences.

    Note that a column whose IQR is 0 has both fences equal to its quartile
    value, so every row with a different value in that column is removed.

    Args:
        df: DataFrame whose columns are all numeric.
        factor: Multiplier applied to the IQR to position the fences.
            The default of 1.5 reproduces the conventional rule.

    Returns:
        The rows of ``df`` with all values inside the fences; the original
        index labels are preserved.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1

    # Per-column fences
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # True where a cell falls outside its column's fences
    outside = (df < lower_bound) | (df > upper_bound)

    # Keep only the rows without any out-of-fence cell
    return df[~outside.any(axis=1)]

# Remove outliers and store in new DataFrame
print("Shape before removing outliers:", df_scaled.shape)
df_scaled_no_outliers = remove_outliers(df_scaled)
print("Shape after removing outliers:", df_scaled_no_outliers.shape)
print(f"Removed {df_scaled.shape[0] - df_scaled_no_outliers.shape[0]} outliers")

# Keep the same rows in the unscaled data for consistency.
# The selection is positional: a boolean mask with one entry per row of
# df_scaled (which has the same row order as df_cleaned). A label-based
# lookup would be wrong if df_scaled's index labels differ from df_cleaned's.
kept_rows = df_scaled.index.isin(df_scaled_no_outliers.index)
df_cleaned = df_cleaned[kept_rows]


In [11]:
# Create cleaned dataset


### Feature Engineering

In [12]:
# Create new features


In [13]:
# Do PCA to visualize high-dimensional data


In [14]:
# Analyze feature importance and correlations


### Clustering Analysis

In [None]:
### Dimensionality Reduction
features = df_cleaned.values
print("Original data shape:", df_cleaned.shape)

# Reduce the feature matrix with the project's reduce_dimensions helper
# NOTE(review): the unscaled df_cleaned values are passed here, not
# df_scaled_no_outliers — confirm whether reduce_dimensions scales internally.
reduced_data = reduce_dimensions(features)
print("Reduced data shape:", reduced_data.shape)

# Scatter plot of the first two reduced components
first_axis = reduced_data[:, 0]
second_axis = reduced_data[:, 1]
plt.figure(figsize=(10, 6))
plt.scatter(first_axis, second_axis, alpha=0.5)
plt.title('UMAP Projection of the Data')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.show()

- #### K-Means

In [None]:
# Determine optimal number of clusters
# Calls the project helper plot_elbow_method (from scripts.kmeans) on the
# reduced data; presumably it plots a clustering cost for several candidate
# cluster counts — TODO confirm against the helper.
plot_elbow_method(reduced_data)

In [None]:
# Perform clustering
# kmeans_clustering (from scripts.kmeans) is called with only the reduced data,
# so any other settings it has keep their defaults. Its three return values are
# unpacked as the model, the cluster labels and a metrics object — the meaning
# is inferred from the names; verify against the helper.
kmeans, labels, metrics = kmeans_clustering(reduced_data)

In [None]:
# Plot clusters in the reduced space
# The labels come from the k-means step and are passed together with the
# reduced coordinates they were computed on.
plot_clusters_2d(reduced_data, labels, 
                title="K-means Clustering Results on UMAP Reduced Data")

In [None]:
# Show cluster sizes
# plot_cluster_sizes (from scripts.plots) receives only the label array.
plot_cluster_sizes(labels)

In [None]:
### Feature Analysis
# Feature relationship plots are drawn from the original features
# (df_cleaned), coloured by the labels obtained on the reduced data.
feature_names = list(df_cleaned.columns)
relationship_args = {
    "data": df_cleaned.values,  # original data, not reduced_data
    "labels": labels,
    "feature_names": feature_names,
    "n_features": 5,  # number of features to compare; adjust as needed
}
plot_feature_relationships(**relationship_args)

In [21]:
# DBSCAN clustering

In [22]:
# Gaussian Mixture Model (GMM) clustering


In [23]:
# Hierarchical clustering


### Cluster evaluation and comparison

In [24]:
# Compare clustering results using metrics (Silhouette score, Calinski-Harabasz index, Davies-Bouldin index)

In [25]:
# Visualize cluster comparisons (word clouds, ...)

In [26]:
# Analyse cluster characteristics and interpret results

### Visualization and Insights

In [27]:
# Various visualizations (cluster distribution plots, feature importance within clusters, pairplots with key features, heatmaps of cluster characteristics)


In [28]:
# Generate insights about food product groupings

In [29]:
# Identify patterns and trends in the clusters

### Conclusions and recommendations

In [30]:
# Summarize findings


In [31]:
# Compare strengths and weaknesses of different clustering methods


In [32]:
# Provide recommendations for practical applications


In [33]:
# Suggest potential areas for further analysis