In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # Added numpy for percentile calculation

# ---------------------------------------------
# Setup
# ---------------------------------------------
# Global look-and-feel: seaborn's white grid theme, plus a default figure
# size of 16x10 inches for any figure that does not pass its own figsize.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (16, 10)})

# Assuming 'df' is your DataFrame loaded and columns cleaned

# ---------------------------------------------
# Define columns to explore
# ---------------------------------------------
# Columns to plot, in display order (one histogram per entry).
nutritional_cols = (
    # Per-recipe nutrient amounts (unit is encoded in the column suffix)
    [
        'calories_cal',
        'protein_g',
        'totalfat_g',
        'saturatedfat_g',
        'cholesterol_mg',
        'sodium_mg',
        'totalcarbohydrate_g',
        'dietaryfiber_g',
        'sugars_g',
    ]
    # Recipe metadata that is explored alongside the nutrients
    + ['duration', 'ingredients_sizes']
    # Score columns
    + ['who_score', 'fsa_score', 'nutri_score']
)

# Nicely formatted titles for plots
# Maps each column name to its subplot title. Titles are assembled from a
# short human-readable label plus a shared suffix; the score columns use the
# plain "Distribution" suffix without the percentile note.
titles = {
    **{
        column: f'{label} Distribution (99th Percentile)'
        for column, label in (
            ('calories_cal', 'Calories'),
            ('protein_g', 'Protein'),
            ('totalfat_g', 'Total Fat'),
            ('saturatedfat_g', 'Saturated Fat'),
            ('cholesterol_mg', 'Cholesterol'),
            ('sodium_mg', 'Sodium'),
            ('totalcarbohydrate_g', 'Total Carbohydrates'),
            ('dietaryfiber_g', 'Dietary Fiber'),
            ('sugars_g', 'Sugar'),
            ('duration', 'Cooking Duration'),
            ('ingredients_sizes', 'Ingredient Sizes'),
        )
    },
    **{
        column: f'{label} Distribution'
        for column, label in (
            ('who_score', 'WHO Score'),
            ('fsa_score', 'FSA Score'),
            ('nutri_score', 'Nutritional Score'),
        )
    },
}

# ---------------------------------------------
# Plot all nutritional distributions in a grid (with percentile filtering)
# ---------------------------------------------
# Draw one histogram (with KDE overlay) per entry of `nutritional_cols` on a
# 3-column grid. Numeric columns are clipped at their 99th percentile so that
# a few extreme values do not flatten the rest of the distribution.
n = len(nutritional_cols)
# Ceiling of n / 3; never fewer than one row, because plt.subplots() rejects
# a grid with zero rows.
rows = max(1, (n + 2) // 3)

fig, axes = plt.subplots(rows, 3, figsize=(18, 5 * rows))
axes = axes.flatten()

# Score columns are still percentile-filtered like every other column, but
# they keep the automatic x-range instead of being pinned to [0, q99].
score_cols = ('who_score', 'fsa_score', 'nutri_score')

for i, col in enumerate(nutritional_cols):
    if col not in df.columns:
        axes[i].set_visible(False)  # Hide axis if column not found
        continue

    data_to_plot = df[col].dropna()  # Drop NaNs for calculation and plotting

    # Reset the cut-off for every column. Without this, a non-numeric or
    # empty column would either reuse the previous column's percentile as its
    # x-limit or raise NameError when it is the first column processed.
    q99 = None
    filtered_data = data_to_plot
    if pd.api.types.is_numeric_dtype(data_to_plot) and not data_to_plot.empty:
        q99 = np.percentile(data_to_plot, 99)
        if q99 > 0:
            filtered_data = data_to_plot[data_to_plot <= q99]
        else:
            # A non-positive percentile is not used as a cut-off: the data
            # stays unfiltered and no x-limit is applied below.
            q99 = None

    if filtered_data.empty:
        # Leave the axis visible but empty, and say why in the title.
        axes[i].set_title(f"{titles[col]}\n(No data after filtering)", fontsize=14)
    else:
        sns.histplot(filtered_data, bins=50, ax=axes[i], color='mediumseagreen', edgecolor='black', kde=True)
        axes[i].set_title(titles[col], fontsize=14)
        # Focus the view on the retained range, only when a cut-off was
        # actually computed and applied for this column.
        if q99 is not None and col not in score_cols:
            axes[i].set_xlim(0, q99)

    axes[i].set_xlabel(col.replace('_', ' ').title())
    axes[i].set_ylabel('Frequency')

# Hide the unused trailing cells of the grid. Indexing from `n` rather than
# from the leaked loop variable keeps this correct for an empty column list.
for j in range(n, len(axes)):
    axes[j].set_visible(False)

fig.suptitle('Nutritional Attribute Distributions (Filtered to 99th Percentile)', fontsize=20)
# Reserve the top 5% of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()