In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import numpy as np
import os

# Every figure below is exported under ../reports/figures (a path relative to
# this notebook), so the directory is created up front. exist_ok=True makes
# the call a no-op when the directory is already there.
os.makedirs(name='../reports/figures', exist_ok=True)

## Load Data
Load the diet, obesity, diabetes, income-group, and GDP datasets from the processed and raw data directories.

In [None]:
print("Loading data...")
# All paths are relative to the notebook's location. The files are read in the
# order listed and unpacked into one DataFrame per source.
# NOTE(review): the diet file name ends in '.csv.csv' — confirm it matches the
# file on disk.
diet_df, obesity_df, diabetes_df, income_df, gdp_df = map(
    pd.read_csv,
    (
        '../data/processed/cleaned-diet-compositions.csv.csv',
        '../data/processed/cleaned-obesity-rates.csv',
        '../data/processed/cleaned-diabetes-prevalence.csv',
        '../data/raw/Income-category-wise-countries.csv',
        '../data/processed/cleaned-GDP-countries.csv',
    ),
)

## Preprocessing
Clean and prepare the dataframes for analysis: standardise the country column name, derive the year and rate columns, and reshape GDP to long format.

In [None]:
# --- Preprocessing ---
# Every source ends up keyed by a 'Country' column so the later merges line up.

# 1. Diet data
# When 'year' was read as text (object dtype) it is parsed as a datetime and
# only the calendar year is kept; otherwise the column is copied unchanged.
year_is_text = diet_df['year'].dtype == 'object'
diet_df['Year_Int'] = pd.to_datetime(diet_df['year']).dt.year if year_is_text else diet_df['year']
diet_df = diet_df.rename(columns={'entity': 'Country'})
# Everything that is not a key/year column is treated as a nutrient measure.
key_columns = ['Country', 'year', 'Year_Int']
nutrient_cols = [name for name in diet_df.columns if name not in key_columns]

# 2. Income / region data
# Lookup table Country -> (Region, IncomeGroup); rows missing any of the three
# fields are discarded.
income_df = income_df.rename(columns={'TableName': 'Country'})
country_meta = income_df.loc[:, ['Country', 'Region', 'IncomeGroup']].dropna()

# 3. GDP data
# Wide -> long: every column other than the two identifier columns becomes a
# (Year, GDP) row. Year labels that cannot be parsed as numbers become NaN.
gdp_melted = (
    gdp_df
    .melt(id_vars=['Country Name', 'Country Code'], var_name='Year', value_name='GDP')
    .assign(Year=lambda frame: pd.to_numeric(frame['Year'], errors='coerce'))
    .rename(columns={'Country Name': 'Country'})
)

# 4. Obesity data
# Non-numeric entries in 'Value' are coerced to NaN rather than raising.
obesity_df = (
    obesity_df
    .rename(columns={'Area': 'Country'})
    .assign(Obesity_Rate=lambda frame: pd.to_numeric(frame['Value'], errors='coerce'))
)

# 5. Diabetes data
# Single rate per row: the unweighted mean of the 'Men' and 'Women' columns
# (NaN if either is missing).
diabetes_df['Diabetes_Rate'] = diabetes_df['Men'].add(diabetes_df['Women']).div(2)

## Figure 1: Global Nutrient Trends

In [None]:
# --- Figure 1: Global Nutrient Trends ---
print("Generating Figure 1 (Global Trends)...")

# Unweighted mean of every nutrient column over all rows sharing a year, then
# reshaped to long form (one row per year/nutrient pair) so each nutrient can
# be drawn as its own coloured line.
global_trends = diet_df.groupby('Year_Int')[nutrient_cols].agg('mean').reset_index()
global_melted = pd.melt(
    global_trends,
    id_vars='Year_Int',
    var_name='Nutrient',
    value_name='Consumption',
)

fig1 = px.line(
    global_melted,
    x='Year_Int',
    y='Consumption',
    color='Nutrient',
    title='Global Average Nutrient Consumption Over Time',
    labels={'Year_Int': 'Year', 'Consumption': 'Avg Consumption (kcal/capita/day)'},
)
fig1.show()
# Static PNG copy of the figure for the report.
fig1.write_image("../reports/figures/global_nutrient_trends.png", width=1000, height=600)

## Figure 2: GDP vs Obesity

In [None]:
# --- Figure 2: GDP vs Obesity (New) ---
print("Generating Figure 2 (GDP vs Obesity)...")
# Pair each GDP observation with the obesity observation for the same country
# and year; the inner join keeps only country-years present in both sources.
gdp_obesity = pd.merge(gdp_melted, obesity_df, on=['Country', 'Year'], how='inner')
# Attach Region / IncomeGroup. The left join keeps countries that have no
# metadata row (their Region is NaN).
gdp_obesity = pd.merge(gdp_obesity, country_meta, on='Country', how='left')

# Pick the latest year for which BOTH measures are actually present. Rows
# missing either value are dropped *before* taking the max; otherwise a
# trailing year that only has gaps could be selected and the scatter plot
# would come out empty.
complete_rows = gdp_obesity.dropna(subset=['GDP', 'Obesity_Rate'])
latest_year = complete_rows['Year'].max()
plot_data = complete_rows[complete_rows['Year'] == latest_year]

# 'Year' may be stored as a float; format it as an integer so the title reads
# "2016" rather than "2016.0". Falls back to 'n/a' when no complete row exists.
year_label = int(latest_year) if pd.notna(latest_year) else 'n/a'

fig2 = px.scatter(plot_data, x='GDP', y='Obesity_Rate', color='Region', hover_name='Country',
                 title=f'GDP per Capita vs. Obesity Rate ({year_label})',
                 labels={'GDP': 'GDP per Capita (US$)', 'Obesity_Rate': 'Obesity Rate (%)'},
                 log_x=True) # Log scale for GDP often looks better
fig2.show()
# Static PNG copy of the figure for the report.
fig2.write_image("../reports/figures/gdp_vs_obesity.png", width=800, height=600)

## Figure 3: Regional Diet Composition

In [None]:
# --- Figure 3: Regional Diet Composition (New) ---
print("Generating Figure 3 (Regional Diets)...")

# Tag every diet row with its region; the inner join discards countries that
# have no entry in the metadata table.
diet_region = diet_df.merge(country_meta, on='Country', how='inner')

# Restrict to the most recent year present after the join.
latest_diet_year = diet_region['Year_Int'].max()
is_latest = diet_region['Year_Int'].eq(latest_diet_year)
diet_latest = diet_region.loc[is_latest]

# Unweighted per-region mean of each nutrient, reshaped to long form so the
# nutrients become the colour segments of each region's bar.
region_means = diet_latest.groupby('Region')[nutrient_cols].agg('mean').reset_index()
region_melted = pd.melt(
    region_means,
    id_vars='Region',
    var_name='Nutrient',
    value_name='Consumption',
)

fig3 = px.bar(
    region_melted,
    x='Region',
    y='Consumption',
    color='Nutrient',
    title=f'Average Diet Composition by Region ({latest_diet_year})',
    labels={'Consumption': 'Avg Consumption (kcal/capita/day)'},
)
fig3.show()
# Static PNG copy of the figure for the report.
fig3.write_image("../reports/figures/regional_diet_composition.png", width=1000, height=600)

## Figure 4: Correlation Heatmap

In [None]:
# --- Figure 4: Correlation Heatmap ---
print("Generating Figure 4 (Correlation)...")

# Build one table with diet, obesity and diabetes values per country-year.
# Diet rows are matched on Year_Int against the obesity 'Year'; the diabetes
# table is then joined on the resulting 'Country'/'Year' pair. Both joins are
# inner, so only country-years present in all three sources survive.
# merged_all is reused by the modelling cells further down.
obesity_slice = obesity_df[['Country', 'Year', 'Obesity_Rate']]
diabetes_slice = diabetes_df[['Country', 'Year', 'Diabetes_Rate']]
merged_all = (
    diet_df
    .merge(obesity_slice, left_on=['Country', 'Year_Int'], right_on=['Country', 'Year'], how='inner')
    .merge(diabetes_slice, on=['Country', 'Year'], how='inner')
)

# Full correlation matrix, then keep only the nutrient-vs-outcome corner,
# ordered from the strongest positive obesity correlation downwards.
outcome_cols = ['Obesity_Rate', 'Diabetes_Rate']
corr_cols = nutrient_cols + outcome_cols
corr_matrix = merged_all[corr_cols].corr()
outcome_corr = corr_matrix.loc[nutrient_cols, outcome_cols].sort_values(by='Obesity_Rate', ascending=False)

fig4 = px.imshow(
    outcome_corr,
    text_auto=True,
    aspect='auto',
    title='Correlation: Nutrients vs. Health Outcomes',
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
)
fig4.show()
# Static PNG copy of the figure for the report.
fig4.write_image("../reports/figures/correlation_heatmap.png", width=800, height=600)

## Figure 5: Predictive Power

In [None]:
# --- Figure 5: Predictive Power ---
print("Generating Figure 5 (Predictive Power)...")
# Fit only on rows where every nutrient and the target are present.
# Obesity_Rate was produced with errors='coerce', so it can contain NaN, and
# LinearRegression.fit raises on NaN input. When nothing is missing this
# selects exactly the same rows as before.
model_rows = merged_all.dropna(subset=nutrient_cols + ['Obesity_Rate'])
X = model_rows[nutrient_cols]
y = model_rows['Obesity_Rate']

# Standardise the features so the fitted coefficients are on a common scale
# and can be compared across nutrients.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
model = LinearRegression()
model.fit(X_scaled, y)

# One coefficient per nutrient, sorted ascending so the horizontal bar chart
# runs from the most negative to the most positive.
coef_df = pd.DataFrame({'Nutrient': nutrient_cols, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values('Coefficient', ascending=True)

fig5 = px.bar(coef_df, x='Coefficient', y='Nutrient', orientation='h',
             title='Predictive Power for Obesity (Standardized Coefficients)',
             color='Coefficient', color_continuous_scale='RdBu_r')
fig5.show()
# Static PNG copy of the figure for the report.
fig5.write_image("../reports/figures/predictive_power_obesity.png", width=800, height=500)

## Figure 6: Forecast

In [None]:
# --- Figure 6: Forecast ---
print("Generating Figure 6 (Forecast)...")
# Two-stage projection:
#   1. extrapolate each nutrient's yearly average with a straight-line trend;
#   2. feed the projected nutrient values into a nutrients -> obesity regression.

# Forecast horizon (inclusive). The figure title is built from these bounds so
# it always matches the years that are actually plotted.
forecast_start, forecast_end = 2020, 2040

# Forecast Diet
global_history = merged_all.groupby('Year')[nutrient_cols].mean().reset_index()
future_years = np.arange(forecast_start, forecast_end + 1).reshape(-1, 1)
future_df = pd.DataFrame({'Year': future_years.flatten()})

for nutrient in nutrient_cols:
    # Skip years in which this nutrient has no average at all; LinearRegression
    # cannot be fitted on NaN targets.
    observed = global_history.dropna(subset=[nutrient])
    model_nut = LinearRegression()
    model_nut.fit(observed['Year'].values.reshape(-1, 1), observed[nutrient].values)
    future_df[nutrient] = model_nut.predict(future_years)

# Forecast Disease
# Fit on complete rows only: Obesity_Rate was produced with errors='coerce'
# and may contain NaN, which LinearRegression.fit rejects.
train_rows = merged_all.dropna(subset=nutrient_cols + ['Obesity_Rate'])
model_dis = LinearRegression()
model_dis.fit(train_rows[nutrient_cols], train_rows['Obesity_Rate'])
future_predictions = model_dis.predict(future_df[nutrient_cols])

# Observed yearly average, drawn alongside the projection for context.
hist_avg = merged_all.groupby('Year')['Obesity_Rate'].mean().reset_index()

fig6 = go.Figure()
fig6.add_trace(go.Scatter(x=hist_avg['Year'], y=hist_avg['Obesity_Rate'], name='Historical', line=dict(color='blue')))
fig6.add_trace(go.Scatter(x=future_df['Year'], y=future_predictions, name='Forecast', line=dict(color='red', dash='dash')))
fig6.update_layout(title=f'Projected Global Obesity Rate ({forecast_start}-{forecast_end})', xaxis_title='Year', yaxis_title='Obesity Rate (%)')
fig6.show()
# Static PNG copy of the figure for the report.
fig6.write_image("../reports/figures/forecast_obesity_2040.png", width=1000, height=600)

print("Done!")