In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import warnings

# Notebook-wide settings: display every DataFrame column and silence all warnings.
pd.options.display.max_columns = None
warnings.simplefilter('ignore')

print("Libraries imported.")

Libraries imported.


## 1. Data Loading & Integration
We need to merge three different datasets. We will standardize the Country and Year columns to ensure a clean merge.

In [2]:
# Load Datasets (paths are relative to this notebook's directory)
diet_df = pd.read_csv('../../data/processed/cleaned-diet-compositions.csv.csv')
obesity_df = pd.read_csv('../../data/processed/cleaned-obesity-rates.csv')
diabetes_df = pd.read_csv('../../data/processed/cleaned-diabetes-prevalence.csv')

# --- Preprocessing Diet Data ---
# Build the 'Year' merge key. A numeric column is used as-is; anything else
# (object strings, pandas' dedicated string dtype, datetimes) is parsed as a
# date and reduced to its calendar year. Testing only for dtype == 'object'
# would let non-object string/datetime years through unparsed, and the merge
# on 'Year' against the other tables would then fail or silently match nothing.
if pd.api.types.is_numeric_dtype(diet_df['year']):
    diet_df['Year'] = diet_df['year']
else:
    diet_df['Year'] = pd.to_datetime(diet_df['year']).dt.year
# Align the country key name and drop the original 'year' column to avoid confusion
diet_df = (
    diet_df
    .rename(columns={'entity': 'Country'})
    .drop(columns=['year'])
)

# --- Preprocessing Obesity Data ---
obesity_df = obesity_df.rename(columns={'Area': 'Country'})
# Ensure 'Value' is numeric; unparseable entries become NaN rather than raising
obesity_df['Obesity_Rate'] = pd.to_numeric(obesity_df['Value'], errors='coerce')
obesity_df = obesity_df[['Country', 'Year', 'Obesity_Rate']]

# --- Preprocessing Diabetes Data ---
# Calculate Average Diabetes Rate (Men + Women) / 2
diabetes_df['Diabetes_Rate'] = (diabetes_df['Men'] + diabetes_df['Women']) / 2
diabetes_df = diabetes_df[['Country', 'Year', 'Diabetes_Rate']]

# --- Merging ---
# Inner joins keep only the (Country, Year) pairs present in all three tables.
# Merge Diet + Obesity
merged_df = pd.merge(diet_df, obesity_df, on=['Country', 'Year'], how='inner')
# Merge + Diabetes
merged_df = pd.merge(merged_df, diabetes_df, on=['Country', 'Year'], how='inner')

# An empty result means the keys did not line up (e.g. differing country
# spellings or year dtypes); fail here instead of in the analysis cells below.
assert not merged_df.empty, "Merge produced no rows - check the Country/Year keys"

print(f"Merged Dataset Shape: {merged_df.shape}")
print(f"Year Range: {merged_df['Year'].min()} - {merged_df['Year'].max()}")
merged_df.head()

Merged Dataset Shape: (2020, 14)
Year Range: 2000 - 2013


Unnamed: 0,Country,cereals_and_grains,pulses,starchy_roots,sugar,oils_fats,meat,dairy_eggs,fruit_and_vegetables,other,alcoholic_beverages,Year,Obesity_Rate,Diabetes_Rate
0,Afghanistan,1334,10.0,22,31,112.0,93.0,125.0,54.0,9,0.0,2000,4.3,0.056497
1,Afghanistan,1345,17.0,21,30,96.0,79.0,82.0,62.0,5,0.0,2001,4.6,0.057947
2,Afghanistan,1396,10.0,20,34,95.0,78.0,131.0,54.0,8,0.0,2002,5.0,0.059601
3,Afghanistan,1451,9.0,31,42,105.0,73.0,119.0,52.0,10,0.0,2003,5.4,0.061341
4,Afghanistan,1490,7.0,24,54,140.0,77.0,117.0,48.0,10,0.0,2004,5.8,0.063123


## 2. Correlation Analysis
Let's look at the raw correlations between nutrient intake and disease rates.

In [3]:
# Features are every diet column other than the merge keys; the two disease
# rates are the outcomes they are correlated against.
nutrient_cols = diet_df.columns[~diet_df.columns.isin(['Country', 'Year'])].tolist()
outcome_cols = ['Obesity_Rate', 'Diabetes_Rate']
analysis_cols = [*nutrient_cols, *outcome_cols]

# Pairwise correlations over all nutrient and outcome columns
corr_matrix = merged_df.loc[:, analysis_cols].corr()

# Keep only the nutrient-by-outcome slice, strongest obesity correlation first
outcome_corr = (
    corr_matrix
    .loc[nutrient_cols, outcome_cols]
    .sort_values(by='Obesity_Rate', ascending=False)
)

# Heatmap on a fixed [-1, 1] colour scale so red/blue intensity is comparable
fig = px.imshow(
    outcome_corr,
    text_auto=True,
    aspect='auto',
    title='<b>Correlation: Nutrients vs. Health Outcomes</b>',
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
)
fig.update_layout(height=600)
fig.show()

### Interpretation
- **Positive Correlation (Red)**: Nutrients that increase as disease rates increase (Potential risk factors).
- **Negative Correlation (Blue)**: Nutrients that decrease as disease rates increase (Potential protective factors).

## 3. Predictive Modeling (Linear Regression)
We will use a simple Linear Regression model to quantify the "predictive power" (coefficient magnitude) of each nutrient.

In [4]:
def analyze_predictive_power(target_col):
    """Fit a linear regression of one outcome on the standardized nutrient columns.

    Reads the notebook-level ``merged_df`` and ``nutrient_cols``. Rows with a
    missing value in any nutrient column or in the target are excluded before
    fitting. Prints the in-sample R² and shows a horizontal bar chart of the
    coefficients.

    Parameters
    ----------
    target_col : str
        Name of the outcome column in ``merged_df`` to model
        (e.g. 'Obesity_Rate').

    Returns
    -------
    pandas.DataFrame
        One row per nutrient with columns 'Nutrient', 'Coefficient' and
        'Abs_Coefficient', sorted by descending absolute coefficient.

    Raises
    ------
    ValueError
        If no complete rows remain after dropping missing values.
    """
    print(f"\n--- Analyzing Predictive Power for {target_col} ---")

    # Prepare Data
    # LinearRegression rejects NaN input, and the data can contain it (e.g.
    # values coerced to NaN while loading), so fit on complete rows only.
    model_df = merged_df[nutrient_cols + [target_col]].dropna()
    n_dropped = len(merged_df) - len(model_df)
    if n_dropped:
        print(f"Dropped {n_dropped} rows with missing values before fitting.")
    if model_df.empty:
        raise ValueError(f"No complete rows available to model {target_col!r}")
    X = model_df[nutrient_cols]
    y = model_df[target_col]

    # Standardize Features (Important for comparing coefficients!)
    # Only X is scaled, so each coefficient is in units of the target per one
    # standard deviation of the nutrient.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train Model
    model = LinearRegression()
    model.fit(X_scaled, y)

    # Get Coefficients
    coef_df = pd.DataFrame({
        'Nutrient': nutrient_cols,
        'Coefficient': model.coef_
    })

    # Sort by absolute impact
    coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
    coef_df = coef_df.sort_values('Abs_Coefficient', ascending=False)

    # Model Performance
    # NOTE: scored on the same rows the model was fitted to, so this is an
    # in-sample R², not an estimate of out-of-sample accuracy.
    y_pred = model.predict(X_scaled)
    r2 = r2_score(y, y_pred)
    print(f"Model R² Score: {r2:.4f} (Explains {r2*100:.2f}% of variance)")

    # Plot
    fig = px.bar(coef_df, x='Coefficient', y='Nutrient', orientation='h',
                 title=f'<b>Predictive Power of Nutrients for {target_col} (Standardized Coefficients)</b>',
                 color='Coefficient', color_continuous_scale='RdBu_r')
    fig.update_layout(height=500)
    fig.show()

    return coef_df

# Obesity model: keep its ranked coefficient table
obesity_coefs = analyze_predictive_power(target_col='Obesity_Rate')

# Diabetes model: same analysis against the second outcome
diabetes_coefs = analyze_predictive_power(target_col='Diabetes_Rate')


--- Analyzing Predictive Power for Obesity_Rate ---
Model R² Score: 0.4938 (Explains 49.38% of variance)



--- Analyzing Predictive Power for Diabetes_Rate ---
Model R² Score: 0.5016 (Explains 50.16% of variance)


## Conclusion
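The coefficients above are **standardized effect sizes**: each one is the expected change in the outcome (in the outcome's own units) for a one-standard-deviation increase in that nutrient's intake. Only the features were standardized, so magnitudes are comparable across nutrients within one model, but not between the obesity and diabetes models.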
- A **large positive coefficient** implies that higher consumption of this nutrient is strongly associated with higher disease rates, holding other factors constant.
- A **large negative coefficient** implies an inverse relationship.

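*Note: Correlation does not imply causation. These relationships might be confounded by other factors like physical activity, genetics, or total caloric intake. The R² scores are also computed on the same data the models were fitted to, so they describe in-sample fit rather than out-of-sample predictive accuracy.*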