In [9]:
# Import Libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
import warnings

# Silence every warning category for the rest of the session so cell output
# stays uncluttered.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')
# Display all DataFrame columns instead of truncating wide frames.
pd.set_option('display.max_columns', None)

print("Libraries imported.")

Libraries imported.


## 1. Data Preparation
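We load the diet, obesity, and diabetes datasets and inner-join them on country and year, so only country-years present in all three sources are kept. The result is a single table of historical diet composition alongside disease rates.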

In [10]:
# Load Datasets
# NOTE(review): the diet file name ends in '.csv.csv' — confirm this matches the file on disk.
diet_df = pd.read_csv('../../data/processed/cleaned-diet-compositions.csv.csv')
obesity_df = pd.read_csv('../../data/processed/cleaned-obesity-rates.csv')
diabetes_df = pd.read_csv('../../data/processed/cleaned-diabetes-prevalence.csv')

# --- Preprocessing ---
# Diet: expose the join keys as 'Country' / 'Year', matching the other tables.
# Any non-numeric year column (text or datetime) is parsed and reduced to its
# calendar year. Testing "is numeric" rather than `dtype == 'object'` also
# covers text columns loaded with a dedicated string dtype and datetime
# columns, both of which would otherwise leave 'Year' non-numeric.
diet_year = diet_df['year']
if not pd.api.types.is_numeric_dtype(diet_year):
    diet_year = pd.to_datetime(diet_year).dt.year
diet_df = (
    diet_df
    .assign(Year=diet_year)
    .rename(columns={'entity': 'Country'})
    .drop(columns=['year'])
)

# Obesity: 'Area' holds the country name; 'Value' is coerced to a number, so
# any unparseable entry becomes NaN instead of raising.
obesity_df = (
    obesity_df
    .rename(columns={'Area': 'Country'})
    .assign(Obesity_Rate=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .loc[:, ['Country', 'Year', 'Obesity_Rate']]
)

# Diabetes: a single rate per row, taken as the unweighted mean of the
# 'Men' and 'Women' columns.
diabetes_df = (
    diabetes_df
    .assign(Diabetes_Rate=lambda df: (df['Men'] + df['Women']) / 2)
    .loc[:, ['Country', 'Year', 'Diabetes_Rate']]
)

# Merge All
# Inner joins keep only the (Country, Year) pairs present in all three tables.
merged_df = (
    diet_df
    .merge(obesity_df, on=['Country', 'Year'], how='inner')
    .merge(diabetes_df, on=['Country', 'Year'], how='inner')
)

# Define Features (Nutrients) and Targets
# Every diet column other than the two join keys is treated as a nutrient feature.
nutrient_cols = [col for col in diet_df.columns if col not in ['Country', 'Year']]
print(f"Data Merged. Historical Years: {merged_df['Year'].min()} - {merged_df['Year'].max()}")

Data Merged. Historical Years: 2000 - 2013


## 2. Step 1: Forecast Dietary Trends (2020-2040)
We will create a "Future Scenario" where current dietary trends continue linearly.
We will do this for the **Global Average** to get a general projection.

In [11]:
# Historical global mean of every nutrient, one row per year
global_history = merged_df.groupby('Year')[nutrient_cols].mean().reset_index()

# Forecast horizon, kept as a column vector because the regressor takes 2-D input
future_years = np.arange(2020, 2041).reshape(-1, 1)
future_df = pd.DataFrame({'Year': future_years.flatten()})

# Fit one straight-line trend per nutrient and extend it over the horizon
print("Forecasting Diet Composition...")
# The year matrix is the same for every nutrient, so build it once
X_hist = global_history['Year'].to_numpy().reshape(-1, 1)
for nutrient in nutrient_cols:
    trend = LinearRegression()
    trend.fit(X_hist, global_history[nutrient].to_numpy())
    future_df[nutrient] = trend.predict(future_years)

# Sanity check: plot the history and the extrapolation for a single nutrient
fig = go.Figure(data=[
    go.Scatter(
        x=global_history['Year'],
        y=global_history['sugar'],
        name='Historical Sugar',
    ),
    go.Scatter(
        x=future_df['Year'],
        y=future_df['sugar'],
        name='Forecasted Sugar',
        line=dict(dash='dash'),
    ),
])
fig.update_layout(
    title='Projected Global Sugar Consumption (Example)',
    xaxis_title='Year',
    yaxis_title='Consumption',
)
fig.show()

Forecasting Diet Composition...


## 3. Step 2 & 3: Train Disease Models & Predict Future Rates
Now we train a model to understand how Diet impacts Disease, and apply it to our forecasted diet.

In [12]:
def forecast_disease(target_col, title):
    """Fit a diet-to-disease linear model and plot its projection.

    Reads three notebook-level variables: ``merged_df`` (historical rows with
    nutrient and target columns), ``nutrient_cols`` (feature column names) and
    ``future_df`` (forecasted nutrients with a ``Year`` column).

    Args:
        target_col: Column of ``merged_df`` to model, e.g. ``'Obesity_Rate'``.
        title: Chart title prefix; the forecast year range is appended.

    Returns:
        None. Prints the in-sample R² and displays a Plotly figure.
    """
    # 1. Train Model (Historical Data)
    # LinearRegression rejects NaN, so fit only on rows where every feature
    # and the target are present.
    train_df = merged_df.dropna(subset=[*nutrient_cols, target_col])
    X = train_df[nutrient_cols]
    y = train_df[target_col]

    model = LinearRegression()
    model.fit(X, y)
    # Scored on the training rows themselves, so this is an in-sample fit measure.
    r2 = model.score(X, y)
    print(f"Model R² for {target_col}: {r2:.4f}")

    # 2. Predict Future (using Forecasted Diet)
    future_predictions = model.predict(future_df[nutrient_cols])
    forecast_years = future_df['Year'].to_numpy()

    # 3. Combine for Plotting
    # Historical per-year mean of the target across all merged rows
    hist_avg = merged_df.groupby('Year')[target_col].mean().reset_index()

    fig = go.Figure()

    # Historical Data
    fig.add_trace(go.Scatter(x=hist_avg['Year'], y=hist_avg[target_col],
                             name='Historical (Actual)', line=dict(color='blue')))

    # Forecast Data
    fig.add_trace(go.Scatter(x=forecast_years, y=future_predictions,
                             name='Forecast (Projected)', line=dict(color='red', dash='dash')))

    # Label the chart with the range that is actually plotted rather than a
    # hard-coded one, so the title stays correct if the horizon changes.
    horizon = f"{forecast_years.min()}-{forecast_years.max()}"
    fig.update_layout(title=f'<b>{title}: Historical vs. Projected ({horizon})</b>',
                      xaxis_title='Year', yaxis_title='Rate / Prevalence',
                      template='plotly_white', height=600)

    # Call out the 2030 and 2040 values; a milestone outside the forecast
    # horizon is skipped instead of raising IndexError.
    for milestone in (2030, 2040):
        at_milestone = future_predictions[forecast_years == milestone]
        if at_milestone.size == 0:
            continue
        value = at_milestone[0]
        fig.add_annotation(x=milestone, y=value, text=f"{milestone}: {value:.2f}",
                           showarrow=True, arrowhead=1)

    fig.show()

# Run Forecasts: one projection chart per disease target
forecast_jobs = [
    ('Obesity_Rate', 'Global Obesity Rate Projection'),
    ('Diabetes_Rate', 'Global Diabetes Prevalence Projection'),
]
for disease_col, chart_title in forecast_jobs:
    forecast_disease(disease_col, chart_title)

Model R² for Obesity_Rate: 0.4938


Model R² for Diabetes_Rate: 0.5016


## Summary of Projections
Based on the linear continuation of current dietary trends (e.g., rising sugar/meat consumption):
- **Obesity Rates** are projected to reach the values shown above by 2040.
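- **Diabetes Prevalence** follows a similar trajectory. Note that diet explains only about half of the variance in each model (in-sample R² ≈ 0.49 for obesity and ≈ 0.50 for diabetes), so the link to dietary factors is moderate rather than strong.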

*Disclaimer: This is a simplified projection assuming that the relationship between diet and disease remains constant and that dietary trends continue linearly without intervention.*