In [None]:
# 1. Import Libraries
# pandas: Used for working with data tables (DataFrames)
# numpy: Used for fast numerical arrays and math
# sklearn (scikit-learn): A widely used library for Machine Learning in Python
# matplotlib: Used for creating static charts

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

print("Libraries imported successfully!")

In [None]:
# 2. Load Data
# Each dataset is a CSV file under the raw-data folder; the path is kept next to the read that uses it.

# Diabetes prevalence
diabetes_path = 'raw-data/Diabetes/NCD_RisC_Lancet_2016_DM_crude_countries (1).csv'
df_diabetes = pd.read_csv(diabetes_path)

# Obesity prevalence
obesity_path = 'raw-data/Obesity-Trends/FAOSTAT_data_en_11-5-2025.csv'
df_obesity = pd.read_csv(obesity_path)

# Diet composition
diet_path = 'raw-data/Diet-Compositions/Diet compositions by commodity categories - FAO (2017).csv'
df_diet = pd.read_csv(diet_path)

# Report (rows, columns) of every table so a truncated or wrong file is noticed early
print(
    "Data loaded.",
    f"Diabetes data shape: {df_diabetes.shape}",
    f"Obesity data shape: {df_obesity.shape}",
    f"Diet data shape: {df_diet.shape}",
    sep="\n",
)

In [None]:
# 3. Clean and Prepare Data
# Every table is reduced to a (Country, Year, values...) layout so the three can be joined later.

# Prepare Diabetes Data
# Average the crude prevalence over all rows that share a country and year,
# multiply by 100 to express it as a percentage, then give the columns short names.
diabetes_avg = (
    df_diabetes
    .groupby(['Country/Region/World', 'Year'])['Crude diabetes prevalence']
    .mean()
    .mul(100)
    .reset_index()
    .rename(columns={
        'Country/Region/World': 'Country',
        'Crude diabetes prevalence': 'Diabetes_Prevalence',
    })
)

# Prepare Obesity Data
# Keep only the three columns we need; 'Year' already has the right name.
obesity_clean = df_obesity[['Area', 'Year', 'Value']].rename(
    columns={'Area': 'Country', 'Value': 'Obesity_Prevalence'}
)

# Prepare Diet Data
# Food-group columns are recognised by the 'FAO' tag in their header
nutrient_cols = [header for header in df_diet.columns if 'FAO' in header]

# Rename columns for easier reading: keep the text before the first '(',
# trim it, and replace spaces with underscores
new_columns = ['Country', 'Year'] + [
    header.partition('(')[0].strip().replace(' ', '_') for header in nutrient_cols
]
diet_clean = df_diet[['Entity', 'Year', *nutrient_cols]].set_axis(new_columns, axis=1)

print("Data cleaned.")

In [None]:
# 4. Merge Datasets
# Inner-join the three tables on ('Country', 'Year'), so only country/year pairs
# present in all of them survive, then drop any row that still has a missing value.
join_keys = ['Country', 'Year']
merged = (
    diabetes_avg
    .merge(obesity_clean, how='inner', on=join_keys)
    .merge(diet_clean, how='inner', on=join_keys)
    .dropna()
)

print(f"Merged dataset shape: {merged.shape}")
merged.head()

In [None]:
# 5. Prepare for Machine Learning
# Features (X): every diet component, plus the Year
# Targets (y): Diabetes and Obesity prevalence

# Identifier and target columns must not end up among the diet features
non_feature_cols = {'Country', 'Year', 'Diabetes_Prevalence', 'Obesity_Prevalence'}
feature_cols = [name for name in merged.columns if name not in non_feature_cols]

# 'Year' is appended as the last column so the model learns the time trend
X = merged[feature_cols].assign(Year=merged['Year'])

y_diabetes, y_obesity = merged['Diabetes_Prevalence'], merged['Obesity_Prevalence']

print(f"Features used: {feature_cols}")

In [None]:
# 6. Train Linear Regression Model (Diabetes)

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_diabetes,
    test_size=0.2,
    random_state=42,
)

# Fit the linear model on the training rows only
model_diabetes = LinearRegression().fit(X_train, y_train)

# Evaluate: R2 of the predictions on the held-out rows
score = r2_score(y_test, model_diabetes.predict(X_test))
print(f"Diabetes Model Accuracy (R2 Score): {score:.2f}")

In [None]:
# 7. Train Linear Regression Model (Obesity)

# Same 80/20 split settings as the diabetes model, with obesity prevalence as the target
X_train_ob, X_test_ob, y_train_ob, y_test_ob = train_test_split(
    X,
    y_obesity,
    test_size=0.2,
    random_state=42,
)

# Fit the linear model on the training rows only
model_obesity = LinearRegression().fit(X_train_ob, y_train_ob)

# Evaluate: R2 of the predictions on the held-out rows
score_ob = r2_score(y_test_ob, model_obesity.predict(X_test_ob))
print(f"Obesity Model Accuracy (R2 Score): {score_ob:.2f}")

In [None]:
# 8. Predict Future (2030-2040)
# To predict the future we need an assumption about what the diet will look like in 2030-2040.
# For simplicity we freeze the diet at the country's most recent year in the merged data
# and only advance the 'Year' feature, so the models extrapolate their fitted time trend.
# This asks: "If we keep eating like we did in the last observed year, what will happen by 2040?"

future_years = range(2030, 2041)

# Let's pick a specific country to visualize, e.g., 'United States'
country = 'United States'

country_rows = merged[merged['Country'] == country]
if country_rows.empty:
    # The inner merges keep only country names spelled identically in every source,
    # so fail with a clear message instead of an opaque IndexError from .iloc[-1].
    raise ValueError(
        f"No rows for {country!r} in the merged data; "
        "check the spelling against merged['Country'].unique()."
    )

# Select the latest year explicitly instead of relying on the row order of `merged`.
# The stable sort keeps the original order among rows that share a year.
country_data = country_rows.sort_values('Year', kind='stable').iloc[-1]  # Last known data row

print(f"Predicting for {country} (diet held at its {country_data['Year']} values)...")

# Build one hypothetical row per future year: identical diet values, only 'Year' differs.
# The row Series has object dtype (it also holds the country name), so each value is cast
# to float. Column order matches the training matrix: diet features first, 'Year' last.
X_future = pd.DataFrame(
    {col: float(country_data[col]) for col in feature_cols},
    index=range(len(future_years)),
)
X_future['Year'] = list(future_years)

# Predict all years in one call per model
diabetes_preds = model_diabetes.predict(X_future)
obesity_preds = model_obesity.predict(X_future)

predictions = [
    {
        'Year': year,
        'Predicted_Diabetes': diabetes_value,
        'Predicted_Obesity': obesity_value,
    }
    for year, diabetes_value, obesity_value in zip(future_years, diabetes_preds, obesity_preds)
]

df_pred = pd.DataFrame(predictions)
df_pred

In [None]:
# 9. Visualize Predictions
# Both projected series share one set of axes so their trends can be compared directly.
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(df_pred['Year'], df_pred['Predicted_Diabetes'], marker='o', label='Diabetes Prediction')
ax.plot(df_pred['Year'], df_pred['Predicted_Obesity'], marker='x', label='Obesity Prediction')

ax.set_title(f"Predicted Health Trends for {country} (2030-2040)")
ax.set_xlabel("Year")
ax.set_ylabel("Prevalence (%)")
ax.legend()
ax.grid(True)

plt.show()