# In [None]:
import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
import joblib

# CSV holding the training data; ensure_dataset() writes a synthetic file
# here when none exists.
DATASET_PATH = "diet_disease_risk_dataset.csv"
# File where the fitted pipeline is cached with joblib between runs.
MODEL_PATH = "diet_risk_model.joblib"
# Seed used for the synthetic-data generator and the train/test split.
RANDOM_STATE = 42

def compute_bmi(height_cm: float, weight_kg: float) -> float:
    """Return the body-mass index (kg/m^2) rounded to one decimal place.

    Args:
        height_cm: Height in centimetres (anything ``float()`` accepts).
        weight_kg: Weight in kilograms (anything ``float()`` accepts).

    Returns:
        ``weight_kg / height_m ** 2`` rounded to one decimal, or ``np.nan``
        when the height is not positive or either value cannot be converted
        to ``float`` (for example ``None`` or a non-numeric string).
    """
    try:
        height_m = float(height_cm) / 100.0
    except (TypeError, ValueError):
        # Treat an unparseable height like a non-positive one so a single bad
        # record does not abort a whole DataFrame.apply call.
        return np.nan
    if height_m <= 0:
        return np.nan
    try:
        weight = float(weight_kg)
    except (TypeError, ValueError):
        return np.nan
    return round(weight / (height_m ** 2), 1)

def coerce_int(x, default=None):
    """Convert *x* to ``int``, returning *default* when conversion fails.

    Args:
        x: Value to convert (typically a string typed by the user).
        default: Value returned when ``int(x)`` cannot be computed.

    Returns:
        ``int(x)``, or *default* if the conversion raises ``TypeError``,
        ``ValueError`` or ``OverflowError``.
    """
    try:
        return int(x)
    # Only conversion failures are swallowed; a bare ``except`` would also
    # hide KeyboardInterrupt / SystemExit raised while the user is typing.
    except (TypeError, ValueError, OverflowError):
        return default

def coerce_float(x, default=None):
    """Convert *x* to ``float``, returning *default* when conversion fails.

    Args:
        x: Value to convert (typically a string typed by the user).
        default: Value returned when ``float(x)`` cannot be computed.

    Returns:
        ``float(x)``, or *default* if the conversion raises ``TypeError``,
        ``ValueError`` or ``OverflowError``.
    """
    try:
        return float(x)
    # Only conversion failures are swallowed; a bare ``except`` would also
    # hide KeyboardInterrupt / SystemExit raised while the user is typing.
    except (TypeError, ValueError, OverflowError):
        return default

def yes_no_to_int(s):
    """Map a yes/no style answer to ``1`` (affirmative) or ``0`` (anything else).

    The value is stringified, stripped and lower-cased before being compared
    with the accepted affirmative tokens: 1, y, yes, true, t.
    """
    normalized = str(s).strip().lower()
    if normalized in ("1", "y", "yes", "true", "t"):
        return 1
    return 0

def print_header(title):
    """Print *title* between two 70-character ``=`` rules, after a blank line."""
    rule = "=" * 70
    for line in ("\n" + rule, title, rule):
        print(line)

def ensure_dataset():
    """Load the training dataset, generating a synthetic one if it is absent.

    When ``DATASET_PATH`` exists it is read with ``pandas.read_csv`` and
    returned unchanged.  Otherwise a 400-row synthetic dataset is generated
    from a generator seeded with ``RANDOM_STATE``, written to
    ``DATASET_PATH`` and returned.

    The five ``risk_*`` labels of the synthetic data are deterministic: each
    is 1 where a weighted score of the features exceeds that score's 0.6
    quantile, and 0 otherwise.

    Returns:
        pandas.DataFrame: The loaded or freshly generated dataset.
    """
    if Path(DATASET_PATH).exists():
        return pd.read_csv(DATASET_PATH)

    print("[Info] No dataset found. Creating a small synthetic fallback dataset to proceed.")
    rng = np.random.default_rng(RANDOM_STATE)
    n = 400

    def ints(low, high):
        # Generator.integers excludes `high` by default; endpoint=True makes
        # the documented maxima (e.g. 5 meals, stress level 5, 7 exercise
        # days) actually occur in the generated data.
        return rng.integers(low, high, n, endpoint=True)

    def floats(low, high):
        return rng.uniform(low, high, n).round(1)

    df = pd.DataFrame({
        "age": ints(18, 70),
        "gender": rng.choice(["Male", "Female"], n),
        "height_cm": floats(145, 195),
        "weight_kg": floats(40, 120),
        "diet_type": rng.choice(["Vegetarian","Non-Vegetarian","Eggetarian"], n),
        "daily_meals": ints(2, 5),
        "outside_food_freq_per_week": ints(0, 10),
        "sugary_drinks_per_day": ints(0, 5),
        "fried_food_per_week": ints(0, 7),
        "processed_meat_per_week": ints(0, 7),
        "fruit_servings_per_day": ints(0, 6),
        "vegetable_servings_per_day": ints(0, 6),
        "whole_grains_servings_per_day": ints(0, 6),
        "fiber_g_per_day": floats(5, 45),
        "water_liters_per_day": floats(0.5, 5.0),
        "sleep_hours": floats(4, 10),
        "exercise_days_per_week": ints(0, 7),
        "steps_per_day": ints(1000, 15000),
        "smoking_status": rng.choice(["Never","Former","Current"], n),
        "alcohol_units_per_week": ints(0, 20),
        "stress_level": ints(1, 5),
        "family_history_diabetes": ints(0, 1),
        "family_history_heart": ints(0, 1),
        "salt_addition": ints(0, 1),
        "oil_type": rng.choice(["Refined","Olive","Mustard","Ghee"], n),
        "spicy_food_freq_per_week": ints(0, 10),
    })
    df["bmi"] = df.apply(lambda r: compute_bmi(r["height_cm"], r["weight_kg"]), axis=1)

    def above_quantile(score, q=0.6):
        # 1 where the score is strictly above its own q-quantile, else 0.
        return (score > pd.Series(score).quantile(q)).astype(int)

    is_current_smoker = (df["smoking_status"] == "Current").astype(int)
    risk_scores = {
        "risk_diabetes": (
            0.1*df["bmi"] + 0.2*df["sugary_drinks_per_day"]
            + 0.05*df["outside_food_freq_per_week"] + 0.3*df["family_history_diabetes"]
        ),
        "risk_hypertension": (
            0.07*df["bmi"] + 0.08*(df["age"]/10)
            + 0.1*df["salt_addition"] + 0.3*df["family_history_heart"]
        ),
        "risk_heart": (
            0.06*df["bmi"] + 0.05*(df["age"]/10)
            + 0.1*is_current_smoker + 0.3*df["family_history_heart"]
        ),
        "risk_obesity": (
            0.15*df["bmi"] + 0.08*df["fried_food_per_week"]
            + 0.06*df["outside_food_freq_per_week"] - 0.05*df["exercise_days_per_week"]
        ),
        "risk_gastritis": (
            0.06*df["spicy_food_freq_per_week"] + 0.05*df["outside_food_freq_per_week"]
        ),
    }
    for label, score in risk_scores.items():
        df[label] = above_quantile(score)

    # Persist the synthetic data so later runs reuse the same file.
    df.to_csv(DATASET_PATH, index=False)
    return df

def train_or_load_model():
    """Return the fitted risk pipeline, training and caching it when needed.

    If ``MODEL_PATH`` exists the pipeline stored there is loaded and returned.
    Otherwise the dataset from ``ensure_dataset()`` is split 80/20, a pipeline
    (standard-scaled numeric features + one-hot categorical features feeding
    one logistic regression per risk label) is fitted on the training part, a
    per-label classification report for the held-out part is printed, and the
    pipeline is saved to ``MODEL_PATH``.

    Returns:
        sklearn.pipeline.Pipeline: Pipeline with steps ``"preprocess"`` and
        ``"clf"``.

    Raises:
        KeyError: If the dataset lacks a required feature or label column.
    """
    if Path(MODEL_PATH).exists():
        # NOTE(security): joblib.load unpickles the file and can therefore
        # execute arbitrary code; only keep trusted files at MODEL_PATH.
        return joblib.load(MODEL_PATH)

    df = ensure_dataset()

    targets = ["risk_diabetes","risk_hypertension","risk_heart","risk_obesity","risk_gastritis"]

    feature_cols = [
        "age","gender","height_cm","weight_kg","bmi","diet_type","daily_meals",
        "outside_food_freq_per_week","sugary_drinks_per_day","fried_food_per_week",
        "processed_meat_per_week","fruit_servings_per_day","vegetable_servings_per_day",
        "whole_grains_servings_per_day","fiber_g_per_day","water_liters_per_day",
        "sleep_hours","exercise_days_per_week","steps_per_day","smoking_status",
        "alcohol_units_per_week","stress_level","family_history_diabetes",
        "family_history_heart","salt_addition","oil_type","spicy_food_freq_per_week"
    ]
    categorical_features = ["gender","diet_type","smoking_status","oil_type"]
    # Every feature that is not categorical is treated as numeric.
    numeric_features = [c for c in feature_cols if c not in categorical_features]

    # Fail early with a readable message; "bmi" is exempt because it is
    # derived from height and weight just below when absent.
    missing = [c for c in feature_cols + targets if c != "bmi" and c not in df.columns]
    if missing:
        raise KeyError(f"Dataset '{DATASET_PATH}' is missing required columns: {missing}")

    if "bmi" not in df.columns:
        df["bmi"] = df.apply(lambda r: compute_bmi(r["height_cm"], r["weight_kg"]), axis=1)

    X = df[feature_cols].copy()
    y = df[targets].copy()

    split_kwargs = {"test_size": 0.2, "random_state": RANDOM_STATE}
    try:
        # Stratify on the number of positive labels per row so both parts
        # contain a similar mix of low- and high-risk records.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y.sum(axis=1), **split_kwargs
        )
    except ValueError:
        # Stratification is impossible when some risk count occurs too rarely
        # (e.g. only once); fall back to a plain random split.  A dataset that
        # cannot be split at all raises again here.
        print("[Warn] Stratified split not possible; using a plain random split.")
        X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

    preproc = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_features),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ],
        remainder="drop"
    )

    base_clf = LogisticRegression(max_iter=1000)
    clf = MultiOutputClassifier(base_clf)

    pipeline = Pipeline(steps=[
        ("preprocess", preproc),
        ("clf", clf)
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    print_header("Validation Report (per-risk)")
    for i, target in enumerate(targets):
        print(f"\n--- {target} ---")
        print(classification_report(y_test[target], y_pred[:, i], zero_division=0))

    joblib.dump(pipeline, MODEL_PATH)
    print(f"\n[Saved] Trained model pipeline -> {MODEL_PATH}")
    return pipeline

def manual_prediction(pipeline):
    """Interactively collect one person's data and print the predicted risks.

    Every question shows its default in brackets; pressing Enter, or typing a
    value that cannot be parsed as a number, keeps the default.  BMI is
    derived from the entered height and weight.

    Args:
        pipeline: Fitted pipeline exposing ``named_steps["preprocess"]`` and
            ``named_steps["clf"]`` (with per-label ``estimators_``).

    Returns:
        None. Results are printed to stdout.
    """
    print_header("Manual Prediction (enter values; sample in brackets)")

    def prompt(txt, cast, default):
        val = input(f"{txt} [{default}]: ").strip()
        if val == "":
            return default
        parsed = cast(val)
        if parsed is None:
            # The coerce_* helpers signal a parse failure with None; letting
            # it through would crash the BMI computation or the model.
            print(f"[Warn] Could not interpret '{val}'; using default {default}.")
            return default
        return parsed

    def prompt_text(txt, default):
        return input(f"{txt} [{default}]: ").strip() or default

    def prompt_yes_no(txt, default):
        # Strip before the fallback so a whitespace-only answer means "default".
        return yes_no_to_int(input(f"{txt} [{default}]: ").strip() or default)

    age = prompt("Age (years)", coerce_int, 25)
    gender = prompt_text("Gender (Male/Female)", "Male")
    height_cm = prompt("Height (cm)", coerce_float, 172)
    weight_kg = prompt("Weight (kg)", coerce_float, 68)
    diet_type = prompt_text("Diet Type (Vegetarian/Non-Vegetarian/Eggetarian)", "Non-Vegetarian")
    daily_meals = prompt("Daily meals (2-5)", coerce_int, 3)
    outside_food = prompt("Outside food frequency per week (0-10)", coerce_int, 3)
    sugary = prompt("Sugary drinks per day (0-5)", coerce_int, 1)
    fried = prompt("Fried food per week (0-7)", coerce_int, 2)
    processed_meat = prompt("Processed meat per week (0-7)", coerce_int, 1)
    fruit = prompt("Fruit servings per day (0-6)", coerce_int, 2)
    veg = prompt("Vegetable servings per day (0-6)", coerce_int, 3)
    grains = prompt("Whole grains servings per day (0-6)", coerce_int, 2)
    fiber = prompt("Fiber grams per day (5-45)", coerce_float, 22)
    water = prompt("Water liters per day (0.5-5.0)", coerce_float, 2.5)
    sleep = prompt("Sleep hours (4-10)", coerce_float, 7.5)
    exercise_days = prompt("Exercise days per week (0-7)", coerce_int, 4)
    steps = prompt("Steps per day (1000-15000)", coerce_int, 9000)
    smoking = prompt_text("Smoking status (Never/Former/Current)", "Never")
    alcohol = prompt("Alcohol units per week (0-20)", coerce_int, 2)
    stress = prompt("Stress level (1=low to 5=high)", coerce_int, 2)
    fam_dia = prompt_yes_no("Family history of diabetes? (Yes/No)", "No")
    fam_heart = prompt_yes_no("Family history of heart disease? (Yes/No)", "No")
    salt_add = prompt_yes_no("Adds extra salt to meals? (Yes/No)", "Yes")
    oil_type = prompt_text("Oil type (Refined/Olive/Mustard/Ghee)", "Refined")
    spicy = prompt("Spicy food frequency per week (0-10)", coerce_int, 6)

    bmi = compute_bmi(height_cm, weight_kg)
    if np.isnan(bmi):
        # compute_bmi yields NaN for a non-positive height; report it here
        # rather than failing inside the model.
        print("[Error] Height must be greater than zero to compute BMI; cannot predict.")
        return

    row = {
        "age": age, "gender": gender, "height_cm": height_cm, "weight_kg": weight_kg,
        "bmi": bmi, "diet_type": diet_type, "daily_meals": daily_meals,
        "outside_food_freq_per_week": outside_food, "sugary_drinks_per_day": sugary,
        "fried_food_per_week": fried, "processed_meat_per_week": processed_meat,
        "fruit_servings_per_day": fruit, "vegetable_servings_per_day": veg,
        "whole_grains_servings_per_day": grains, "fiber_g_per_day": fiber,
        "water_liters_per_day": water, "sleep_hours": sleep,
        "exercise_days_per_week": exercise_days, "steps_per_day": steps,
        "smoking_status": smoking, "alcohol_units_per_week": alcohol,
        "stress_level": stress, "family_history_diabetes": fam_dia,
        "family_history_heart": fam_heart, "salt_addition": salt_add,
        "oil_type": oil_type, "spicy_food_freq_per_week": spicy
    }

    # `row` already carries every feature column, so no back-filling is needed.
    X = pd.DataFrame([row])

    preds = pipeline.predict(X)[0]
    # Transform once and reuse the result for every per-label estimator.
    Xt = pipeline.named_steps["preprocess"].transform(X)
    estimators = pipeline.named_steps["clf"].estimators_
    probs = [est.predict_proba(Xt)[:, 1][0] for est in estimators]

    targets = ["Diabetes", "Hypertension", "Heart Disease", "Obesity", "Gastritis"]
    print_header("Result (Manual)")
    print(f"Computed BMI: {bmi} (kg/m^2)")
    for t, p, pr in zip(targets, preds, probs):
        label = "High Risk" if int(p) == 1 else "Lower Risk"
        print(f"- {t}: {label} (probability = {pr:.2f})")

def csv_prediction(pipeline, csv_path="prediction_template.csv"):
    """Score every record of a CSV file and print per-person and summary results.

    Args:
        pipeline: Fitted pipeline exposing ``named_steps["preprocess"]`` and
            ``named_steps["clf"]`` (with per-label ``estimators_``).
        csv_path: Path of the CSV to score.  Feature columns that are absent
            are filled with NaN (a warning lists them); ``bmi`` is derived
            from height and weight when absent.

    Returns:
        None. Results are printed to stdout; a missing or empty file only
        produces a message.
    """
    if not Path(csv_path).exists():
        print(f"[Error] Could not find '{csv_path}'. Upload your CSV or check the path.")
        return

    df = pd.read_csv(csv_path)
    print(f"[Info] Loaded {len(df)} records from {csv_path}")

    if df.empty:
        # Nothing to predict; also avoids dividing by zero in the summary.
        print("[Info] No records to score.")
        return

    feature_cols = [
        "age","gender","height_cm","weight_kg","bmi","diet_type","daily_meals",
        "outside_food_freq_per_week","sugary_drinks_per_day","fried_food_per_week",
        "processed_meat_per_week","fruit_servings_per_day","vegetable_servings_per_day",
        "whole_grains_servings_per_day","fiber_g_per_day","water_liters_per_day",
        "sleep_hours","exercise_days_per_week","steps_per_day","smoking_status",
        "alcohol_units_per_week","stress_level","family_history_diabetes",
        "family_history_heart","salt_addition","oil_type","spicy_food_freq_per_week"
    ]

    # Fill absent columns BEFORE deriving BMI: the BMI lambda reads height and
    # weight and would raise KeyError if either column did not exist yet.
    missing = [c for c in feature_cols if c != "bmi" and c not in df.columns]
    if missing:
        print(f"[Warn] Missing columns filled with NaN: {', '.join(missing)}")
        for c in missing:
            df[c] = np.nan

    if "bmi" not in df.columns:
        df["bmi"] = df.apply(lambda r: compute_bmi(r["height_cm"], r["weight_kg"]), axis=1)

    features = df[feature_cols]
    preds = pipeline.predict(features)
    Xt = pipeline.named_steps["preprocess"].transform(features)
    estimators = pipeline.named_steps["clf"].estimators_
    # Column j holds the positive-class probability for risk label j.
    probs = np.column_stack([est.predict_proba(Xt)[:, 1] for est in estimators])

    print_header("Disease Risk Predictions - Results")

    targets = ["Diabetes", "Hypertension", "Heart Disease", "Obesity", "Gastritis"]

    # enumerate gives the row position, which is what indexes preds/probs;
    # the DataFrame index label is not guaranteed to match it.
    for pos, (_, row) in enumerate(df.iterrows()):
        print(f"\n{'='*50}")
        print(f"PERSON {pos + 1}")
        print(f"{'='*50}")
        print(f"Age: {row['age']} years | Gender: {row['gender']}")
        print(f"Height: {row['height_cm']} cm | Weight: {row['weight_kg']} kg")
        print(f"BMI: {row['bmi']:.1f} | Diet: {row['diet_type']}")
        print(f"Outside food frequency: {row['outside_food_freq_per_week']}/week")
        print(f"Exercise: {row['exercise_days_per_week']} days/week | Steps: {row['steps_per_day']}/day")
        print("\nRISK ASSESSMENT:")
        print("-" * 40)

        for j, target in enumerate(targets):
            risk_value = preds[pos, j]
            probability = probs[pos, j]
            risk_status = "HIGH RISK" if risk_value == 1 else "LOW RISK"
            risk_color = "🔴" if risk_value == 1 else "🟢"

            print(f"{risk_color} {target:15}: {risk_status:9} (Probability: {probability:.3f})")

        total_risks = int(preds[pos].sum())
        if total_risks >= 3:
            overall = "⚠️  HIGH OVERALL RISK - Consider lifestyle changes"
        elif total_risks >= 1:
            overall = "⚡ MODERATE RISK - Monitor closely"
        else:
            overall = "✅ LOW OVERALL RISK - Keep up good habits"

        print(f"\nOVERALL ASSESSMENT: {overall}")
        print(f"Total Risk Factors: {total_risks}/5")

    print(f"\n{'='*70}")
    print("SUMMARY STATISTICS")
    print(f"{'='*70}")
    print(f"Total people analyzed: {len(df)}")
    for j, target in enumerate(targets):
        high_risk_count = int(preds[:, j].sum())
        percentage = (high_risk_count / len(df)) * 100
        print(f"{target:15}: {high_risk_count:3d}/{len(df)} people at high risk ({percentage:5.1f}%)")

    print(f"\nAverage BMI: {df['bmi'].mean():.1f}")
    print(f"Average outside food frequency: {df['outside_food_freq_per_week'].mean():.1f} times/week")
    print(f"Average exercise days: {df['exercise_days_per_week'].mean():.1f} days/week")

def main():
    """Obtain the model, show the menu and dispatch on the user's choice.

    An empty answer selects option 1; anything other than "1" or "2" exits
    with a goodbye message.
    """
    print_header("Training / Loading Model")
    model = train_or_load_model()

    print_header("Choose an option")
    menu = (
        "1) Manual prediction (enter values)",
        "2) CSV prediction (batch) - Display results in output",
        "3) Exit",
    )
    for entry in menu:
        print(entry)

    selection = input("Enter 1/2/3 [1]: ").strip()
    if not selection:
        selection = "1"

    if selection == "1":
        manual_prediction(model)
        return

    if selection == "2":
        csv_file = input("Enter CSV path [prediction_template.csv]: ").strip()
        if not csv_file:
            csv_file = "prediction_template.csv"
        csv_prediction(model, csv_path=csv_file)
        return

    print("Goodbye.")

# Start the interactive flow only when the file is run as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()