<a href="https://colab.research.google.com/github/yashcoder0007/anemia_predictor/blob/main/anemia_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import sys
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import ticker
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
from flask import Flask, request, render_template, send_from_directory
import warnings
warnings.filterwarnings("ignore")

# Input dataset and output locations, all relative to the working directory.
DATA_FILE = "anemia.csv"
TEMPLATES_DIR = "templates"
STATIC_DIR = "static"
MODELS_DIR = "models"
PLOTS_DIR = os.path.join(STATIC_DIR, "plots")

# Creating PLOTS_DIR also creates its parent STATIC_DIR.
for _out_dir in (TEMPLATES_DIR, PLOTS_DIR, MODELS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Locate the dataset. When the default file is absent and the notebook runs
# in Google Colab, offer an interactive upload and use the first uploaded file.
if not os.path.exists(DATA_FILE):
    try:
        from google.colab import files
    except ImportError:
        # Not running in Colab: there is no upload widget to fall back on.
        uploaded = {}
    else:
        try:
            uploaded = files.upload()
        except Exception as exc:
            # Best effort only; report the problem instead of hiding it.
            print(f"Dataset upload failed: {exc}", file=sys.stderr)
            uploaded = {}
    if uploaded:
        DATA_FILE = next(iter(uploaded))

# Fail early with an actionable message rather than a bare read error.
if not os.path.exists(DATA_FILE):
    raise FileNotFoundError(
        f"Dataset file '{DATA_FILE}' not found. Place it in the working "
        "directory or upload it when prompted."
    )

df = pd.read_csv(DATA_FILE)
if 'Result' not in df.columns:
    raise ValueError("Dataset must contain a 'Result' column as the target.")

df_columns = df.columns.tolist()

# Summary statistics of every column, one row per column, saved for reference.
desc = df.describe(include='all').transpose()
desc.to_csv(os.path.join(STATIC_DIR, "descriptive_statistics.csv"))

# Make sure 'Gender' is numeric before plotting and modelling.
# pd.to_numeric(errors='coerce') never raises; it turns unparseable labels
# into NaN. So detect that case explicitly and fall back to category codes
# instead of silently wiping the column.
if 'Gender' in df.columns:
    gender_numeric = pd.to_numeric(df['Gender'], errors='coerce')
    if gender_numeric.isna().sum() > df['Gender'].isna().sum():
        # Some non-missing labels were not numeric: encode each distinct
        # label as an integer code (pandas category codes).
        gender_codes = df['Gender'].astype('category').cat.codes
        # cat.codes marks missing values with -1; keep them as NaN so the
        # median imputer handles them downstream.
        df['Gender'] = gender_codes.astype(float).where(gender_codes >= 0)
    else:
        df['Gender'] = gender_numeric

# Histogram of every numeric column, saved as static/plots/dist_<column>.png.
for col in df.select_dtypes(include=[np.number]).columns:
    hist_fig, hist_ax = plt.subplots(figsize=(6, 3))
    hist_ax.hist(df[col].dropna(), bins=30, edgecolor='k', alpha=0.7)
    hist_ax.set_title(f"Distribution of {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Count")
    hist_fig.tight_layout()
    hist_fig.savefig(os.path.join(PLOTS_DIR, f"dist_{col}.png"))
    plt.close(hist_fig)

# Horizontal boxplot of every numeric column, saved as box_<column>.png.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
for col in num_cols:
    box_fig, box_ax = plt.subplots(figsize=(6, 4))
    box_ax.boxplot(df[col].dropna(), vert=False)
    box_ax.set_title(f"Boxplot of {col}")
    box_fig.tight_layout()
    box_fig.savefig(os.path.join(PLOTS_DIR, f"box_{col}.png"))
    plt.close(box_fig)

# Boxplot of each numeric feature grouped by the target, saved as
# static/plots/by_result_<column>.png.
if 'Result' in df.columns:
    for col in num_cols:
        if col == 'Result':
            continue
        # Draw on an explicit Axes. Without ax=, DataFrame.boxplot opens its
        # own figure, leaving the pre-created one empty and never closed.
        fig, ax = plt.subplots(figsize=(6, 4))
        try:
            df.boxplot(column=col, by='Result', ax=ax)
            ax.set_title(f"{col} by Result")
            # Remove the automatic "Boxplot grouped by ..." super-title.
            fig.suptitle("")
            ax.set_xlabel("Result")
            fig.tight_layout()
            fig.savefig(os.path.join(PLOTS_DIR, f"by_result_{col}.png"))
        except Exception as exc:
            # Plots are best-effort: report the failure and keep going.
            print(f"Skipping by-result boxplot for {col}: {exc}", file=sys.stderr)
        finally:
            plt.close(fig)

# Correlation heat-map of the numeric columns (needs at least two of them),
# saved both as an image and as a CSV table.
if len(num_cols) >= 2:
    corr = df[num_cols].corr()
    corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
    im = corr_ax.imshow(corr, cmap='RdYlBu', vmin=-1, vmax=1)
    corr_fig.colorbar(im, ax=corr_ax, fraction=0.046, pad=0.04)
    tick_positions = range(len(num_cols))
    corr_ax.set_xticks(tick_positions)
    corr_ax.set_xticklabels(num_cols, rotation=45, ha='right')
    corr_ax.set_yticks(tick_positions)
    corr_ax.set_yticklabels(num_cols)
    corr_ax.set_title("Correlation Matrix")
    corr_fig.tight_layout()
    corr_fig.savefig(os.path.join(PLOTS_DIR, "correlation_matrix.png"))
    plt.close(corr_fig)
    corr.to_csv(os.path.join(STATIC_DIR, "correlation_matrix.csv"))

# Model inputs; every one of them must be present in the dataset.
features = ['Gender', 'Hemoglobin', 'MCH', 'MCHC', 'MCV']
_absent_feature = next((name for name in features if name not in df.columns), None)
if _absent_feature is not None:
    raise ValueError(f"Required feature '{_absent_feature}' not found in dataset.")

X = df.loc[:, features].copy()
y = df['Result'].copy()

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Shared preprocessing: median imputation followed by standardisation.
preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Candidate classifiers, keyed by the name used in file names and reports.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=1000),
    RandomForest=RandomForestClassifier(n_estimators=200, random_state=42),
    GradientBoosting=GradientBoostingClassifier(n_estimators=200, random_state=42),
    SVC=SVC(probability=True, kernel='rbf', C=1.0, random_state=42),
)

results_summary, trained_models = [], {}

# The preprocessor is fitted on the training split only and then applied
# to the held-out split.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Seeded, shuffled 5-fold stratified splitter for cross-validation.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def _positive_class_scores(fitted_pipe, X_proc):
    """Return a score per row for the positive class (second class column)."""
    if hasattr(fitted_pipe.named_steps['clf'], "predict_proba"):
        return fitted_pipe.predict_proba(X_proc)[:, 1]
    # No probabilities available: min-max scale the decision values to [0, 1].
    raw = fitted_pipe.decision_function(X_proc)
    return (raw - raw.min()) / (raw.max() - raw.min() + 1e-12)


def _save_confusion_matrix_plot(cm, name):
    """Save the confusion matrix heat-map as static/plots/cm_<name>.png."""
    fig = plt.figure(figsize=(4, 3))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f"Confusion Matrix: {name}")
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ['Normal', 'Anemic'])
    plt.yticks(tick_marks, ['Normal', 'Anemic'])
    # White text on dark cells, black on light ones, for readability.
    threshold = cm.max() / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > threshold else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are not clipped.
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, f"cm_{name}.png"))
    plt.close(fig)


def _save_roc_plot(y_true, y_score, name):
    """Save the ROC curve as static/plots/roc_<name>.png."""
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    fig = plt.figure(figsize=(6, 4))
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.3f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {name}')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, f"roc_{name}.png"))
    plt.close(fig)


# Fit every candidate on the preprocessed training split, score it on the
# held-out split, cross-validate it, and persist the model, plots and reports.
for name, clf in models.items():
    # The saved pipeline holds only the classifier; it expects inputs that
    # have already been passed through `preprocessor`.
    clf_pipe = Pipeline([
        ('clf', clf)
    ])
    clf_pipe.fit(X_train_proc, y_train)
    trained_models[name] = clf_pipe

    y_proba = _positive_class_scores(clf_pipe, X_test_proc)
    y_pred = clf_pipe.predict(X_test_proc)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    try:
        roc = roc_auc_score(y_test, y_proba)
    except ValueError:
        # ROC AUC is undefined, e.g. when the test split holds a single class.
        roc = float('nan')

    # Cross-validate preprocessing and classifier together on the raw
    # features, so the imputer and scaler are re-fitted inside each training
    # fold. Transforming with a preprocessor fitted outside the folds leaks
    # statistics from the validation rows into the scores.
    # cross_val_score clones the estimator, so the fitted objects above are
    # left untouched.
    cv_pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', clf)
    ])
    cv_scores = cross_val_score(cv_pipe, X, y, cv=cv, scoring='roc_auc')

    results_summary.append({
        'model': name,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
        'roc_auc': roc,
        'cv_roc_auc_mean': np.mean(cv_scores),
        'cv_roc_auc_std': np.std(cv_scores)
    })
    joblib.dump(clf_pipe, os.path.join(MODELS_DIR, f"{name}.pkl"))

    cm = confusion_matrix(y_test, y_pred)
    _save_confusion_matrix_plot(cm, name)
    _save_roc_plot(y_test, y_proba, name)

    report = classification_report(y_test, y_pred, output_dict=True)
    pd.DataFrame(report).transpose().to_csv(os.path.join(STATIC_DIR, f"classification_report_{name}.csv"))

# Rank the models by hold-out ROC AUC (best first) and save the table.
results_df = pd.DataFrame(results_summary)
results_df = results_df.sort_values(by='roc_auc', ascending=False)
results_df.to_csv(os.path.join(STATIC_DIR, "model_comparison.csv"), index=False)

# Reload the top-ranked model from the pickle written during training.
best_model_name = results_df['model'].iloc[0]
best_model_path = os.path.join(MODELS_DIR, f"{best_model_name}.pkl")
best_model = joblib.load(best_model_path)

# Input form served at "/". Each field name must match a model feature.
# The 'Gender' field used to be labelled "RBC (million/µL)" with a default of
# 4.5, although the model has no RBC feature; it is now labelled as what it is.
# No default is pre-filled for it, because the numeric coding depends on the
# dataset.
index_html = """<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width,initial-scale=1">
  <title>Anemia Sense</title>
  <style>
    body{font-family:'Poppins',sans-serif;background:#f9fafc;margin:0;display:flex;flex-direction:column;align-items:center}
    h1{color:#ff6b6b;margin-top:60px;text-align:center}
    .form-container{background:#fff;padding:35px 30px;border-radius:16px;box-shadow:0 4px 12px rgba(0,0,0,0.1);width:100%;max-width:420px;margin-top:25px}
    form{display:flex;flex-direction:column;gap:15px}
    input[type=number]{padding:12px;border:1px solid #ddd;border-radius:8px;font-size:1rem;transition:border .2s}
    input[type=number]:focus{border-color:#ff6b6b;outline:none}
    input[type=submit]{padding:12px;background:#ff6b6b;color:#fff;border:none;border-radius:8px;font-weight:600;cursor:pointer;transition:background .3s,transform .2s}
    input[type=submit]:hover{background:#ff4c4c;transform:scale(1.02)}
    footer{margin-top:60px;text-align:center;color:#888;font-size:.9rem}
  </style>
</head>
<body>
  <h1>Predict Anemia</h1>
  <div class="form-container">
    <form action="/predict" method="post">
      <input type="number" name="Hemoglobin" placeholder="Hemoglobin (g/dL)" step="any" required value="11.0">
      <input type="number" name="MCH" placeholder="MCH (pg)" step="any" required value="28.0">
      <input type="number" name="MCHC" placeholder="MCHC (g/dL)" step="any" required value="33.0">
      <input type="number" name="MCV" placeholder="MCV (fL)" step="any" required value="90.0">
      <input type="number" name="Gender" placeholder="Gender (numeric code as in the dataset)" step="any" required>
      <input type="submit" value="Predict">
    </form>
  </div>
  <footer>&copy; 2025 Anemia Sense</footer>
</body>
</html>"""

# Result page rendered by "/predict"; echoes the submitted values.
predict_html = """<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width,initial-scale=1">
  <title>Anemia Prediction Result</title>
  <style>
    body{font-family:'Poppins',sans-serif;background:#f9fafc;margin:0;display:flex;flex-direction:column;align-items:center}
    h1{color:#ff6b6b;margin-top:60px;text-align:center}
    .result-container{background:#fff;padding:35px 30px;border-radius:16px;box-shadow:0 4px 12px rgba(0,0,0,0.1);width:100%;max-width:420px;margin-top:25px}
    .result{font-size:1.2rem;font-weight:600;margin-bottom:20px}
    .anemic{color:#e63946}
    .normal{color:#2a9d8f}
    .features{text-align:left;background:#f9f9f9;border-radius:10px;padding:15px 20px;margin-top:15px;font-size:.95rem}
    .features p{margin:8px 0;color:#555}
    a{display:inline-block;margin-top:25px;text-decoration:none;background:#ff6b6b;color:#fff;padding:12px 25px;border-radius:8px;font-weight:600}
    a:hover{background:#ff4c4c}
    footer{margin-top:60px;text-align:center;color:#888;font-size:.9rem}
  </style>
</head>
<body>
  <h1>Prediction Result</h1>
  <div class="result-container">
    <p class="result {{ result_class }}">{{ result_text }}</p>
    <div class="features">
      <p><strong>Hemoglobin:</strong> {{ Hemoglobin }} g/dL</p>
      <p><strong>MCH:</strong> {{ MCH }} pg</p>
      <p><strong>MCHC:</strong> {{ MCHC }} g/dL</p>
      <p><strong>MCV:</strong> {{ MCV }} fL</p>
      <p><strong>Gender:</strong> {{ Gender }}</p>
    </div>
    <a href="/">Predict Again</a>
  </div>
  <footer>&copy; 2025 Anemia Sense</footer>
</body>
</html>"""

# Write the templates as UTF-8 explicitly: the pages declare charset utf-8,
# so they must not depend on the platform's default encoding.
with open(os.path.join(TEMPLATES_DIR, "index.html"), "w", encoding="utf-8") as f:
    f.write(index_html)
with open(os.path.join(TEMPLATES_DIR, "predict.html"), "w", encoding="utf-8") as f:
    f.write(predict_html)

# Flask application using the generated template and static directories.
app = Flask(
    __name__,
    template_folder=TEMPLATES_DIR,
    static_folder=STATIC_DIR,
)


@app.route("/")
def home():
    """Render the input form."""
    page = render_template("index.html")
    return page

@app.route("/predict", methods=["POST"])
def do_predict():
    """Classify the submitted form values and render the result page.

    Reads the five model features from the POSTed form, applies the fitted
    preprocessor and the best model, and renders ``predict.html``.

    Returns:
        The rendered result page, or a ``(message, 400)`` tuple when a field
        is missing, not numeric, or not finite.
    """
    # Same names and order as the features the preprocessor was fitted on.
    field_names = ('Gender', 'Hemoglobin', 'MCH', 'MCHC', 'MCV')
    raw = {name: request.form.get(name) for name in field_names}

    missing = [name for name, value in raw.items() if value is None or not value.strip()]
    if missing:
        return f"Missing required field(s): {', '.join(missing)}", 400

    try:
        row = {name: float(value) for name, value in raw.items()}
    except ValueError:
        return "All fields must be numeric.", 400
    if not np.isfinite(list(row.values())).all():
        return "All fields must be finite numbers.", 400

    # A DataFrame keeps the column names, so the input lines up with the
    # names the preprocessor saw at fit time (a bare ndarray drops them).
    frame = pd.DataFrame([row], columns=list(field_names))
    data_proc = preprocessor.transform(frame)
    pred = int(best_model.predict(data_proc)[0])

    # Label 1 is reported as anemic, anything else as normal.
    is_anemic = pred == 1
    return render_template(
        "predict.html",
        result_text="Anemic" if is_anemic else "Normal",
        result_class="anemic" if is_anemic else "normal",
        Hemoglobin=raw['Hemoglobin'],
        MCH=raw['MCH'],
        MCHC=raw['MCHC'],
        MCV=raw['MCV'],
        Gender=raw['Gender'],
    )

@app.route("/plots/<path:filename>")
def plots(filename):
    """Serve a generated plot image from the plots directory."""
    response = send_from_directory(PLOTS_DIR, filename)
    return response

# Report the outcome of training, then start the web server on port 5000,
# listening on all interfaces.
if __name__ == "__main__":
    comparison_csv = os.path.join(STATIC_DIR, "model_comparison.csv")
    print("Best model:", best_model_name)
    print("Model comparison saved to:", comparison_csv)
    app.run(host="0.0.0.0", port=5000, debug=False)



Best model: LogisticRegression
Model comparison saved to: static/model_comparison.csv
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:Press CTRL+C to quit


<Figure size 600x400 with 0 Axes>

<Figure size 600x400 with 0 Axes>

<Figure size 600x400 with 0 Axes>

<Figure size 600x400 with 0 Axes>

<Figure size 600x400 with 0 Axes>

In [None]:
!echo Flask >> requirements.txt
!echo numpy >> requirements.txt
!echo pandas >> requirements.txt
!echo scikit-learn >> requirements.txt
!echo joblib >> requirements.txt
!echo gunicorn >> requirements.txt

