In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import pytesseract
from PIL import Image
import re


def load_synthetic_data(file_path):
    """Read a CSV and split it into features and the ``stage`` target.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        A ``(features, target)`` pair: the frame without its ``stage``
        column, and the ``stage`` column as a Series.

    Raises:
        KeyError: If the CSV has no ``stage`` column.
    """
    frame = pd.read_csv(file_path)
    features = frame.drop(columns=["stage"])
    target = frame["stage"]
    return features, target


def preprocess_data(X):
    """Fit a fresh ``StandardScaler`` on ``X`` and return the scaled data.

    A new scaler is created and fitted on every call, so the returned
    scaler reflects the statistics of ``X`` only.

    Args:
        X: Feature matrix to fit on and transform.

    Returns:
        A ``(scaled_features, fitted_scaler)`` pair.
    """
    fitted_scaler = StandardScaler()
    scaled_features = fitted_scaler.fit_transform(X)
    return scaled_features, fitted_scaler


def train_stage_model(X_train, y_train):
    """Train the stage classifier on the given training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        A fitted ``RandomForestClassifier`` with 100 trees and a fixed
        ``random_state`` of 42.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    # fit() returns the estimator itself, so the fitted model can be
    # returned directly.
    return forest.fit(X_train, y_train)


def predict_stage(model, scaler, input_data):
    """Scale ``input_data``, predict its stage and look up a description.

    Args:
        model: Object exposing ``predict(features)``.
        scaler: Object exposing ``transform(features)``.
        input_data: Raw (unscaled) feature rows.

    Returns:
        A ``(prediction, description)`` pair. ``prediction`` is whatever
        ``model.predict`` returns; ``description`` is the text for the first
        predicted label, or ``"⚪ Unknown Stage"`` when that label is not one
        of 1-4.
    """
    labels_by_stage = {
        1: "🟢 Stage 1: Mild condition, requires monitoring. (Accuracy: ~85%)",
        2: "🟡 Stage 2: Moderate risk, lifestyle changes recommended. (Accuracy: ~88%)",
        3: "🟠 Stage 3: High risk, medical intervention needed. (Accuracy: ~92%)",
        4: "🔴 Stage 4: Severe, immediate medical attention required! (Accuracy: ~95%)",
    }

    predicted = model.predict(scaler.transform(input_data))

    # Only the first row's label is described, even if several rows
    # were passed in.
    first_label = predicted[0]
    if first_label in labels_by_stage:
        return predicted, labels_by_stage[first_label]
    return predicted, "⚪ Unknown Stage"


def extract_data_from_report(image_path, feature_names):
    """Build a one-row feature frame from a report image via OCR.

    The image is run through Tesseract and a fixed set of labelled fields
    is pulled out of the recognised text with regular expressions. Fields
    that are not found, or whose text cannot be converted to a number,
    fall back to a per-field default. ``sex``, ``fbs`` and ``restecg`` are
    not read from the report; they are always 1, 0 and 0.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        feature_names: Iterable of column names, in the order the model
            expects. Names with no extracted value are filled with 0;
            extracted fields not listed here are dropped.

    Returns:
        A single-row ``pandas.DataFrame`` whose columns follow
        ``feature_names``.
    """
    # The context manager releases the image's file handle once OCR is
    # done; a bare Image.open() would leave it open.
    with Image.open(image_path) as report_image:
        ocr_text = pytesseract.image_to_string(report_image)

    def extract_value(pattern, default=None, cast_type=str):
        """Return the first capture group of ``pattern`` cast to ``cast_type``."""
        match = re.search(pattern, ocr_text)
        if match is None:
            return default
        try:
            return cast_type(match.group(1))
        except ValueError:
            # OCR noise such as "." or "1..5" satisfies [\d.]+ but is not
            # a valid number; treat it like a missing field.
            return default

    extracted_data = {
        "age": extract_value(r"Age:\s*(\d+)", default=0, cast_type=int),
        "cp": extract_value(r"Chest Pain Type \(cp\):\s*(\d+)", default=0, cast_type=int),
        "trestbps": extract_value(r"Resting Blood Pressure \(trestbps\):\s*(\d+)", default=120, cast_type=int),
        "chol": extract_value(r"Cholesterol \(chol\):\s*(\d+)", default=200, cast_type=int),
        "thalach": extract_value(r"Maximum Heart Rate Achieved \(thalach\):\s*(\d+)", default=150, cast_type=int),
        "oldpeak": extract_value(r"ST Depression \(oldpeak\):\s*([\d.]+)", default=0.0, cast_type=float),
        "ca": extract_value(r"Number of Major Vessels \(ca\):\s*(\d+)", default=0, cast_type=int),
        "thal": extract_value(r"Thalassemia \(thal\):\s*(\d+)", default=2, cast_type=int),
        # Not parsed from the report: fixed placeholder values.
        "sex": 1,
        "fbs": 0,
        "restecg": 0,
    }

    # Re-key in the caller's column order so the frame lines up with the
    # feature layout the scaler and model were given.
    data_ordered = {key: extracted_data.get(key, 0) for key in feature_names}
    return pd.DataFrame([data_ordered])


# --- Load the labelled data and hold out a stratified 20% for evaluation ---
file_path = "heart.csv"
X, y = load_synthetic_data(file_path)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Scale features ---
# The scaler is fitted on the training split only and then reused for the
# test split. Calling preprocess_data(X_test) would fit a second scaler on
# the test data, so the evaluation would use different scaling from the
# scaler that is saved and applied at inference time.
X_train_scaled, scaler = preprocess_data(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Train and persist the model together with its scaler ---
model = train_stage_model(X_train_scaled, y_train)

with open("heart_stage_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("✅ Model and scaler saved successfully!")

# --- Evaluate on the held-out split ---
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"✅ Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))

# --- Reload the artifacts from disk to check the round trip ---
# NOTE: pickle.load executes arbitrary code from the file; only load
# pickles this script (or another trusted source) produced.
with open("heart_stage_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

with open("scaler.pkl", "rb") as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

print("✅ Model and scaler loaded successfully!")

# --- Predict the stage for a scanned report ---
image_path = "heart6.jpg"
report_data = extract_data_from_report(image_path, feature_names=X.columns)
predicted_stage, stage_info = predict_stage(loaded_model, loaded_scaler, report_data)

print(f"\n🔮 Predicted Stage from Report: {predicted_stage[0]}")
print(f"{stage_info}\n")

# NOTE(review): the findings and risk percentages below are fixed strings
# selected only by the predicted stage; none of them are computed from
# report_data. Stages other than 3 and 4 print no conclusion.
if predicted_stage[0] == 3:
    print("🔥 Conclusion: High Risk")
    print("- 📊 Estimated Risk: 64.0%")
    print("- 🩸 Normal BP.")
    print("- ⚠️ High cholesterol detected.")
    print("- ✅ Triglycerides are normal.")
    print("- ⚠️ Overweight.")
    print("- ✅ Normal blood sugar.")
    print("\nRecommendation: 🥗 Lifestyle modifications (healthy diet, exercise). 💊 Medication may be required if risk persists.")
elif predicted_stage[0] == 4:
    print("🚨 Conclusion: Critical Stage")
    print("- 📊 Estimated Risk: 85.0%")
    print("- 🔴 Extremely high cholesterol levels.")
    print("- ⚠️ High BP detected.")
    print("- ❗ Severe heart disease risk.")
    print("\nRecommendation: 🚑 Immediate medical intervention required! Seek professional medical care.")

# The same fitted scaler is also written under a second filename.
with open("heart_scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("✅ Scaler saved successfully!")



✅ Model and scaler saved successfully!
✅ Model Accuracy: 0.99
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.94      1.00      0.97        34
           2       1.00      1.00      1.00        55
           3       1.00      1.00      1.00       109

    accuracy                           0.99       200
   macro avg       0.74      0.75      0.74       200
weighted avg       0.98      0.99      0.99       200

✅ Model and scaler loaded successfully!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



🔮 Predicted Stage from Report: 3
🟠 Stage 3: High risk, medical intervention needed. (Accuracy: ~92%)

🔥 Conclusion: High Risk
- 📊 Estimated Risk: 64.0%
- 🩸 Normal BP.
- ⚠️ High cholesterol detected.
- ✅ Triglycerides are normal.
- ⚠️ Overweight.
- ✅ Normal blood sugar.

Recommendation: 🥗 Lifestyle modifications (healthy diet, exercise). 💊 Medication may be required if risk persists.
✅ Scaler saved successfully!
