In [8]:
import pandas as pd
import numpy as np
import joblib
import pytesseract
from PIL import Image
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
# The CSV is read into a DataFrame. diagnose() reads the "HGB", "RBC", "PCV",
# "TLC" and "PLT /mm3" columns of each row, so the file must provide at least
# those columns.
dataset_path = "synthetic_blood_count.csv"   # Update with your dataset filename
df = pd.read_csv(dataset_path)

def diagnose(row):
    """Return a rule-based diagnosis label for one blood-count record.

    Rules are checked in priority order and the first match wins:

    1. "Anemia" when HGB < 11, RBC < 4.0 or PCV < 35.
    2. "Infection" when TLC > 11.
    3. "Low Platelets" when "PLT /mm3" < 150.
    4. "Normal" otherwise.

    Args:
        row: Mapping-like record (for example a DataFrame row or a dict)
            indexable by the column names above.

    Returns:
        One of "Anemia", "Infection", "Low Platelets" or "Normal".
    """
    anemia_limits = (("HGB", 11), ("RBC", 4.0), ("PCV", 35))
    # The generator keeps the lookups lazy: a column is only read when every
    # earlier anemia check has come back negative.
    if any(row[column] < limit for column, limit in anemia_limits):
        return "Anemia"
    if row["TLC"] > 11:
        return "Infection"
    if row["PLT /mm3"] < 150:
        return "Low Platelets"
    return "Normal"

# Label every row with the rule-based diagnosis; this column is the training target.
df["Diagnosis"] = df.apply(diagnose, axis=1)

# Target is the derived label; every other column is a model input.
y = df["Diagnosis"]
X = df.drop(columns=["Diagnosis"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the classifier on the scaled training data.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Score the classifier on the held-out split.
y_pred = rf_model.predict(X_test_scaled)
rf_acc = accuracy_score(y_test, y_pred)
print(f"🔥 Random Forest Accuracy: {rf_acc:.2f}")

# Persist the model, the scaler and the input column order.
for artifact, artifact_path in (
    (rf_model, "blood_count_model.pkl"),
    (scaler, "blood_count_scaler.pkl"),
    (X.columns.tolist(), "blood_count_features.pkl"),
):
    joblib.dump(artifact, artifact_path)
print("✅ Model, scaler, and feature columns saved!")


def predict_from_report(image_path):
    """Predict a diagnosis from an image of a blood-count report.

    The saved model, scaler and feature list are loaded from the current
    working directory, the image is run through Tesseract OCR, and every
    feature is looked up in the OCR text as ``<feature name>: <number>``.

    Args:
        image_path: Location of the report image, passed to ``PIL.Image.open``.

    Returns:
        The string ``"Prediction: <label>"``; the same string is printed.

    Warns:
        UserWarning: When one or more features cannot be found in the OCR
            text. Those features are passed to the model as 0.
    """
    import warnings

    rf_model = joblib.load("blood_count_model.pkl")
    scaler = joblib.load("blood_count_scaler.pkl")
    feature_columns = joblib.load("blood_count_features.pkl")

    # Release the image file handle as soon as OCR has finished.
    with Image.open(image_path) as report_image:
        ocr_text = pytesseract.image_to_string(report_image)

    def extract_value(pattern, default=None, cast_type=float):
        match = re.search(pattern, ocr_text)
        return cast_type(match.group(1)) if match else default

    extracted_data = {}
    missing_features = []
    for feature in feature_columns:
        # Inside a raw string a single backslash is already the regex escape;
        # the previous doubled backslashes matched a literal "\" instead of
        # whitespace/digits, so nothing was ever extracted. The feature name
        # is escaped because names such as "PLT /mm3" are not regex-safe, and
        # the number group only accepts text that float() can parse.
        pattern = rf"{re.escape(str(feature))}\s*:\s*(\d+(?:\.\d+)?)"
        value = extract_value(pattern)
        if value is None:
            missing_features.append(feature)
            value = 0
        extracted_data[feature] = value

    if missing_features:
        # A zero-filled report still yields a confident-looking prediction,
        # so make the fallback visible instead of silent.
        warnings.warn(
            f"No value found in OCR text for: {missing_features}; using 0.",
            stacklevel=2,
        )

    # Build a one-row frame in the training column order so the scaler
    # receives the same feature names it was fitted with.
    report_data = pd.DataFrame([extracted_data], columns=feature_columns)
    report_scaled = scaler.transform(report_data)
    rf_pred = rf_model.predict(report_scaled)

    result = f"Prediction: {rf_pred[0]}"
    print(result)
    return result


# NOTE(review): pickle (which joblib.dump builds on) serializes a plain
# function by reference (its module and qualified name), not its code. Loading
# "blood_count_predictor.pkl" therefore presumably only works where
# predict_from_report is already defined under the same name; the file does
# not carry the prediction logic itself. Confirm before relying on it elsewhere.
joblib.dump(predict_from_report, "blood_count_predictor.pkl")
print("✅ Prediction function saved as a pickle file!")


# Run the predictor once on a sample report image.
image_path = "blood.jpg"  # Path of the report image to analyse
predict_from_report(image_path)


🔥 Random Forest Accuracy: 0.99
✅ Model, scaler, and feature columns saved!
✅ Prediction function saved as a pickle file!
Prediction: Anemia




'Prediction: Anemia'