# Problem Statement
Predicting SpO2 level and selecting best ML model

In [None]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


Predicting SpO2 level and selecting best ML model


# Data Loading

In [3]:
data = pd.read_csv("Synthetic_patient-HealthCare-Monitoring_dataset.csv")
data.columns = data.columns.str.strip()

# Data Understanding

In [4]:
print(data.head())
print(data.info())
print(data.describe())
print(data.columns)

   Patient Number  Heart Rate (bpm)  SpO2 Level (%)  \
0               1                98              96   
1               2               105              97   
2               3                90              85   
3               4               102              87   
4               5                81              95   

   Systolic Blood Pressure (mmHg)  Diastolic Blood Pressure (mmHg)  \
0                             120                               86   
1                             177                              104   
2                             139                               57   
3                             101                               77   
4                             172                              119   

   Body Temperature (°C) Fall Detection  Predicted Disease  Data Accuracy (%)  \
0                   38.1             No  Diabetes Mellitus                 95   
1                   37.6             No      Heart Disease                 91   
2    

# Data Preprocessing

In [5]:
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)


 # Target Variable

In [6]:
y = data["SpO2 Level (%)"]
X = data.drop("SpO2 Level (%)", axis=1)


# Feature Engineering

In [7]:
X = pd.get_dummies(X, drop_first=True)


# Feature Selection

In [8]:
feature_names = X.columns.tolist()
with open("feature_names.pkl", "wb") as f:
    pickle.dump(feature_names, f)


# Feature Scaling

In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train , testing and Splitting  data

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Model Selection

In [11]:
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "SVR": SVR(),
    "Polynomial Regression": Pipeline([
        ("poly", PolynomialFeatures(degree=2)),
        ("lr", LinearRegression())
    ])
}

accuracy_results = {}

print("\nModel Accuracy Results (R² Score in %):\n")

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = r2_score(y_test, y_pred) * 100
    accuracy_results[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.2f}%")



Model Accuracy Results (R² Score in %):

Linear Regression Accuracy: 77.85%
Decision Tree Accuracy: 53.77%
Random Forest Accuracy: 76.68%
SVR Accuracy: 77.18%
Polynomial Regression Accuracy: 77.83%


# Best Model Selection

In [12]:
best_model_name = max(accuracy_results, key=accuracy_results.get)
best_model = models[best_model_name]
best_accuracy = accuracy_results[best_model_name]

print("\nBest Model Selected")
print("Model Name:", best_model_name)
print(f"Accuracy: {best_accuracy:.2f}%")

# -----------------------------
# Save Best Model & Scaler
# -----------------------------
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("Best model, scaler & feature names saved successfully")


Best Model Selected
Model Name: Linear Regression
Accuracy: 77.85%
Best model, scaler & feature names saved successfully


# Saving the model

In [13]:
with open("best_model.pkl", "rb") as f:
    model = pickle.load(f)

with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

with open("feature_names.pkl", "rb") as f:
    feature_names = pickle.load(f)

# New patient Data

In [14]:
new_patient_data = {
    "Heart Rate (bpm)": 98,
    "Systolic Blood Pressure (mmHg)": 130,
    "Diastolic Blood Pressure (mmHg)": 85,
    "Body Temperature (°C)": 38.2,
    "Fall Detection": "No",
    "Data Accuracy (%)": 95,
    "Heart Rate Alert": "High",
    "Blood Pressure Alert": "High",
    "Temperature Alert": "High"
}

# Check all required features exist
new_df = pd.DataFrame([new_patient_data])
new_df = pd.get_dummies(new_df)
new_df = new_df.reindex(columns=feature_names, fill_value=0)

# Scale features
new_data_scaled = scaler.transform(new_df)

# Predict SpO2 level
prediction = model.predict(new_data_scaled)

print("\nPredicted SpO2 Level (%):", round(prediction[0], 2))


Predicted SpO2 Level (%): 88.99
