In [81]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import pickle
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

In [83]:
# Generate dataset
# Seed NumPy's legacy global RNG so every np.random draw below is reproducible
np.random.seed(42)

# Number of synthetic records (rows) to generate
n_samples = 10000

In [84]:
# Generate features
# Every column is drawn independently from the seeded global RNG, so the order
# of these calls determines the exact values produced.
# - randint's upper bound is exclusive: Age is 18-80, Disease_Severity 1-10,
#   Treatment_Compliance 1-5.
# - Continuous measurements are normal draws clipped to a fixed range.
# - Binary flags are single-trial binomial draws with the given probability of 1.
data = {
    'Age': np.random.randint(18, 81, n_samples),
    'Disease_Severity': np.random.randint(1, 11, n_samples),
    'BMI': np.random.normal(25, 5, n_samples).clip(16, 45),
    'Heart_Rate': np.random.normal(75, 10, n_samples).clip(50, 120),
    'Glucose_Level': np.random.normal(100, 20, n_samples).clip(70, 200),
    'Oxygen_Saturation': np.random.normal(97, 2, n_samples).clip(90, 100),
    'Chronic_Condition': np.random.binomial(1, 0.3, n_samples),
    'Previous_Treatments': np.random.binomial(1, 0.4, n_samples),
    'Diabetes': np.random.binomial(1, 0.2, n_samples),
    'Hypertension': np.random.binomial(1, 0.25, n_samples),
    'Heart_Disease': np.random.binomial(1, 0.15, n_samples),
    'Treatment_Compliance': np.random.randint(1, 6, n_samples)
}

In [85]:
# Levels for the three categorical features
blood_pressure_categories = ['Normal', 'Elevated', 'High']
exercise_levels = ['Sedentary', 'Light', 'Moderate', 'Active']
smoking_status = ['Never', 'Former', 'Current']

# Sample each column uniformly from its levels. The dict literal is evaluated
# top to bottom, so the seeded RNG is consumed in this fixed order.
data.update({
    'Blood_Pressure': np.random.choice(blood_pressure_categories, n_samples),
    'Exercise_Level': np.random.choice(exercise_levels, n_samples),
    'Smoking_Status': np.random.choice(smoking_status, n_samples),
})

In [86]:
# Create DataFrame
# Each key of `data` becomes a column; each of the n_samples entries becomes a row
df = pd.DataFrame(data)

In [87]:
# Preview the first rows of the generated data
df.head()

Unnamed: 0,Age,Disease_Severity,BMI,Heart_Rate,Glucose_Level,Oxygen_Saturation,Chronic_Condition,Previous_Treatments,Diabetes,Hypertension,Heart_Disease,Treatment_Compliance,Blood_Pressure,Exercise_Level,Smoking_Status
0,56,9,23.693592,83.066809,71.802021,100.0,0,1,0,1,0,1,Elevated,Sedentary,Current
1,69,7,26.249031,73.931358,118.996241,97.064332,0,0,0,0,0,3,High,Light,Never
2,46,8,20.317917,71.690782,91.737865,96.647722,1,1,1,0,0,3,Elevated,Light,Current
3,32,1,27.126602,88.742163,76.838453,95.947502,0,1,0,0,0,3,High,Active,Never
4,60,2,25.598619,82.23702,110.745764,99.170051,0,0,0,0,1,2,Elevated,Sedentary,Current


In [88]:
# Sanity check: the row count should equal n_samples
len(df)

10000

In [90]:
# Target variable (Treatment Duration)
# Baseline duration, in days, before any feature-specific effects are added
base_duration = 20

In [91]:
# Linear contribution of the numeric and binary features on top of the baseline.
# Age, BMI, Glucose_Level and Treatment_Compliance are centred (50, 25, 100, 3),
# so a record sitting exactly at those values contributes nothing for them.
duration = (
    base_duration
    + df['Disease_Severity'] * 5
    + (df['Age'] - 50) * 0.2
    + (df['BMI'] - 25) * 0.5
    + (df['Glucose_Level'] - 100) * 0.1
    + df['Chronic_Condition'] * 10
    + df['Previous_Treatments'] * (-5)
    + df['Diabetes'] * 7
    + df['Hypertension'] * 5
    + df['Heart_Disease'] * 8
    + (df['Treatment_Compliance'] - 3) * (-3)
)

# Categorical effects: the first matching condition wins, otherwise `default`
duration += np.select(
    [df['Blood_Pressure'] == 'High', df['Blood_Pressure'] == 'Elevated'],
    [7, 3],
    default=0,
)
duration += np.select(
    [
        df['Exercise_Level'] == 'Sedentary',
        df['Exercise_Level'] == 'Light',
        df['Exercise_Level'] == 'Moderate',
    ],
    [5, 2, -2],
    default=-4,
)
duration += np.select(
    [df['Smoking_Status'] == 'Current', df['Smoking_Status'] == 'Former'],
    [8, 4],
    default=0,
)

# Gaussian noise with mean 0 and standard deviation 5
duration += np.random.normal(0, 5, n_samples)

# Enforce a floor of 7 days, then round to whole days
df['Treatment_Duration'] = np.maximum(duration, 7).round()

In [92]:
# Save the dataset
# index=False keeps the DataFrame's row index out of the CSV
df.to_csv('healthcare_dataset.csv', index=False)

In [None]:
# Load your dataset
# Rebinds `df` to the contents of the CSV file; everything below works on this copy
df = pd.read_csv('healthcare_dataset.csv')

In [105]:
def best_ML_algorithm(df, target_column, algorithms, categorical_columns=None,
                      model_path='treatment_duration_model.pkl',
                      feature_names_path='feature_names.pkl'):
    """Train each candidate regressor, score it on a hold-out split, and save the best.

    The features are one-hot encoded, split 70/30 (random_state=42) and
    standardised with a StandardScaler fitted on the training part only. Every
    algorithm is instantiated with its default arguments, fitted on the scaled
    training data and scored on the scaled test data. The model with the
    highest R² is pickled together with the scaler, and the encoded column
    order is pickled separately so prediction inputs can be aligned later.

    Args:
        df: DataFrame holding the features and the target column.
        target_column: Name of the column to predict.
        algorithms: Iterable of estimator classes (not instances); each must be
            constructible with no arguments and expose fit/predict.
        categorical_columns: Columns to one-hot encode. Defaults to
            ['Blood_Pressure', 'Exercise_Level', 'Smoking_Status'].
        model_path: Where the (best_model, scaler) tuple is pickled.
        feature_names_path: Where the list of encoded feature names is pickled.

    Returns:
        (best_algorithm, results): the class name of the best-scoring algorithm
        and a dict mapping each class name to its MAE, MSE, RMSE and R².

    Raises:
        ValueError: if ``algorithms`` is empty.
    """
    # Fail fast with a clear message instead of an opaque error after training setup
    candidates = list(algorithms)
    if not candidates:
        raise ValueError("algorithms must contain at least one estimator class")

    if categorical_columns is None:
        categorical_columns = ['Blood_Pressure', 'Exercise_Level', 'Smoking_Status']

    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Convert categorical variables to dummy variables
    X = pd.get_dummies(X, columns=list(categorical_columns))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Scale features; the scaler only ever sees training rows when fitting
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results = {}
    best_model = None
    best_algorithm = None
    best_score = -np.inf

    # NOTE(review): the same test split is used both to pick the winner and to
    # report its metrics, so the winner's reported score is optimistic.
    for algorithm in candidates:
        model = algorithm()
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        MAE = mean_absolute_error(y_test, y_pred)
        MSE = mean_squared_error(y_test, y_pred)
        RMSE = np.sqrt(MSE)
        r2 = r2_score(y_test, y_pred)

        results[algorithm.__name__] = {
            'Mean Absolute Error': MAE,
            'Mean Squared Error': MSE,
            'Root Mean Squared Error': RMSE,
            'R² score': r2
        }

        # Track name and fitted model together so the reported winner is
        # always the model that gets saved (ties keep the earlier candidate)
        if r2 > best_score:
            best_score = r2
            best_model = model
            best_algorithm = algorithm.__name__

    # Save the best model and scaler
    with open(model_path, 'wb') as f:
        pickle.dump((best_model, scaler), f)

    # Save feature names (column order after one-hot encoding)
    with open(feature_names_path, 'wb') as f:
        pickle.dump(list(X.columns), f)

    return best_algorithm, results

# List of algorithms
# Classes (not instances) are listed: best_ML_algorithm calls each one with no
# arguments to build the estimator.
algorithms = [
    LinearRegression,
    DecisionTreeRegressor,
    ExtraTreesRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor,
    XGBRegressor
]

# Run the function
# Side effect: writes treatment_duration_model.pkl and feature_names.pkl
best_algo, results = best_ML_algorithm(df, 'Treatment_Duration', algorithms)
print("\nBest Algorithm Results:")
print(f"Best Algorithm: {best_algo}")
# Transpose so there is one row per algorithm and one column per metric
results_df = pd.DataFrame(results).transpose()
print("\nAll Algorithm Results:")
print(results_df)

print("Model training completed and saved!")



Best Algorithm Results:
Best Algorithm: LinearRegression

All Algorithm Results:
                           Mean Absolute Error  Mean Squared Error  \
LinearRegression                      3.976824           24.707075   
DecisionTreeRegressor                 8.971667          128.550333   
ExtraTreesRegressor                   5.559813           48.468072   
RandomForestRegressor                 5.784007           52.714660   
GradientBoostingRegressor             4.493315           31.946151   
XGBRegressor                          4.778072           35.875339   

                           Root Mean Squared Error  R² score  
LinearRegression                          4.970621  0.930512  
DecisionTreeRegressor                    11.338004  0.638455  
ExtraTreesRegressor                       6.961901  0.863685  
RandomForestRegressor                     7.260486  0.851741  
GradientBoostingRegressor                 5.652093  0.910152  
XGBRegressor                              5.98960

In [106]:
def plot_correlation_heatmap(df):
    """Build a heatmap of the pairwise correlations between numeric columns.

    Args:
        df: DataFrame to analyse; non-numeric columns are ignored.

    Returns:
        The Plotly figure (not yet shown; call ``.show()`` on it to render).
    """
    # 'number' matches every numeric dtype (int32, float32, ...), not only the
    # 64-bit ones, so no numeric column is silently left out
    numeric_df = df.select_dtypes(include='number')
    corr_matrix = numeric_df.corr()
    # Pin the colour range to the full correlation range [-1, 1] so that the
    # neutral colour of the diverging scale corresponds to zero correlation
    fig = px.imshow(corr_matrix,
                    title='Feature Correlation Heatmap',
                    color_continuous_scale='RdBu_r',
                    zmin=-1,
                    zmax=1)
    return fig

# Build the correlation heatmap for df and render it
plot_correlation_heatmap(df).show()

In [107]:
# Constants for input validation
# (min, max) per numeric feature; the bounds match the ranges used when the
# synthetic dataset was generated (presumably meant as inclusive limits).
# NOTE(review): nothing in the visible notebook reads this dict yet.
INPUT_RANGES = {
    'Age': (18, 80),
    'BMI': (16.0, 45.0),
    'Disease_Severity': (1, 10),
    'Heart_Rate': (50, 120),
    'Glucose_Level': (70, 200),
    'Oxygen_Saturation': (90, 100),
    'Treatment_Compliance': (1, 5)
}

In [108]:
def load_model(model_path='treatment_duration_model.pkl',
               feature_names_path='feature_names.pkl'):
    """Load the trained model, its scaler and the training feature names.

    Args:
        model_path: Pickle file containing a ``(model, scaler)`` tuple.
        feature_names_path: Pickle file containing the list of feature names
            in training column order.

    Returns:
        (model, scaler, feature_names)

    Raises:
        FileNotFoundError: if either file does not exist.

    Note:
        Unpickling executes arbitrary code from the file. Only load files that
        this notebook (or another trusted source) produced.
    """
    with open(model_path, 'rb') as f:
        model, scaler = pickle.load(f)
    with open(feature_names_path, 'rb') as f:
        feature_names = pickle.load(f)
    return model, scaler, feature_names

In [110]:
def create_prediction_input():
    """Return a hard-coded example record to feed into a prediction.

    Returns:
        A new dict on every call, holding one value for each of the seven
        numeric features, the five binary flags and the three categorical
        features.
    """
    return dict(
        # Numeric features
        Age=45,
        BMI=25.0,
        Disease_Severity=5,
        Heart_Rate=75,
        Glucose_Level=100,
        Oxygen_Saturation=97,
        Treatment_Compliance=3,
        # Binary flags
        Chronic_Condition=1,
        Previous_Treatments=1,
        Diabetes=0,
        Hypertension=0,
        Heart_Disease=0,
        # Categorical features
        Blood_Pressure='Normal',
        Exercise_Level='Moderate',
        Smoking_Status='Never',
    )

In [111]:
def predict_treatment_duration(model, scaler, feature_names, input_data):
    """Predict the treatment duration for a single input record.

    Args:
        model: Fitted estimator exposing ``predict``.
        scaler: Fitted transformer exposing ``transform``.
        feature_names: Column names, in the order used at training time.
        input_data: Dict with one value per raw feature, including
            'Blood_Pressure', 'Exercise_Level' and 'Smoking_Status'.

    Returns:
        The first (and only) element of the model's prediction.
    """
    categorical = ['Blood_Pressure', 'Exercise_Level', 'Smoking_Status']

    # One-row frame with the categorical columns one-hot encoded
    single_row = pd.get_dummies(pd.DataFrame([input_data]), columns=categorical)

    # Match the training layout: dummy columns for levels absent from this
    # record are filled with 0, columns unknown to training are dropped, and
    # the column order follows feature_names
    aligned = single_row.reindex(columns=feature_names, fill_value=0)

    scaled = scaler.transform(aligned)
    return model.predict(scaled)[0]

In [112]:
# Reload the pickled model, scaler and training-time column order from disk
model, scaler, feature_names = load_model()

# Hard-coded sample record
input_data = create_prediction_input()

# Make prediction
estimated_duration = predict_treatment_duration(model, scaler, feature_names, input_data)
# The prediction is a float; display it rounded to whole days
print(f"Estimated Treatment Duration: {estimated_duration:.0f} days")

Estimated Treatment Duration: 47 days


In [119]:
# Same record as create_prediction_input(), except Blood_Pressure is 'High'.
# Overriding an existing key keeps its original position in the dict.
input_data = {**create_prediction_input(), 'Blood_Pressure': 'High'}

In [117]:
def risk_analysis(data):
    """Analyze patient risk factors and provide recommendations.

    Args:
        data: Mapping with the keys 'BMI', 'Glucose_Level', 'Blood_Pressure',
            'Smoking_Status', 'Exercise_Level', 'Treatment_Compliance',
            'Heart_Rate' and 'Oxygen_Saturation'.

    Returns:
        dict with three lists:
        - 'risk_factors': label of every rule that fired, in rule order.
        - 'risk_explanations': one explanation per risk factor, same order.
        - 'recommendations': advice for the fired rules that define one; this
          list can be shorter than 'risk_factors'.

    Raises:
        KeyError: if ``data`` lacks one of the keys listed above.
    """
    # Each rule: (condition, label, explanation, recommendation or None).
    # Conditions read the `data` argument, never a notebook-level variable, so
    # the result depends only on what the caller passes in.
    rules = (
        (data['BMI'] >= 30,
         "High BMI",
         "BMI ≥ 30 indicates obesity",
         "Consider nutrition consultation and structured exercise program"),
        (data['Glucose_Level'] > 126,
         "Elevated Glucose",
         "Fasting glucose > 126 mg/dL indicates diabetes risk",
         "Regular glucose monitoring and dietary management recommended"),
        (data['Blood_Pressure'] == 'High',
         "High Blood Pressure",
         "High blood pressure increases treatment complexity",
         "Regular blood pressure monitoring and medication review needed"),
        (data['Smoking_Status'] == 'Current',
         "Current Smoker",
         "Smoking may complicate treatment and recovery",
         "Smoking cessation program recommended"),
        (data['Exercise_Level'] == 'Sedentary',
         "Sedentary Lifestyle",
         "Low physical activity may slow recovery",
         "Gradual increase in physical activity with professional guidance"),
        (data['Treatment_Compliance'] <= 2,
         "Low Treatment Compliance",
         "Poor compliance history may affect treatment success",
         "Consider patient education program and simplified treatment plan"),
        # NOTE(review): the two rules below have no recommendation text;
        # decide whether that is intentional.
        (data['Heart_Rate'] > 100,
         "Elevated Heart Rate",
         "Resting heart rate > 100 bpm indicates stress",
         None),
        (data['Oxygen_Saturation'] < 95,
         "Low Oxygen Saturation",
         "O2 saturation < 95% may indicate respiratory issues",
         None),
    )

    fired = [rule for rule in rules if rule[0]]

    return {
        'risk_factors': [label for _, label, _, _ in fired],
        'risk_explanations': [explanation for _, _, explanation, _ in fired],
        'recommendations': [advice for _, _, _, advice in fired if advice is not None]
    }

In [120]:
# Run the risk analysis on the current input_data record
risk_analysis(input_data)

{'risk_factors': ['High Blood Pressure'],
 'risk_explanations': ['High blood pressure increases treatment complexity'],
 'recommendations': ['Regular blood pressure monitoring and medication review needed']}