In [1]:
import os
import pickle
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from pathlib import Path

In [2]:
%pwd


'e:\\AI DA Portfolio\\Multiple-Disease-Prediction\\research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'e:\\AI DA Portfolio\\Multiple-Disease-Prediction'

In [5]:
# defining the paths to the models and test datasets 

artifacts_root = Path('artifacts')

In [6]:
# paths to trained models pickle files

heart_disease_model_path = artifacts_root / 'models' / 'heart_disease.pkl'
parkinsons_model_path = artifacts_root / 'models' / 'parkinsons.pkl'
diabetes_model_path = artifacts_root / 'models' / 'diabetes.pkl'

In [7]:
# paths to test dataset files

heart_disease_test_data_path = artifacts_root / 'data_ingestion' / 'heart_disease' / 'heart_test_data.csv'
parkinsons_test_data_path = artifacts_root / 'data_ingestion' / 'parkinsons' / 'parkinsons_test_data.csv'
diabetes_test_data_path = artifacts_root / 'data_ingestion' / 'diabetes' / 'diab_test_data.csv'

In [8]:
# loading the models 

with open(heart_disease_model_path, 'rb') as file:
    heart_disease = pickle.load(file)

with open(parkinsons_model_path, 'rb') as file:
    parkinsons = pickle.load(file)

with open(diabetes_model_path, 'rb') as file:
    diabetes = pickle.load(file)

In [9]:
# loading test datasets 

heart_disease_test_data = pd.read_csv(heart_disease_test_data_path)
parkinsons_test_data = pd.read_csv(parkinsons_test_data_path)
diabetes_test_data = pd.read_csv(diabetes_test_data_path)

In [18]:
# Function for ensuring features  

def ensure_features(model, X):
    try:
        # Get expected features from the scaler step in the pipeline
        expected_features = model.named_steps['scaler'].feature_names_in_
    except (KeyError, AttributeError):
        # If the scaler doesn't exist or doesn't have feature_names_in_, use current columns
        expected_features = X.columns
    
    # Ensure that the test data has all the expected features
    for feature in expected_features:
        if feature not in X.columns:
            X[feature] = 0  # Fill missing features with 0
    
    # Return the dataframe with columns ordered as expected
    return X[expected_features]

In [16]:
# Heart Disease Data
X_heart_disease = ensure_features(heart_disease, heart_disease_test_data.iloc[:, :-1])
y_heart_disease = heart_disease_test_data.iloc[:, -1]

# Parkinson's Data
X_parkinsons = ensure_features(parkinsons, parkinsons_test_data.iloc[:, :-1])
y_parkinsons = parkinsons_test_data.iloc[:, -1]

# Diabetes Data
X_diabetes = ensure_features(diabetes, diabetes_test_data.iloc[:, :-1])
y_diabetes = diabetes_test_data.iloc[:, -1]

In [19]:
def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }

# Evaluate each model
heart_disease_metrics = evaluate_model(heart_disease, X_heart_disease, y_heart_disease)
parkinsons_metrics = evaluate_model(parkinsons, X_parkinsons, y_parkinsons)
diabetes_metrics = evaluate_model(diabetes, X_diabetes, y_diabetes)

ValueError: columns are missing: {'PPE'}