# Packages & Libraries

In [None]:
# Data Loading & Pre-Processing
import numpy as np
import pandas as pd
import subprocess, sys
from statsmodels.stats.outliers_influence import variance_inflation_factor
import pingouin as pg
#!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE

# Data Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import set_config
from sklearn.pipeline import Pipeline

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier # LightGBM is 6 times faster than XGBoost.
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# ML Model Evaluation
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_curve, auc, matthews_corrcoef, cohen_kappa_score, log_loss
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Other imports
import os
import time
import warnings
warnings.filterwarnings('once')
warnings.filterwarnings('ignore', category = DeprecationWarning)
warnings.filterwarnings('ignore', category = FutureWarning)

def install(package):
    """Install ``package`` with pip, using the interpreter that runs this notebook.

    Raises ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# ydata_profiling is treated as an optional dependency: try to import it and,
# if the import fails, install it with pip through install() defined above.
try:
    import ydata_profiling
except ImportError:
    install('ydata_profiling')

# Imported after the guard so that a freshly installed package is picked up.
from ydata_profiling import ProfileReport

# Project 1: Anemia Prediction

## Dataset Overview

This anemia dataset contains the attributes Gender, Hemoglobin, MCHC, MCV, MCH and Result. It is used to predict whether a patient is likely to suffer from anemia, which makes this a binary classification problem.

Gender:
- 0 - male
- 1 - female


---


**Hemoglobin (g/dl)**: Hemoglobin is a protein in your red blood cells that carries oxygen to your body's organs and tissues and transports carbon dioxide from your organs and tissues back to your lungs


---


**MCH (pg)**: MCH is short for "mean corpuscular hemoglobin." It's the average amount in each of your red blood cells of a protein called hemoglobin, which carries oxygen around your body.


---


**MCHC (g/dl)**: MCHC stands for mean corpuscular hemoglobin concentration. It's a measure of the average concentration of hemoglobin inside a single red blood cell.


---


**MCV (fL)**: MCV stands for mean corpuscular volume. An MCV blood test measures the average size of your red blood cells, reported in femtoliters (fL).


---


Result:
- 0 - not anemic
- 1 - anemic

Kaggle link: https://www.kaggle.com/datasets/biswaranjanrao/anemia-dataset

In [None]:
# Show the current working directory; the relative data paths used below ('../data/...') are resolved against it.
os.getcwd()

In [None]:
# Read the raw anemia dataset from the data directory next to this notebook's folder
data = pd.read_csv('../data/anemia.csv')

# Build the profiling report and export it as HTML so it can be inspected outside the notebook
profile = ProfileReport(data, title = "Anemia Dataset")
profile.to_file("anemia_dataset_characteristics.html")

# Number of observations, followed by a preview of the first rows
print(data.shape[0])
data.head()

**Upon reviewing the report, the following information is derived:**

* The dataset comprises 6 columns/features (2 categorical and 4 numeric) with a total of 1421 observations with 0% missing data.

* The 'Gender' column is labeled as categorical in the report, but upon inspection we see that it consists of only two unique values (0 - male & 1 - female). The same applies to the 'Result' column.

* There are 472 duplicate rows, accounting for 33.2% of the entire dataset.


In [None]:
# Look at the first two observations and at the absolute column-wise difference between them
first_two_rows = data.iloc[[0, 1]]
print(first_two_rows)

row_difference = first_two_rows.iloc[0] - first_two_rows.iloc[1]
print(row_difference.abs())

## Exploratory Data Analysis

In [None]:
# Data type of every column
print(data.dtypes)

# Number of missing values per column
missing_per_column = data.isna().sum()
print('\n', missing_per_column, '\n')

# Summary statistics of the dataset
data.describe()

In [None]:
def countplot(col, title, xlabel, ylabel, hue = None, tick_labels = ('Male', 'Female')):
    """Draw an annotated count plot of ``data[col]`` on a new figure.

    Parameters
    ----------
    col : str
        Column of the global ``data`` frame shown on the x axis.
    title, xlabel, ylabel : str
        Figure title and axis labels.
    hue : str, optional
        Column used to split every bar into sub-groups.
    tick_labels : sequence of str or None, optional
        Labels placed on the x ticks 0..n-1. Defaults to ('Male', 'Female'),
        which matches the 0/1 encoding of 'Gender'. Pass None to keep the
        tick labels chosen by seaborn.
    """
    plt.figure()  # every call starts a new figure
    ax = sns.countplot(x = col, data = data, hue = hue)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if tick_labels is not None:
        ax.set_xticks(range(len(tick_labels)))
        ax.set_xticklabels(tick_labels)

    for p in ax.patches:
        height = p.get_height()

        # Skip patches without a real count: NaN heights would make int() fail and
        # zero-height patches would only add a meaningless "0" label.
        if not height > 0:
            continue

        ax.annotate(f'{int(height)}', (p.get_x() + p.get_width() / 2., height),
                    ha = 'center', va = 'baseline', fontsize = 10, color = 'black', xytext = (0, 2),
                    textcoords = 'offset points')

# gender distribution
countplot('Gender', 'Gender Distribution in the Dataset', 'Gender', '# of Patients')

# gender distribution of anemia cases (the title now differs from the plot above)
countplot('Gender', 'Gender Distribution of Anemia Cases', 'Gender', '# of Patients', hue = 'Result')

* There is a slight imbalance in the dataset (740 female cases vs 681 male cases).

* **Nearly twice as many females have anemia compared to males.**

In [None]:
# Keep only the rows diagnosed with anemia (Result == 1)
is_anemic = data['Result'] == 1
anemia_cases = data[is_anemic]

# Split the anemia cases by gender (1 = female, 0 = male)
female_anemia_cases = anemia_cases[anemia_cases['Gender'] == 1]
male_anemia_cases = anemia_cases[anemia_cases['Gender'] == 0]

female_share = len(female_anemia_cases) / len(anemia_cases)
print("Female anemia cases as proportion of all anemia cases: ", np.round(female_share, 2))

In [None]:
# generating distribution plots for the other columns in the dataset
columns_to_plot = [col for col in data.columns if col not in ['Gender', 'Result']]

def plot_gender_specific(columns, data, gender_col = 'Gender', gender_labels = None):
    """Plot the distribution of every column in ``columns`` split by gender.

    Columns of dtype object or with fewer than 10 unique values are drawn as
    count plots; all other columns as density histograms with a KDE overlay.

    Parameters
    ----------
    columns : iterable of str
        Columns of ``data`` to plot, one figure per column.
    data : pandas.DataFrame
        Frame holding ``columns`` and ``gender_col``.
    gender_col : str, optional
        Column used as hue.
    gender_labels : dict, optional
        Mapping from the raw values of ``gender_col`` to display labels.
        Defaults to {0: 'Male', 1: 'Female'}.
    """
    labels = {0: 'Male', 1: 'Female'} if gender_labels is None else gender_labels

    # Label the hue values in the data instead of overwriting the legend text afterwards:
    # plt.legend(labels = [...]) assigns labels by artist order and can swap the groups.
    plot_data = data.assign(**{gender_col: data[gender_col].map(lambda value: labels.get(value, value))})
    hue_order = [labels.get(value, value) for value in sorted(data[gender_col].dropna().unique())]

    for col in columns:
        fig, ax = plt.subplots()

        # Check if the column is categorical or numerical
        if plot_data[col].dtype == 'object' or plot_data[col].nunique() < 10:
            sns.countplot(x = col, data = plot_data, hue = gender_col, hue_order = hue_order, ax = ax)
            ax.set_ylabel('# of Cases')
        else:
            sns.histplot(data = plot_data, x = col, hue = gender_col, hue_order = hue_order, kde = True,
                         element = 'step', stat = 'density', common_norm = False, ax = ax)
            ax.set_ylabel('Density')

        ax.set_title(f'Distribution of {col} by Gender')
        ax.set_xlabel(col)

        legend = ax.get_legend()
        if legend is not None:
            legend.set_title('Gender')

        plt.show()

plot_gender_specific(columns_to_plot, data)

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
corr_matrix = data.corr()

fig, ax = plt.subplots(figsize = (12, 5))
sns.heatmap(corr_matrix, annot = True, cmap = 'coolwarm', center = 0, vmin = -1, vmax = 1, ax = ax)

plt.show()

**Hemoglobin vs. Result (-0.8): There is a strong negative correlation, indicating that as the hemoglobin increases, the 'Result' tends to decrease, meaning that: "the higher the hemoglobin, the lower the chance of having anemia".**

Gender vs. Result (0.25): There is a weak positive correlation, meaning there is a slight tendency for the 'Result' to increase as 'Gender' increases, though this relationship is not strong. This means that gender has some statistical importance when determining whether a patient has anemia.

### Data Limitations

* **DUPLICATE ROWS**
* no patient's age provided
* no information about other potential illnesses of a patient, nor medical history
* no data on:
  - MPV
  - RDWc
  - GRA%
  - LYM%
  - GRA
  - MID
  - LYM
  - thrombocytes
  - leukocytes
  - erythrocytes
  - hematocrits
  - platelets

## Model Training & Evaluation

In [None]:
# train features (X) and target (y)
X = data.drop('Result', axis = 1)
y = data['Result']

# NOTE(review): the profiling report above lists 472 duplicate rows; identical rows can land in
# both the training and the test set, which makes the scores below optimistic.

# Splitting the data into training and testing sets (80% train & 20% test).
# stratify = y keeps the anemic / non-anemic ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

# Feature scaling: fitted on the training data only and then applied to the test data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fitting a Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model's performance (accuracy, precision and recall as percentages)
print(f'Accuracy: {np.round(accuracy_score(y_test, y_pred) * 100, 2)}%')
print(f'Precision: {np.round(precision_score(y_test, y_pred) * 100, 2)}%')
print(f'Recall: {np.round(recall_score(y_test, y_pred) * 100, 2)}%')
print(f'F1 Score: {np.round(f1_score(y_test, y_pred), 2)}\n')

print("Classification Report: ")
print(classification_report(y_test, y_pred))

print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred), '\n')

# ROC Curve and AUC value, based on the predicted probability of the positive class
print("ROC Curve: ")
y_prob = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_score = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label = f'ROC curve (area = {auc_score:.2f})')
plt.plot([0, 1], [0, 1], 'k--')  # diagonal = performance of a random classifier
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc = 'lower right')
plt.show()

print('\nMatthews Correlation Coefficient:', np.round(matthews_corrcoef(y_test, y_pred),2))

print('Cohen\'s Kappa Score:', np.round(cohen_kappa_score(y_test, y_pred), 2))

In [None]:
# Gender-specific views of the dataset (Gender: 1 = female, 0 = male)
is_female = data['Gender'] == 1
is_male = data['Gender'] == 0

female_data = data.loc[is_female]
male_data = data.loc[is_male]

## Testing on patient data

Gender: Female (1)

Hemoglobin: 12.4 (g/dl)

MCH: 31.8 (pg)

MCV: 77 (fL)

MCHC: 41.1 (g/dl)

In [None]:
# Collect the patient's values interactively
gender = int(input("Enter the Gender (0 for Male, 1 for Female): "))
if gender not in (0, 1):
    raise ValueError("Gender must be 0 (male) or 1 (female).")

hemoglobin = float(input("Enter the Hemoglobin value: "))
mch = float(input("Enter the MCH value: "))
mcv = float(input("Enter the MCV value: "))
mchc = float(input("Enter the MCHC value: "))

# One-row frame whose columns follow the order of the training features, so the scaler
# and the model receive the values in the positions they were fitted on.
feature_columns = list(data.drop('Result', axis = 1).columns)
new_data = pd.DataFrame({'Gender': [gender], 'Hemoglobin': [hemoglobin], 'MCH': [mch], 'MCHC': [mchc], 'MCV': [mcv]})
new_data = new_data[feature_columns]
new_data_scaled = scaler.transform(new_data)

prediction = model.predict(new_data_scaled)

if prediction[0] == 0:
  print("The patient is predicted to NOT have anemia.")
else:
  print("Model Outcome: The patient is predicted to have anemia.")

# Reference population: patients of the same gender as the entered patient.
# The patient is not mixed into the reference data; the value is only drawn as a marker.
gender_label = 'females' if gender == 1 else 'males'
reference_data = data[data['Gender'] == gender]

# Loop through features and create plots
for feature in ['Hemoglobin', 'MCH', 'MCV', 'MCHC']:
    patient_value = new_data[feature].values[0]
    reference_values = reference_data[feature]

    fig, ax = plt.subplots(figsize = (8, 5))

    # Density histogram with KDE, so the y axis really shows a density
    sns.histplot(reference_values, kde = True, stat = 'density', ax = ax)

    # Highlight the patient's value as a dashed vertical line
    ax.axvline(patient_value, color = 'red', linestyle = '--', label = 'Patient Data')

    # Widen the x range when needed so the patient marker is always visible
    ax.set_xlim(min(reference_values.min(), patient_value), max(reference_values.max(), patient_value))
    ax.set_title(f'Distribution of {feature} for {gender_label}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.legend()
    plt.show()

# Project 2: Anemia Type Classification

HGB: The amount of hemoglobin in the blood, crucial for oxygen transport.

PLT: The number of platelets in the blood, involved in blood clotting.

WBC: The count of white blood cells, vital for immune response.

RBC: The count of red blood cells, responsible for oxygen transport.

MCV (Mean Corpuscular Volume): Average volume of a single red blood cell.

MCH (Mean Corpuscular Hemoglobin): Average amount of hemoglobin per red blood cell.

MCHC (Mean Corpuscular Hemoglobin Concentration): Average concentration of hemoglobin in red blood cells.

PDW: a measurement of the variability in platelet size distribution in the blood

PCT: Plateletcrit, the fraction of the blood volume that is occupied by platelets. (In a CBC panel PCT does not refer to the procalcitonin test, which shares the same abbreviation and is used to help diagnose sepsis.)

Diagnosis: Anemia type based on the CBC parameters

In [None]:
anemia_data = pd.read_csv('../data/diagnosed_cbc_data_v4-original_data.csv')

print(anemia_data.shape)

# Number of observations per diagnosis; reused for the pie chart below
diagnosis_counts = anemia_data['Diagnosis'].value_counts()
print(diagnosis_counts)

# Printed explicitly: a bare expression in the middle of a cell is evaluated but never displayed
print(anemia_data.describe())

# Generating a report of the data
#anemia_type_dataset_profile = ProfileReport(anemia_data, title = "Anemia Type Dataset")

# Saving the report to .html for inspection
#anemia_type_dataset_profile.to_file("anemia_type_dataset_characteristics.html")

# Pie chart of the diagnoses; each slice shows its percentage and, in parentheses, the exact count
total_diagnoses = diagnosis_counts.sum()

def _pct_with_count(pct):
    # matplotlib passes the slice percentage; convert it back to the absolute count
    return f'{pct:.1f}% ({int(round(pct * total_diagnoses / 100))})'

plt.figure(figsize = (10, 6))
plt.pie(diagnosis_counts, labels = diagnosis_counts.index, autopct = _pct_with_count)
plt.title('Distribution of Anemia Types')
plt.show()

In [None]:
# Preview the first rows of the anemia-type dataset.
anemia_data.head()

In [None]:
# Drop the 'LYMp' and 'NEUTp' columns.
# Reassigning instead of inplace = True, together with errors = 'ignore', keeps this cell
# re-runnable: a second execution no longer raises a KeyError for the already removed columns.
anemia_data = anemia_data.drop(columns = ['LYMp', 'NEUTp'], errors = 'ignore')

The dataset is heavily imbalanced under the 'Diagnosis' column.

In [None]:
# Correlations between all columns except the last one, rounded to two decimals and shown as a heatmap
feature_frame = anemia_data.iloc[:, :-1]
corr_matrix = feature_frame.corr().round(2)

fig, ax = plt.subplots(figsize = (12, 5))
sns.heatmap(corr_matrix, annot = True, center = 0, vmin = -1, vmax = 1, ax = ax)

plt.show()

**Correlation Ranges and Their Interpretation:**

***Perfect Positive Correlation (+1):***

* This means that two features move together perfectly; if one feature increases, the other feature always increases in a directly proportional way.
* Example: If A and B have a correlation of +1, then as A increases, B always increases in the exact same manner.

***High Positive Correlation (+0.7 to +1):***

* Strong relationship where an increase in one feature is highly likely to be accompanied by an increase in the other feature.
* Action: Investigate if features are redundant and consider dropping one of the features if they contain similar information.

***Moderate Positive Correlation (+0.4 to +0.7):***

* There is a clear positive relationship, but it is not perfect. These features may still contain useful independent information.
* Action: Generally, no need to drop either feature unless domain knowledge suggests redundancy.

***Low Positive Correlation (+0.1 to +0.4):***

* Weak positive relationship; the features increase together, but only slightly.
* Action: Low concern for multicollinearity. Keep both features unless otherwise indicated.

***No Correlation (-0.1 to +0.1):***

* No discernible linear relationship between the features.
* Action: Both features can coexist without causing issues of multicollinearity.

***Low Negative Correlation (-0.1 to -0.4):***

* Weak inverse relationship; as one feature increases, the other tends to decrease slightly.
* Action: Similar to weak positive correlation, usually not a concern.

***Moderate Negative Correlation (-0.4 to -0.7):***

* Clear inverse relationship; as one feature increases, the other decreases in a moderate, predictable way.
* Action: Consider if the features are providing redundant information in an inverse way.

***High Negative Correlation (-0.7 to -1):***

* Strong inverse relationship; as one feature increases, the other decreases in a very predictable and proportional way.
* Action: Similar to high positive correlation, you may need to drop or combine features to reduce redundancy.

***Perfect Negative Correlation (-1):***

* This means that two features are perfectly inversely correlated. As one increases, the other decreases in exact proportion.
* Example: If A and B have a correlation of -1, then as A increases, B always decreases by the same amount.

In [None]:
# Partial correlation of every feature with the (label-encoded) diagnosis, controlling for 'HGB'
anemia_data_copy = anemia_data.copy()

# The target has to be numeric for the correlation, so it is label-encoded on the copy
label_encoder = LabelEncoder()
anemia_data_copy['Diagnosis'] = label_encoder.fit_transform(anemia_data_copy['Diagnosis'])

# Every column except the target and the control variable
excluded_columns = ['Diagnosis', 'HGB']
features = [col for col in anemia_data_copy.columns if col not in excluded_columns]

# feature name -> partial correlation coefficient r
partial_corr_results = {
    feature: pg.partial_corr(data=anemia_data_copy, x=feature, y='Diagnosis', covar='HGB')['r'].values[0]
    for feature in features
}

# Table sorted from the largest to the smallest coefficient, rounded to two decimals
partial_corr_df = (
    pd.DataFrame(partial_corr_results.items(), columns=['Feature', 'Partial Correlation (r)'])
    .sort_values(by='Partial Correlation (r)', ascending=False)
    .round(2)
)
partial_corr_df

**r (Partial Correlation Coefficient):**

This is the main result: the partial correlation coefficient between 'RBC' and 'Diagnosis', while controlling for 'HGB'.

Range: The value of r ranges from -1 to 1.

* r = 1: Perfect positive correlation (as 'RBC' increases, 'Diagnosis' increases, after controlling for 'HGB').
  
* r = -1: Perfect negative correlation (as 'RBC' increases, 'Diagnosis' decreases, after controlling for 'HGB').
  
* r = 0: No linear relationship between 'RBC' and 'Diagnosis', after controlling for 'HGB'.

In [None]:
# Target variable (y): the diagnosis; features (X): every remaining column
target_column = 'Diagnosis'
y = anemia_data[target_column]
X = anemia_data.drop(columns = [target_column])

In [None]:
# Adding a constant column to the features to account for the intercept
X_with_constant = pd.concat([pd.DataFrame({'Intercept': 1}, index=X.index), X], axis=1)

# Built once instead of on every loop iteration
design_matrix = X_with_constant.values

# Calculating VIF for each feature. Index 0 is the intercept and is skipped, so the values
# line up with X.columns; taking the names from X guarantees they match the computed columns.
vif_data = pd.DataFrame({
    'Feature': X.columns,
    'VIF': [np.round(variance_inflation_factor(design_matrix, i), 2) for i in range(1, design_matrix.shape[1])]
})

vif_data

**The Variance Inflation Factor (VIF)** measures the severity of multicollinearity in regression analysis. It is a statistical concept that indicates the increase in the variance of a regression coefficient as a result of collinearity.

**Interpreting VIF Values:**

* VIF = 1: No multicollinearity.

* VIF between 1 and 5: Moderate multicollinearity (usually acceptable).

* VIF > 5: High multicollinearity (you should consider removing or combining highly correlated variables).

* VIF > 10: Indicates severe multicollinearity, which is typically problematic.

In [None]:
# Over-sampling the minority classes with SMOTE.
# A fixed random_state makes the synthetic samples (and everything derived from them) reproducible.
smote = SMOTE(random_state = 42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combining the resampled X and y back into a DataFrame
anemia_data_resampled = X_resampled.copy()
anemia_data_resampled['Diagnosis'] = y_resampled

print(anemia_data_resampled.Diagnosis.value_counts())

anemia_data_resampled.head()

In [None]:
# Split the ORIGINAL data into features (X) and target (y).
# Over-sampling must happen after the split: SMOTE builds synthetic rows from neighbouring
# samples, so resampling the full dataset first lets information from test rows leak into training.
X = anemia_data.drop('Diagnosis', axis = 1)
y = anemia_data['Diagnosis']

# Splitting the data into training and testing sets; stratify = y keeps every diagnosis
# represented in both parts, which matters for the rare classes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

# Balancing the classes of the training set only; the test set keeps its real distribution
X_train, y_train = SMOTE(random_state = 42).fit_resample(X_train, y_train)

# Data Pre-Processing (only done after the data split): Feature scaling (optional but recommended for some algorithms)
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train) # fit on training data
X_test = scaler.transform(X_test) # transform on test data

# Encoding the diagnosis label
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

### Considerations

**Scaler**

* Use RobustScaler() if your data has outliers and you want a robust way to scale your features without being skewed by extreme values. RobustScaler() will scale the data using the median and IQR, making it less sensitive to outliers.
  
* Use MinMaxScaler() if your models benefit from having input features strictly within a defined range (especially neural networks), but be cautious of outliers. MinMaxScaler() will scale the data between 0 and 1.

* Use StandardScaler() if you are working with models that assume normally distributed data and your features are relatively normally distributed without extreme outliers. StandardScaler() will center the data around 0 with a standard deviation of 1.

**Evaluation Metrics**

***micro:***
* Aggregates all TP, FP, FN across classes and calculates the metric globally. Treats every sample equally, making it suitable when you care about the overall performance on individual samples.

***macro:***
* Averages the metric for each class equally, regardless of how many samples are in each class. Useful when you care about all classes equally.

***weighted:***
* Like macro, but accounts for the number of samples in each class. It's good when you want to consider class imbalance while still evaluating each class individually.

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

# Estimators to compare; the stochastic ones get a fixed random_state so reruns give the same scores
model_pipeline = [
    XGBClassifier(random_state = 42),
    SVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state = 42),
    RandomForestClassifier(random_state = 42),
    GaussianNB()
]

# Display names, in the same order as model_pipeline
model_list = ['XGBoost', 'SVM', 'KNN', 'Decision Tree', 'Random Forest', 'Naive Bayes']
assert len(model_list) == len(model_pipeline), "every estimator needs exactly one display name"

acc_list = [] # to store the Accuracy for each model
precision_list = [] # to store the Precision for each model
recall_list = [] # to store the Recall for each model
f1_list = [] # to store the F1 Score for each model

times = [] # to store the fitting time of each model
auc_list = [] # to store the area under the ROC curve for each model
cm_list = [] # to store the confusion matrix for each model

cf_reports = [] # to store the per-class classification report of each model

# Encoded class ids (0..n_classes-1); passed explicitly so all matrices and reports share one layout
class_ids = list(range(len(label_encoder.classes_)))
y_test_binarized = label_binarize(y_test, classes = class_ids)

# The loop variable is called clf so that the 'model' fitted in Project 1 is not overwritten
for model_name, clf in zip(model_list, model_pipeline):
    start_time = time.perf_counter() # monotonic clock, meant for measuring durations
    clf.fit(X_train, y_train)
    times.append(round(time.perf_counter() - start_time, 4)) # recording the model fitting time
    y_pred = clf.predict(X_test)

    acc_list.append(round(accuracy_score(y_test, y_pred), 3))
    precision_list.append(round(precision_score(y_test, y_pred, average = 'macro'), 3)) # averages the metric across all classes equally.
    recall_list.append(round(recall_score(y_test, y_pred, average = 'macro'), 3))
    f1_list.append(round(f1_score(y_test, y_pred, average = 'macro'), 3))

    # For ROC AUC, use predict_proba if available, otherwise decision_function
    try:
        y_proba = clf.predict_proba(X_test)
    except AttributeError:
        y_proba = clf.decision_function(X_test)

    # Compute the macro-averaged one-vs-rest AUC score for the multiclass problem
    auc_list.append(round(roc_auc_score(y_test_binarized, y_proba, average = 'macro', multi_class = 'ovr'), 3))

    cm_list.append(confusion_matrix(y_test, y_pred, labels = class_ids))

    report_dict = classification_report(y_test, y_pred, labels = class_ids, target_names = label_encoder.classes_, output_dict = True)

    report_df = pd.DataFrame(report_dict).transpose() # converting the dictionary to a DataFrame
    report_df[['precision', 'recall', 'f1-score']] = report_df[['precision', 'recall', 'f1-score']].round(3) # rounding to 3 decimal places
    report_df['Model'] = model_name
    report_df['Class'] = report_df.index
    cf_reports.append(report_df)

cf_reports_df = pd.concat(cf_reports, ignore_index = True) # Concatenating all classification reports into a single DataFrame

cf_reports_df = cf_reports_df[['Model', 'Class', 'precision', 'recall', 'f1-score', 'support']] # columns rearrangement

result_df = pd.DataFrame({
    'Model' : model_list,
    'Accuracy' : acc_list,
    'Precision' : precision_list,
    'Recall' : recall_list,
    'F1 Score' : f1_list,
    'AUC Score' : auc_list,
    'Training Time (s)' : times
})
result_df

In [None]:
# Lift the row limit of the DataFrame display so the complete per-class report table is rendered
pd.options.display.max_rows = None
cf_reports_df

In [None]:
# One confusion-matrix heatmap per model, three per row; the grid grows with the number of models
n_models = len(cm_list)
n_cols = 3
n_rows = max(1, -(-n_models // n_cols)) # ceiling division

# 7.5 inches per row reproduces the (18, 15) figure for the usual two rows
fig, axes = plt.subplots(n_rows, n_cols, figsize = (18, 7.5 * n_rows), squeeze = False)
axes = axes.ravel()

# Original diagnosis names, in the order of their encoded ids
decoded_labels = label_encoder.inverse_transform([i for i in range(len(label_encoder.classes_))])

# model_name (not 'model') so that no fitted estimator variable is overwritten by a string
for ax, cm, model_name in zip(axes, cm_list, model_list):
    # fmt = 'd' prints the counts as plain integers; the default format switches to
    # scientific notation (e.g. 1.2e+02) once a cell reaches three digits
    sns.heatmap(cm, annot = True, fmt = 'd', cmap = 'Blues_r', xticklabels = decoded_labels, yticklabels = decoded_labels, ax = ax)
    ax.set_title(model_name)
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Actual Values')

# Hide grid cells that have no model to show
for unused_ax in axes[n_models:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Patient Data

Enter the WBC value: 6.5

Enter the LYMp value: 20.4

Enter the NEUTp value: 77

Enter the LYMn value: 1.33

Enter the NEUTn value: 5.14

Enter the RBC value: 3.89

Enter the HGB value: 12.4

Enter the HCT value: 30

Enter the MCV value: 77

Enter the MCH value: 31.8

Enter the MCHC value: 41.1

Enter the PLT value: 213

Enter the PDW value: 13

Enter the PCT value: 0.32

https://ramuslab.com/%D1%85%D0%B5%D0%BC%D0%B0%D1%82%D0%BE%D0%BA%D1%80%D0%B8%D1%82-hct/

In [None]:
# every column of the dataset except the target is a value the user has to provide
feature_columns = [col for col in anemia_data.columns if col != 'Diagnosis']

# prompting for each feature in turn and storing the answers as column -> value
patient_values = {col: float(input(f"Enter the {col} value: ")) for col in feature_columns}

# one-row dataframe holding the patient's values
patient_data = pd.DataFrame([patient_values])
patient_data

In [None]:
# Persist the entered patient values to an Excel file in the current working directory.
patient_data.to_excel('patient_data_october.xlsx')

In [None]:
# Apply the scaler fitted on the training data to the patient's values
patient_data_scaled = scaler.transform(patient_data)

# One entry per model: the model's name and its decoded prediction for the patient
predicted_diagnosis = []

for model_name, model in zip(model_list, model_pipeline):
    encoded_prediction = model.predict(patient_data_scaled)
    decoded_prediction = label_encoder.inverse_transform(encoded_prediction) # back to the diagnosis name
    predicted_diagnosis.append({'Model': model_name, 'Prediction': decoded_prediction[0]})

result_df = pd.DataFrame(predicted_diagnosis) # list of dictionaries -> DataFrame

result_df