# Initial Model Evaluation
This notebook tests basic scikit-learn classifiers on each dataset; depending on the results, the next step may be neural-network development.

Imports:

In [158]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

Read processed data

In [159]:
# read the five processed input files, one DataFrame per condition
heart, cancer, diabetes, liver, stroke = map(
    pd.read_csv,
    (
        r'data/input-files/heart.csv',
        r'data/input-files/cancer.csv',
        r'data/input-files/diabetes.csv',
        r'data/input-files/liver.csv',
        r'data/input-files/stroke.csv',
    ),
)

## Correlation Analysis

In [160]:
# one shared LabelEncoder; each correlation cell re-fits it on its own gender column
encoder = LabelEncoder()

# swap the gender labels for integer codes, then show every column's
# correlation with the diagnosis target
heart = heart.assign(gender=encoder.fit_transform(heart["gender"]))
heart.corr()['diagnosis']

activity       -0.008640
age            -0.007247
alcohol         0.007565
bmi             0.019876
diabetes       -0.002389
diagnosis       1.000000
gender         -0.005758
genetic_risk   -0.001833
hypertension   -0.005096
smoking         0.006163
Name: diagnosis, dtype: float64

In [161]:
# swap the gender labels for integer codes, then show every column's
# correlation with the diagnosis target
cancer = cancer.assign(gender=encoder.fit_transform(cancer["gender"]))
cancer.corr()['diagnosis']

activity         -0.150089
age               0.196603
alcohol           0.212772
bmi               0.187560
cancer_history    0.392188
diagnosis         1.000000
gender           -0.250336
genetic_risk      0.141599
smoking           0.226999
Name: diagnosis, dtype: float64

In [162]:
# swap the gender labels for integer codes, then show every column's
# correlation with the diagnosis target
diabetes = diabetes.assign(gender=encoder.fit_transform(diabetes["gender"]))
diabetes.corr()['diagnosis']

age              0.258008
bmi              0.214357
diagnosis        1.000000
gender           0.037411
heart_disease    0.171727
hypertension     0.197823
smoking          0.092998
Name: diagnosis, dtype: float64

In [163]:
# swap the gender labels for integer codes, then show every column's
# correlation with the diagnosis target
liver = liver.assign(gender=encoder.fit_transform(liver["gender"]))
liver.corr()['diagnosis']

activity       -0.116689
age             0.156099
alcohol         0.349610
bmi             0.167655
diabetes        0.107480
diagnosis       1.000000
gender         -0.189558
hypertension    0.170683
genetic_risk    0.118292
smoking         0.200071
Name: diagnosis, dtype: float64

In [164]:
# swap the gender labels for integer codes, then show every column's
# correlation with the diagnosis target
stroke = stroke.assign(gender=encoder.fit_transform(stroke["gender"]))
stroke.corr()['diagnosis']

age              0.242495
bmi              0.011673
diagnosis        1.000000
gender           0.012167
heart_disease    0.138553
hypertension     0.143647
smoking          0.034922
Name: diagnosis, dtype: float64

# Testing all models (Logistic Regression, KNN, Naive Bayes, LDA, QDA, DTC, Random Forest) on each dataset
- 10-fold cross validation is used

**Defining a model evaluation function**

In [165]:
# iterate through classifiers and evaluate
def evaluate_model(dataname, data, n_splits=10, random_state=42):
    """Cross-validate a fixed set of scikit-learn classifiers on one dataset.

    Every classifier is scored with shuffled K-fold cross-validation using the
    same fold definition, and the mean score per classifier is printed from
    best to worst.

    Parameters
    ----------
    dataname : str
        Label used in the printed results header.
    data : pandas.DataFrame
        Frame containing a 'diagnosis' target column. Every other column is
        used as a feature; no encoding is done here, so columns such as
        'gender' are expected to be numeric already.
    n_splits : int, default 10
        Number of cross-validation folds.
    random_state : int or None, default 42
        Seed for the fold shuffle and for the tree-based classifiers. With a
        fixed seed the printed ranking is reproducible across runs; pass None
        to get the previous unseeded behaviour.

    Returns
    -------
    dict
        Classifier name -> mean cross-validated score, ordered best to worst.
    """

    # define inputs and outputs
    X = data.drop(columns=['diagnosis'])
    y = data['diagnosis']

    # define classifiers (the stochastic ones are seeded for reproducibility)
    classifiers = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "KNN": KNeighborsClassifier(),
        "Naive Bayes": GaussianNB(),
        "LDA": LDA(),
        "QDA": QDA(),
        "Decision Tree": DecisionTreeClassifier(random_state=random_state),
        "Random Forest": RandomForestClassifier(random_state=random_state),
    }

    # set up K-fold cross-validation; seeding the shuffle keeps the fold
    # assignment stable between runs
    # NOTE(review): plain KFold does not preserve class ratios per fold;
    # consider StratifiedKFold if the diagnosis classes are imbalanced.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # mean cross-validated score per classifier
    results = {
        name: np.mean(cross_val_score(clf, X, y, cv=kf))
        for name, clf in classifiers.items()
    }

    # sort and print results
    sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
    print(f"\n{dataname} Sorted Results:")
    for name, acc in sorted_results:
        print(f"{name}: {acc:.4f}")

    # hand the scores back so callers can compare or plot them instead of
    # re-reading the printed text
    return dict(sorted_results)

# Evaluate all models on each dataset

In [166]:
# display label -> encoded DataFrame, evaluated in this order
datatypes = dict([
    ('Heart', heart),
    ('Cancer', cancer),
    ('Diabetes', diabetes),
    ('Liver', liver),
    ('Stroke', stroke),
])
for name in datatypes:
    data = datatypes[name]
    evaluate_model(name, data)


Heart Sorted Results:
QDA: 0.7968
Logistic Regression: 0.7968
Naive Bayes: 0.7968
LDA: 0.7968
Random Forest: 0.7813
KNN: 0.7641
Decision Tree: 0.6557

Cancer Sorted Results:
Random Forest: 0.8633
Logistic Regression: 0.8267
LDA: 0.8253
QDA: 0.8073
Decision Tree: 0.8007
Naive Bayes: 0.7827
KNN: 0.6713

Diabetes Sorted Results:
Logistic Regression: 0.9148
LDA: 0.9094
KNN: 0.9061
Random Forest: 0.8924
Decision Tree: 0.8786
Naive Bayes: 0.8686
QDA: 0.8665

Liver Sorted Results:
Logistic Regression: 0.7529
Random Forest: 0.7529
LDA: 0.7512
QDA: 0.7400
Naive Bayes: 0.7300
Decision Tree: 0.7035
KNN: 0.5859

Stroke Sorted Results:
Logistic Regression: 0.9475
KNN: 0.9437
LDA: 0.9413
Random Forest: 0.9329
Decision Tree: 0.9060
QDA: 0.8964
Naive Bayes: 0.8862


## Saving models for each of the datasets
- The entire dataset was used, as the models will now be embedded into a consumer-ready app
- Scalers were not used as they did not improve accuracy

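Re-read the CSVs so that model fitting starts from the raw data: the correlation cells above overwrote the `gender` column of each DataFrame in place.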

In [175]:
# read the five input files again, replacing the frames that earlier cells modified
heart, cancer, diabetes, liver, stroke = map(
    pd.read_csv,
    (
        r'data/input-files/heart.csv',
        r'data/input-files/cancer.csv',
        r'data/input-files/diabetes.csv',
        r'data/input-files/liver.csv',
        r'data/input-files/stroke.csv',
    ),
)

Define a function that fits and dumps a model

In [176]:
def fit_dump_model(name, model, data):
    """Fit ``model`` on all rows of ``data`` and pickle it to models/{name}_model.pkl.

    Parameters
    ----------
    name : str
        Dataset key used in the output file name and the confirmation message.
    model : estimator
        Unfitted estimator exposing ``fit(X, y)``; it is fitted in place.
    data : pandas.DataFrame
        Frame with a 'gender' column and a 'diagnosis' target column. It is
        not modified: encoding happens on a copy.

    Returns
    -------
    The fitted ``model`` (the same object that was passed in).
    """

    # work on a copy so the caller's frame keeps its raw 'gender' values and
    # re-running the cell (or a later cell) never sees half-processed data
    frame = data.copy()

    # encode gender
    # NOTE(review): the fitted encoder is not saved with the model, so whatever
    # consumes the pickle must reproduce the same label -> code mapping.
    encoder = LabelEncoder()
    frame["gender"] = encoder.fit_transform(frame["gender"])

    # select inputs and target
    X = frame.drop(columns=['diagnosis'])
    y = frame['diagnosis']

    # fit model
    model.fit(X, y)

    # dump model, creating the output directory on first use
    model_path = Path(f"models/{name}_model.pkl")
    model_path.parent.mkdir(parents=True, exist_ok=True)
    with open(model_path, "wb") as file:
        pickle.dump(model, file)

    # confirm dumping
    print(f"{name} model trained and dumped")

    return model

Train and dump models

In [177]:
# (dataset key, estimator, training frame) for every model to persist:
# logistic regression for heart, diabetes and stroke; random forest for
# cancer and liver. The forests get a fixed seed so that re-running this cell
# reproduces the same saved model.
training_plan = [
    ("heart", LogisticRegression(max_iter=5000), heart),
    ("cancer", RandomForestClassifier(random_state=42), cancer),
    ("diabetes", LogisticRegression(max_iter=5000), diabetes),
    ("liver", RandomForestClassifier(random_state=42), liver),
    ("stroke", LogisticRegression(max_iter=5000), stroke),
]

# fit each estimator on its full dataset and write it to models/
for dataset_key, model, frame in training_plan:
    fit_dump_model(dataset_key, model, frame)

heart model trained and dumped
cancer model trained and dumped
diabetes model trained and dumped
liver model trained and dumped
stroke model trained and dumped


Load a saved model and sanity-check it on its own training data (this is in-sample accuracy, not a held-out estimate)

In [None]:
# load a model
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook
with open("models/heart_model.pkl", "rb") as file:
    loaded_model = pickle.load(file)

# encode gender with a fresh LabelEncoder, the same way fit_dump_model does
encoder = LabelEncoder()
heart["gender"] = encoder.fit_transform(heart["gender"])

# define inputs and outputs
X = heart.drop(columns=['diagnosis'])
y = heart['diagnosis']

# make prediction (on the same rows the model was fitted on)
prediction = loaded_model.predict(X)

# print accuracy with four decimals, matching the evaluation output above
accuracy = accuracy_score(y, prediction)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7968
