# Initial Model Evaluation
This notebook will contain preliminary testing of basic scikit-learn models, and depending on results will lead to neural network development.

In [79]:
import pandas as pd
heart = pd.read_csv(r'data/input-files/heart.csv')
cancer = pd.read_csv(r'data/input-files/cancer.csv')
diabetes = pd.read_csv(r'data/input-files/diabetes.csv')
liver = pd.read_csv(r'data/input-files/liver.csv')
stroke = pd.read_csv(r'data/input-files/stroke.csv')

In [80]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np

### Logistic Regression

In [81]:
# Print the correlation matrix
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
heart["gender"] = encoder.fit_transform(heart["gender"])

heart.corr()['diagnosis']

activity       -0.008640
age            -0.007247
alcohol         0.007565
bmi             0.019876
diabetes       -0.002389
diagnosis       1.000000
gender         -0.005758
genetic_risk   -0.001833
hypertension   -0.005096
smoking         0.006163
Name: diagnosis, dtype: float64

In [82]:
cancer["gender"] = encoder.fit_transform(cancer["gender"])
cancer.corr()['diagnosis']

activity         -0.150089
age               0.196603
alcohol           0.212772
bmi               0.187560
cancer_history    0.392188
diagnosis         1.000000
gender           -0.250336
genetic_risk      0.141599
smoking           0.226999
Name: diagnosis, dtype: float64

In [83]:
diabetes["gender"] = encoder.fit_transform(diabetes["gender"])
diabetes.corr()['diagnosis']

age              0.258008
bmi              0.214357
diagnosis        1.000000
gender           0.037411
heart_disease    0.171727
hypertension     0.197823
smoking          0.092998
Name: diagnosis, dtype: float64

In [84]:
liver["gender"] = encoder.fit_transform(liver["gender"])
liver.corr()['diagnosis']

activity       -0.116689
age             0.156099
alcohol         0.349610
bmi             0.167655
diabetes        0.107480
diagnosis       1.000000
gender         -0.189558
hypertension    0.170683
genetic_risk    0.118292
smoking         0.200071
Name: diagnosis, dtype: float64

In [85]:
stroke["gender"] = encoder.fit_transform(stroke["gender"])
stroke.corr()['diagnosis']

age              0.242495
bmi              0.011673
diagnosis        1.000000
gender           0.012167
heart_disease    0.138553
hypertension     0.143647
smoking          0.034922
Name: diagnosis, dtype: float64

In [86]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [87]:
# Select features (excluding 'diagnosis') and target
X = stroke.drop(columns=['diagnosis'])
y = stroke['diagnosis']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
model = LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9359


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [88]:
# Select features (excluding 'diagnosis') and target
X = heart.drop(columns=['diagnosis'])
y = heart['diagnosis']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
model = LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8032


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression, KNN, Naive Bayes, LDA, QDA, DTC, Random Forest

In [None]:
# define classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "LDA": LDA(),
    "QDA": QDA(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier()
}

# set up 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True)

# iterate through classifiers and evaluate
def evaluate_model(name, data):
    
results = {}
for name, clf in classifiers.items():
    scores = cross_val_score(clf, X, y, cv=kf) 
    results[name] = np.mean(scores)

# print results sorted by performance
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
print("\nSorted Results:")
for name, acc in sorted_results:
    print(f"{name}: {acc:.4f}")


Logistic Regression: Mean Accuracy = 0.7968
KNN: Mean Accuracy = 0.7631
Naive Bayes: Mean Accuracy = 0.7968
LDA: Mean Accuracy = 0.7968
QDA: Mean Accuracy = 0.7968
Decision Tree: Mean Accuracy = 0.6619
Random Forest: Mean Accuracy = 0.7799

Sorted Results:
Logistic Regression: 0.7968
Naive Bayes: 0.7968
QDA: 0.7968
LDA: 0.7968
Random Forest: 0.7799
KNN: 0.7631
Decision Tree: 0.6619
