In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.utils import resample
import numpy as np

In [2]:
data = pd.read_csv("framingham.csv")

In [3]:
# Handling missing values using imputation strategies
mean_imputer = SimpleImputer(strategy='mean')
mean_cols = ['heartRate']
data[mean_cols] = mean_imputer.fit_transform(data[mean_cols])

median_imputer = SimpleImputer(strategy='median')
median_cols = ['education', 'cigsPerDay', 'BPMeds', 'totChol', 'BMI', 'glucose']
data[median_cols] = median_imputer.fit_transform(data[median_cols])

In [4]:
def handle_outliers_with_IQR(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

continuous_cols = ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
for col in continuous_cols:
    handle_outliers_with_IQR(data, col)

In [5]:
target_column = 'TenYearCHD'
data_majority = data[data[target_column] == 0]
data_minority = data[data[target_column] == 1]
data_minority_upsampled = resample(data_minority, replace=True, n_samples=len(data_majority), random_state=123)
data_upsampled = pd.concat([data_majority, data_minority_upsampled])

In [6]:
X = data_upsampled.drop(target_column, axis=1)
y = data_upsampled[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
def print_confusion_matrix(y_test, y_pred):

    cm = confusion_matrix(y_test, y_pred)

    num_classes = len(cm)

    # Print confusion matrix header
    print("\nConfusion Matrix:")
    print("True Labels ->")
    print("Predicted Labels v")
    print(f"{'':<10}", end="")
    for i in range(num_classes):
        print(f"{i:<10}", end="")
    print()

    # Print confusion matrix contents
    for i in range(num_classes):
        print(f"{i:<10}", end="")
        for j in range(num_classes):
            print(f"{cm[i][j]:<10}", end="")
        print()

In [8]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def Metrics(y_test, y_pred, y_pred_proba):

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    # Create dictionary to store metrics
    metrics_dict = {
        'Accuracy': round(accuracy*100, 4),
        'Precision': round(precision*100, 4),
        'Recall': round(recall*100, 4),
        'F1': round(f1*100, 4),
        'AUC': round(auc*100, 4)
    }

    # Print the metrics dictionary neatly
    print("\nClassification Metrics:")
    for metric, value in metrics_dict.items():
        print(f"{metric} Score: [{value}] %")

    return metrics_dict

In [9]:
# Initialize the XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Train the model
xgb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_classifier.predict(X_test)
y_prob = xgb_classifier.predict_proba(X_test)[:,1]

# Evaluate the model
print_confusion_matrix(y_test, y_pred)
Metrics(y_test,y_pred ,y_prob)


Confusion Matrix:
True Labels ->
Predicted Labels v
          0         1         
0         657       96        
1         11        674       

Classification Metrics:
Accuracy Score: [92.5591] %
Precision Score: [87.5325] %
Recall Score: [98.3942] %
F1 Score: [92.646] %
AUC Score: [97.9343] %


{'Accuracy': 92.5591,
 'Precision': 87.5325,
 'Recall': 98.3942,
 'F1': 92.646,
 'AUC': 97.9343}

In [10]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:,1]

# Evaluate the model
print_confusion_matrix(y_test, y_pred)
Metrics(y_test,y_pred ,y_prob)


Confusion Matrix:
True Labels ->
Predicted Labels v
          0         1         
0         744       9         
1         8         677       

Classification Metrics:
Accuracy Score: [98.8178] %
Precision Score: [98.688] %
Recall Score: [98.8321] %
F1 Score: [98.76] %
AUC Score: [99.569] %


{'Accuracy': 98.8178,
 'Precision': 98.688,
 'Recall': 98.8321,
 'F1': 98.76,
 'AUC': 99.569}

In [11]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:,1]

# Evaluate the model
print_confusion_matrix(y_test, y_pred)
Metrics(y_test,y_pred ,y_prob)


Confusion Matrix:
True Labels ->
Predicted Labels v
          0         1         
0         719       34        
1         7         678       

Classification Metrics:
Accuracy Score: [97.1488] %
Precision Score: [95.2247] %
Recall Score: [98.9781] %
F1 Score: [97.0651] %
AUC Score: [99.4743] %


{'Accuracy': 97.1488,
 'Precision': 95.2247,
 'Recall': 98.9781,
 'F1': 97.0651,
 'AUC': 99.4743}

In [12]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:,1]

# Evaluate the model
print_confusion_matrix(y_test, y_pred)
Metrics(y_test,y_pred ,y_prob)


Confusion Matrix:
True Labels ->
Predicted Labels v
          0         1         
0         486       267       
1         189       496       

Classification Metrics:
Accuracy Score: [68.2893] %
Precision Score: [65.0066] %
Recall Score: [72.4088] %
F1 Score: [68.5083] %
AUC Score: [73.5367] %


{'Accuracy': 68.2893,
 'Precision': 65.0066,
 'Recall': 72.4088,
 'F1': 68.5083,
 'AUC': 73.5367}

In [13]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:,1]

# Evaluate the model
print_confusion_matrix(y_test, y_pred)
Metrics(y_test,y_pred ,y_prob)


Confusion Matrix:
True Labels ->
Predicted Labels v
          0         1         
0         459       294       
1         232       453       

Classification Metrics:
Accuracy Score: [63.4214] %
Precision Score: [60.6426] %
Recall Score: [66.1314] %
F1 Score: [63.2682] %
AUC Score: [69.9803] %


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'Accuracy': 63.4214,
 'Precision': 60.6426,
 'Recall': 66.1314,
 'F1': 63.2682,
 'AUC': 69.9803}