In [61]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

# Step 1: Data pre-processing phase
def preprocess_data(data):
    
    # Categorical to Numeric
    data = pd.get_dummies(data, columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='int')
    
    columns_to_process = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
    
    for column in columns_to_process:
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        
        # Apply outlier removal only to the selected column
        data = data.loc[(data[column] >= (Q1 - 1.5 * IQR)) & (data[column] <= (Q3 + 1.5 * IQR))]
    return data

def normalize_data(org_df,col):
    col_array = np.array(org_df[col]).reshape(-1, 1)
    scaler = preprocessing.StandardScaler()
    scaler.fit(col_array)
    org_df[col] = scaler.transform(col_array)
    return org_df

# Load data
file_path = '/Users/zhangxijing/MasterNEU/INFO6105DataScienceEngineeringMethodsandTools/Dataset/Heart_Failure.csv'  # Update this path to your new dataset
data = pd.read_csv(file_path)

data = preprocess_data(data)

data = normalize_data(data,'Age')
data = normalize_data(data,'RestingBP')
data = normalize_data(data,'Cholesterol')
data = normalize_data(data,'MaxHR')
print(data)

          Age  RestingBP  Cholesterol  FastingBS     MaxHR  Oldpeak  \
0   -1.334331   0.548422     0.984547          0  1.290461      0.0   
1   -0.390470   1.845686    -1.182352          0  0.633456      1.0   
2   -1.648952  -0.100209     0.865268          0 -1.748185      0.0   
3   -0.495344   0.418696    -0.506439          0 -1.337557      1.5   
4    0.133897   1.197054    -0.884155          0 -0.762678      0.0   
..        ...        ...          ...        ...       ...      ...   
913 -0.809964  -1.397473     0.487552          0 -0.352050      1.2   
914  1.602125   0.807875    -0.923915          1  0.017515      3.4   
915  0.448517  -0.100209    -2.156463          0 -1.050118      1.2   
916  0.448517  -0.100209    -0.069083          0  1.372586      0.0   
917 -1.544078   0.418696    -1.281751          0  1.331524      0.0   

     HeartDisease  Sex_F  Sex_M  ChestPainType_ASY  ...  ChestPainType_NAP  \
0               0      0      1                  0  ...              

In [62]:
# Step 3: Feature Extraction
from sklearn.decomposition import PCA

# Define features (X) and target (y)
X = data.drop('HeartDisease', axis=1)  # Exclude the outcome column
y = data['HeartDisease']

# Split data into training and test sets (considering 20% for test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Standardize the features (important for PCA)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA to create 3 new components from existing features
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Transfer training and test data to the new dimensions (PCs)
X_train_transformed = pd.DataFrame(X_train_pca, columns=['PC1', 'PC2', 'PC3'])
X_test_transformed = pd.DataFrame(X_test_pca, columns=['PC1', 'PC2', 'PC3'])

# Append the outcome column back to the transformed data
X_train_transformed['outcome'] = y_train.reset_index(drop=True)
X_test_transformed['outcome'] = y_test.reset_index(drop=True)

print("Training data after PCA:")
print(X_train_transformed.head())

print("Testing data after PCA:")
print(X_test_transformed.head())

Training data after PCA:
        PC1       PC2       PC3  outcome
0  2.558396 -1.349264 -1.341968        1
1 -1.551547 -1.506064  0.450147        0
2  3.624999 -1.041258  0.422356        1
3  1.934112  0.358389  0.422603        1
4  2.672908  3.006461 -1.672042        1
Testing data after PCA:
        PC1       PC2       PC3  outcome
0  2.920527 -0.964144 -1.484184        1
1  2.742748 -1.210056 -1.164814        1
2  1.789398 -1.858782 -1.518943        1
3 -1.804309  0.867948  1.881906        0
4  0.831860 -1.225375 -0.314270        1


In [63]:
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score
import numpy as np

# Step 4: Define base classifiers and meta learner
base_classifiers = {
    'Naive Bayes': GaussianNB(),
    'Neural Network': MLPClassifier(max_iter=1000),  # You can adjust max_iter as needed
    'KNN': KNeighborsClassifier()
}

meta_learner = DecisionTreeClassifier()

print("Step 4: Training base classifiers and meta learner using 5-fold cross validation...")

start_time = time.time()  # Record start time

# Step 4: Train base classifiers and meta learner using 5-fold cross validation
meta_features_train = []
for name, clf in base_classifiers.items():
    print(f"Training {name}...")
    
    # Fit the base classifier on the entire training data
    clf.fit(X_train_transformed.drop('outcome', axis=1), X_train_transformed['outcome'])
    
    # Generate predictions from the base classifier using cross-validation
    predictions = cross_val_predict(clf, X_train_transformed.drop('outcome', axis=1), X_train_transformed['outcome'], cv=5)
    meta_features_train.append(predictions)

# Stack predictions horizontally to form meta features
meta_features_train = np.array(meta_features_train).T

print("Training meta learner (Decision Tree) on meta features...")
# Train the meta learner (decision tree) on the meta features
meta_learner.fit(meta_features_train, X_train_transformed['outcome'])

print("Step 4: Finding best hyperparameters using GridSearchCV...")

# Step 4: Find best hyperparameters using GridSearchCV for Neural Network
param_grid_nn = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'alpha': [0.0001, 0.001, 0.01]
}

grid_search_nn = GridSearchCV(base_classifiers['Neural Network'], param_grid=param_grid_nn, cv=5)
grid_search_nn.fit(X_train_transformed.drop('outcome', axis=1), X_train_transformed['outcome'])

best_nn_classifier = grid_search_nn.best_estimator_

print("Step 4: Evaluating model on test data...")

# Step 4: Evaluate model on test data
meta_features_test = []
for name, clf in base_classifiers.items():
    print(f"Generating predictions using {name} on test data...")
    
    # Generate predictions from the base classifier on test data
    predictions_test = clf.predict(X_test_transformed.drop('outcome', axis=1))
    meta_features_test.append(predictions_test)

# Stack predictions horizontally for test data
meta_features_test = np.array(meta_features_test).T

# Use the trained meta learner to make predictions on test data
final_predictions = meta_learner.predict(meta_features_test)

# Calculate accuracy of the final model on test data
accuracy = accuracy_score(X_test_transformed['outcome'], final_predictions)
print(f"Accuracy of the Super Learner on Test Data: {accuracy:.4f}")

end_time = time.time()  # Record end time
calculation_time = end_time - start_time
print(f"Total Calculation Time: {calculation_time:.2f} seconds")

Step 4: Training base classifiers and meta learner using 5-fold cross validation...
Training Naive Bayes...
Training Neural Network...
Training KNN...
Training meta learner (Decision Tree) on meta features...
Step 4: Finding best hyperparameters using GridSearchCV...
Step 4: Evaluating model on test data...
Generating predictions using Naive Bayes on test data...
Generating predictions using Neural Network on test data...
Generating predictions using KNN on test data...
Accuracy of the Super Learner on Test Data: 0.8794
Total Calculation Time: 15.26 seconds
