# Machine Learning Analysis for Early Detection of Heart Failure

## Import data

In [1]:
"""
Data Preprocessing Script
Author: deng.wei
Date: 3.27
Function: Locate and load the heart disease dataset and report its shape (the preprocessing itself is done in the next cell).
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import os
from IPython.display import display

# Build the dataset path relative to the kernel's current working directory
# and echo that directory so a wrong launch location is easy to spot.
data_file = os.path.join(os.getcwd(), '..', 'data', 'heart.csv')
print(os.getcwd())

# Load the CSV when it is present; otherwise only report the missing path.
if os.path.exists(data_file):
    try:
        data = pd.read_csv(data_file)
        print(f"Data loaded successfully, shape: {data.shape}")  # Expected shape is (918, 12)
    except Exception as load_error:
        # Any read/parse failure is displayed rather than raised.
        display(f"Error loading data: {load_error}")
else:
    display(f"Data file not found: {data_file}")


c:\Users\14810\Desktop\DSML\DSML-main\feature_implementation\.ipynb_checkpoints


'Data file not found: c:\\Users\\14810\\Desktop\\DSML\\DSML-main\\feature_implementation\\.ipynb_checkpoints\\..\\data\\heart.csv'

---
## Preprocessing

In [2]:
"""
Data Preprocessing Script
Author: deng.wei
Date: 3.27
Function: Preprocessing the heart disease dataset
"""

import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import os
from IPython.display import display

# Go up one directory to access the dataset
data_file = os.path.join(os.getcwd(), '..', 'data', 'heart.csv')

# Check if the data file exists
if not os.path.exists(data_file):
    display(f"Data file not found: {data_file}")
else:
    try:
        data = pd.read_csv(data_file)
    except Exception as e:
        display(f"Error loading data: {e}")
    else:
        # Preprocessing runs only after a successful load, so a failed read can
        # no longer fall through and operate on an undefined or stale `data`.
        print(f"Data loaded successfully, shape: {data.shape}")  # Expected shape is (918, 12)

        # Handling outliers
        # deng.wei: Blood pressure and cholesterol should not be zero.
        # The replacement median is taken over the non-zero readings only, so
        # the invalid zeros do not pull the fill value down.
        # NOTE: the median is computed on the full dataset, before the split.
        bp_zero_count = (data['RestingBP'] == 0).sum()
        chol_zero_count = (data['Cholesterol'] == 0).sum()
        bp_median = data.loc[data['RestingBP'] != 0, 'RestingBP'].median()
        chol_median = data.loc[data['Cholesterol'] != 0, 'Cholesterol'].median()
        data['RestingBP'] = data['RestingBP'].replace(0, bp_median)
        data['Cholesterol'] = data['Cholesterol'].replace(0, chol_median)
        print("Outlier handling completed")
        print(f"Outlier handling: Replaced {bp_zero_count} zero values in blood pressure, {chol_zero_count} zero values in cholesterol")

        # yue.yao: Copy global data for EDA analysis
        # (taken before label encoding, so categorical columns keep their original values)
        eda_data = data.copy()

        # Category encoding: each listed column gets its own LabelEncoder
        categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

        for col in categorical_cols:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])
        print("Categorical feature encoding completed")

        # Splitting dataset
        X = data.drop('HeartDisease', axis=1)
        y = data['HeartDisease']

        # First, split into training and temporary sets (80% for training, 20% for temporary)
        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y,
            test_size=0.2,
            random_state=42,
            stratify=y
        )

        # Then, split the temporary set into validation and test sets (each 10% of the original data)
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp,
            test_size=0.5,
            random_state=42,
            stratify=y_temp
        )

        # Work on explicit copies so the column assignments below never write
        # into (or warn about) a slice of the original frame.
        X_train, X_val, X_test = X_train.copy(), X_val.copy(), X_test.copy()

        print("Dataset split completed:")
        print(f"Training set {X_train.shape} ({len(X_train)/len(X):.1%})")
        print(f"Validation set {X_val.shape} ({len(X_val)/len(X):.1%})")
        print(f"Test set {X_test.shape} ({len(X_test)/len(X):.1%})")

        # Standardizing numerical features
        numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
        scaler = StandardScaler()

        # The scaler is fitted on the training set only ...
        X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])

        # ... and the validation and test sets reuse the training set's mean and standard deviation
        X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
        X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

        print("Numerical feature standardization completed")


'Data file not found: c:\\Users\\14810\\Desktop\\DSML\\DSML-main\\feature_implementation\\.ipynb_checkpoints\\..\\data\\heart.csv'

## Dataset Persistence

In [3]:
"""
Dataset Persistence
Author: deng.wei
Date: 4.15
"""
# Bundle the features with their labels for the training and validation exports
train_full = X_train.assign(HeartDisease=y_train)
val_full = X_val.assign(HeartDisease=y_val)

# The test export carries the features only; the result column is not included
test_full = X_test.copy()

# Make sure the sibling 'report' folder exists
report_dir = os.path.join(os.getcwd(), '..', 'report')
os.makedirs(report_dir, exist_ok=True)

# Output file locations inside the report folder
train_path = os.path.join(report_dir, "train_set_snapshot_.csv")
val_path = os.path.join(report_dir, "val_set_snapshot_.csv")
test_path = os.path.join(report_dir, "test_set_snapshot_.csv")

# Write every split as CSV without the index column
train_full.to_csv(train_path, index=False)
val_full.to_csv(val_path, index=False)
test_full.to_csv(test_path, index=False)

# Report where the snapshots were written
print("Dataset snapshots saved")
print("Training set:", train_path)
print("Validation set:", val_path)
print("Test set:", test_path)


NameError: name 'X_train' is not defined

---
## Exploratory Data Analysis (EDA)

In [None]:
"""
Exploratory Data Analysis (EDA)
Author: yue.yao
Date: 4.6
Function: Perform exploratory analysis on the heart disease dataset and visualize the data
"""
import matplotlib.pyplot as plt
import seaborn as sns

# Global overview of the data: summary statistics for every column
eda_data.describe(include = 'all')

In [None]:
# Check for missing values: number of nulls per column
eda_data.isnull().sum()

In [None]:
# Check for duplicated rows: number of rows that repeat an earlier row
eda_data.duplicated().sum()

In [None]:
# Overview of the non-numeric data: value counts of every object-dtype column
for column_name in eda_data.columns:
    column = eda_data[column_name]
    if column.dtype != 'object':
        continue
    print(f'column : {column_name}')
    print(column.value_counts())
    print('----------------------------')

In [None]:
# Correlation analysis restricted to the numeric columns
num_cor = eda_data.corr(numeric_only=True)
num_cor

In [None]:
# Plot the correlation matrix as an annotated heatmap
plt.figure(figsize=(8,6))
sns.heatmap(num_cor, annot=True, cmap='coolwarm')

> 结论：  
与HeartDisease相关性较高的数据是Oldpeak，FastingBS，Age  
> Conclusion:  
The data with high correlation with HeartDisease are Oldpeak, FastingBS, and Age

In [None]:
# Compare the Oldpeak summary statistics between the HeartDisease groups
print(eda_data.groupby('HeartDisease')['Oldpeak'].describe())

In [None]:
# Box plot comparing the Oldpeak distribution per HeartDisease group
sns.boxplot(x='HeartDisease', y='Oldpeak', data=eda_data, hue='HeartDisease')

> 结论：  
心力衰竭患者的Oldpeak指数通常比非患者要高，数值大多分布在1.5及以上  
Conclusion:  
Oldpeak is generally higher in patients with heart failure than in non-patients, with most patient values at 1.5 or above.

In [None]:
# Age distribution of patients and non-patients (histogram with KDE, split by HeartDisease)
sns.histplot(
    data=eda_data, 
    x='Age', 
    hue='HeartDisease',
    bins=30, 
    kde=True, 
    alpha=0.7
)
plt.title("Age Distribution by Heart Disease Status")
plt.show()

In [None]:
# Box plot comparing the Age distribution per HeartDisease group
sns.boxplot(x='HeartDisease', y='Age', data=eda_data, hue='HeartDisease')

In [None]:
# Oldpeak as a function of age.
# NOTE: the original comment described this as "resting blood pressure vs. age",
# but the plotted y-variable is Oldpeak.
sns.lineplot(x='Age', y='Oldpeak', data=eda_data)
plt.show()

> 结论：  
Oldpeak数值随年龄增大而增加。心力衰竭患者数量随年龄上升而增加，大多患者年龄分布在60岁左右。  
Conclusion:  
Oldpeak value increases with age. The number of heart failure patients increases with age, and most patients are around 60 years old.

In [None]:
# Box plot comparing the FastingBS distribution per HeartDisease group
sns.boxplot(x='HeartDisease', y='FastingBS', data=eda_data, hue='HeartDisease')

> 结论：  
心力衰竭患者普遍能检测到FastingBS数值分布在0-1之间，而非患者普遍数值为0。FastingBS数值与患病情况强相关。  
Conclusion:  
Patients with heart failure generally had FastingBS values ranging from 0 to 1, while non-patients generally had values of 0. FastingBS values were strongly correlated with disease status.

In [None]:
# Heart-disease counts for each non-numeric feature, one panel per feature in a 2x3 grid
categoricalfeatures = ["Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"]
plt.figure(figsize=(12, 6))
for panel_number, feature in enumerate(categoricalfeatures, start=1):
    ax = plt.subplot(2, 3, panel_number)
    sns.countplot(x=eda_data[feature], hue=eda_data['HeartDisease'])
plt.tight_layout()
plt.show()

> 结论：  
1.心力衰竭患者在男性中占比更高，在女性中占比低。  
2.在几种胸痛类型中，患者表现出无症状的数量最多。  
3.RestingECG在患者和非患者间表现出数值相差不明显。  
4.在患者中，运动诱发性心绞痛占比例较高。  
5.根据ST_Slope数值显示，患者大多ST曲线表现为水平型。  
Conclusion:  
1.Heart failure is more common in men and less common in women.  
2.Of the several types of chest pain, the largest number of patients showed no symptoms.  
3.RestingECG showed no significant difference in values between patients and non-patients.  
4.Exercise-induced angina pectoris accounted for a high proportion of patients.  
5.According to the value of ST_Slope, the ST curve of most patients is horizontal.

In [None]:
# Scatter-plot matrix of the features, coloured by HeartDisease.
# pairplot already adds a figure-level legend when `hue` is given, so the extra
# plt.legend() call (which only targets the current, i.e. last, subplot) is dropped.
sns.pairplot(eda_data, hue='HeartDisease')
plt.show()

---
## Model Analysis

### Public functions (making the code more elegant)

In [None]:
"""
Author: deng.wei
Date: 4.15
Function: Plot confusion-matrix images through one shared helper
"""
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(cm, title="Confusion Matrix", labels=None):
    """
    Plot a confusion matrix as an annotated heatmap and show the figure.

    Parameters:
    cm : ndarray
        Confusion matrix to be plotted. Cells are annotated with the integer
        format 'd', so the matrix is expected to hold integer counts.
    title : str
        Title of the plot.
    labels : list, optional
        Tick labels used on both axes. Defaults to ["0", "1"] when omitted.
    """
    # A None sentinel replaces the former mutable list default, so the default
    # labels can never be shared (or accidentally modified) across calls.
    if labels is None:
        labels = ["0", "1"]

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=labels, yticklabels=labels)
    plt.title(title)
    # Columns of the heatmap are labelled as predictions, rows as true labels
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

In [None]:
"""
Author: deng.wei
Date: 4.15
Function: To generate evaluation indicators by elegant way(WITHOUT validation_set)
"""
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score


def evaluate_model_basic(y_train, y_train_pred, y_test, y_test_pred):
    """
    Print accuracy, precision, F1 score and recall for the training and test sets.

    Parameters:
    y_train, y_train_pred : true and predicted labels of the training set.
    y_test, y_test_pred   : true and predicted labels of the test set.

    Nothing is returned; every metric is printed with four decimals.
    """
    # Label text (already padded) paired with the metric function it reports
    metric_rows = (
        ("Accuracy     ", accuracy_score),
        ("Precision    ", precision_score),
        ("F1 Score     ", f1_score),
        ("Recall       ", recall_score),
    )
    # Section header paired with the labels / predictions it covers
    sections = (
        ("Training Set:", y_train, y_train_pred),
        ("\nTest Set:", y_test, y_test_pred),
    )
    for header, y_true, y_pred in sections:
        print(header)
        for label, metric in metric_rows:
            print(f"{label}: {metric(y_true, y_pred):.4f}")


In [None]:
"""
Author: deng.wei
Date: 4.15
Function: To generate evaluation indicators by elegant way(INCLUDING validation_set)
"""
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score

def evaluate_model_outputs(y_train, y_train_pred, y_val, y_val_pred, y_test, y_test_pred):
    """
    Print accuracy, precision, F1 score and recall for the training, validation and test sets.

    Parameters:
    y_train, y_train_pred : true and predicted labels of the training set.
    y_val, y_val_pred     : true and predicted labels of the validation set.
    y_test, y_test_pred   : true and predicted labels of the test set.

    Nothing is returned; every metric is printed with four decimals.
    """
    # Label text (already padded) paired with the metric function it reports
    metric_rows = (
        ("Accuracy     ", accuracy_score),
        ("Precision    ", precision_score),
        ("F1 Score     ", f1_score),
        ("Recall       ", recall_score),
    )
    # Section header paired with the labels / predictions it covers
    sections = (
        ("Training Set:", y_train, y_train_pred),
        ("\nValidation Set:", y_val, y_val_pred),
        ("\nTest Set:", y_test, y_test_pred),
    )
    for header, y_true, y_pred in sections:
        print(header)
        for label, metric in metric_rows:
            print(f"{label}: {metric(y_true, y_pred):.4f}")


### 1.KNN 

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix


# Baseline KNN (k=5) fitted on the training split
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict on the training and test splits, then print the shared metrics
y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)
evaluate_model_basic(y_train, y_train_pred, y_test, y_test_pred)

# 5-fold cross-validated accuracy of the baseline model on the training split
cv_scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
print("\nCross-Validation (5-Fold):")
print(f"CV Scores    : {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")

# Mean 5-fold CV accuracy for every candidate k from 1 to 20
k_range = range(1, 21)
cv_results = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=candidate_k),
        X_train, y_train, cv=5, scoring='accuracy',
    ).mean()
    for candidate_k in k_range
]

# The first k that reaches the highest mean accuracy wins
best_k = k_range[cv_results.index(max(cv_results))]
print(f"\nBest K value: {best_k}")

# Refit with the selected k and report its accuracy on the test split
knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(X_train, y_train)
y_test_pred_best = knn_best.predict(X_test)
print(f"Test set accuracy after tuning: {accuracy_score(y_test, y_test_pred_best):.4f}")

# Confusion matrices of the baseline (k=5) predictions
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)
plot_confusion_matrix(cm_train, title="Confusion Matrix for Training Set")
plot_confusion_matrix(cm_test, title="Confusion Matrix for Test Set")


### 2.Decision Tree

In [None]:
"""
Decision Tree Model
Author: deng.wei
Date: 4.14
Function: Decision Tree model implementation and evaluation
"""

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, confusion_matrix

# Decision tree with a fixed seed, depth capped at 5 and minimum split / leaf sizes
dt = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
).fit(X_train, y_train)

# Predictions for the training and test splits
y_train_pred, y_test_pred = (dt.predict(features) for features in (X_train, X_test))

# Shared metric report (no validation set)
evaluate_model_basic(y_train, y_train_pred, y_test, y_test_pred)

# Confusion matrices for both splits
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)
plot_confusion_matrix(cm_train, title="Confusion Matrix for Training Set")
plot_confusion_matrix(cm_test, title="Confusion Matrix for Test Set")


### 3.Random Forest

In [None]:
"""
Random Forest Model
Author: deng.wei
Date: 4.13
Function: Random Forest model implementation and evaluation
"""

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Random forest: 100 trees, fixed seed, depth capped at 8, minimum split / leaf sizes
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    min_samples_split=6,
    min_samples_leaf=3,
    random_state=42,
).fit(X_train, y_train)

# Predictions for the training, validation and test splits
y_train_pred, y_val_pred, y_test_pred = (
    rf.predict(features) for features in (X_train, X_val, X_test)
)

# Shared metric report covering train + validation + test
evaluate_model_outputs(y_train, y_train_pred, y_val, y_val_pred, y_test, y_test_pred)

# 10-fold cross-validated accuracy on the training split
cv_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring='accuracy')
print("\nCross-Validation (10-Fold):")
print(f"CV Scores    : {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")

# Confusion matrices for the training and test splits
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)
plot_confusion_matrix(cm_train, title="Confusion Matrix for Training Set")
plot_confusion_matrix(cm_test, title="Confusion Matrix for Test Set")


### 4.Logistic Regression

In [None]:
"""
Logistic Regression Model
Author: deng.wei
Date: 4.14
Function: Logistic Regression model implementation and evaluation
"""

from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed, fitted on the training split
log_reg = LogisticRegression(random_state=42).fit(X_train, y_train)

# Predictions for the training and test splits
y_train_pred, y_test_pred = (log_reg.predict(features) for features in (X_train, X_test))

# Shared metric report (no validation set)
evaluate_model_basic(y_train, y_train_pred, y_test, y_test_pred)

# Confusion matrices for both splits
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)
plot_confusion_matrix(cm_train, title="Confusion Matrix for Training Set")
plot_confusion_matrix(cm_test, title="Confusion Matrix for Test Set")


### 5.SVM

In [None]:
"""
SVM Model  
Author: deng.wei  
Date: 4.14  
Function: Implementation and evaluation of the SVM model
"""

from sklearn.svm import SVC

# Support vector classifier with a fixed seed, fitted on the training split
svm = SVC(random_state=42).fit(X_train, y_train)

# Predictions for the training and test splits
y_train_pred, y_test_pred = (svm.predict(features) for features in (X_train, X_test))

# Shared metric report (no validation set)
evaluate_model_basic(y_train, y_train_pred, y_test, y_test_pred)

# Confusion matrices for both splits
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)
plot_confusion_matrix(cm_train, title="Confusion Matrix for Training Set")
plot_confusion_matrix(cm_test, title="Confusion Matrix for Test Set")


### 6.Naive Bayes

In [None]:
"""
Naive Bayes Model  
Author: deng.wei  
Date: 4.14  
Function: Implementation and evaluation of the Naive Bayes model
"""

from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the training split
nb = GaussianNB().fit(X_train, y_train)

# Predictions for the training and test splits
y_train_pred, y_test_pred = (nb.predict(features) for features in (X_train, X_test))

# Shared metric report (no validation set)
evaluate_model_basic(y_train, y_train_pred, y_test, y_test_pred)

# Confusion matrices for both splits
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)
plot_confusion_matrix(cm_train, title="Confusion Matrix for Training Set")
plot_confusion_matrix(cm_test, title="Confusion Matrix for Test Set")


### 7.Neural Network

In [None]:
"""
Neural Network Model  
Author: deng.wei  
Date: 4.14  
Function: Implementation and evaluation of a neural network model
"""

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler


# Standardize every feature for the MLP, learning the scaling from the training
# split only so that no test-set statistics leak into the fitted scaler.
# Dedicated names (nn_scaler, X_train_nn, X_test_nn) are used on purpose: the
# shared X_train / X_test / y_train / y_test splits and the preprocessing
# `scaler` are left untouched, so the other model cells keep seeing the same data.
nn_scaler = StandardScaler()
X_train_nn = nn_scaler.fit_transform(X_train)
X_test_nn = nn_scaler.transform(X_test)

# Create a neural network model with regularization and early stopping
mlp = MLPClassifier(
    max_iter=1000,
    learning_rate_init=0.001,
    alpha=0.001,            # L2 regularization
    early_stopping=True,    # Enable early stopping
    random_state=42
)
mlp.fit(X_train_nn, y_train)

# Predictions for the training and test splits
y_train_pred = mlp.predict(X_train_nn)
y_test_pred = mlp.predict(X_test_nn)

# Shared metric report (no validation set)
evaluate_model_basic(y_train, y_train_pred, y_test, y_test_pred)

# 5-fold cross-validation on the scaled training split (default scorer of the estimator)
cross_val_scores = cross_val_score(mlp, X_train_nn, y_train, cv=5)
print(f"\nCross-validation accuracy scores: {cross_val_scores}")
print(f"Mean cross-validation accuracy: {cross_val_scores.mean():.4f}")

# Confusion matrices for both splits
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)
plot_confusion_matrix(cm_train, title="Confusion Matrix for Training Set")
plot_confusion_matrix(cm_test, title="Confusion Matrix for Test Set")


### 8.Gradient Descent 

In [None]:
"""
Gradient Descent Model  
Author: deng.wei  
Date: 4.15  
Function: Implementation and evaluation of a gradient-based model
"""

from sklearn.linear_model import SGDClassifier


# Linear classifier trained by stochastic gradient descent with the log loss, fixed seed
sgd = SGDClassifier(loss='log_loss', random_state=42).fit(X_train, y_train)

# Predictions for the training and test splits
y_train_pred, y_test_pred = (sgd.predict(features) for features in (X_train, X_test))

# Shared metric report (no validation set)
evaluate_model_basic(y_train, y_train_pred, y_test, y_test_pred)

# Confusion matrices for both splits
cm_train = confusion_matrix(y_train, y_train_pred)
cm_test = confusion_matrix(y_test, y_test_pred)
plot_confusion_matrix(cm_train, title="Confusion Matrix for Training Set")
plot_confusion_matrix(cm_test, title="Confusion Matrix for Test Set")


---
## Model Evaluation

In [None]:
# 评估模型

## Predict result

In [None]:
# After selecting a suitable model, predict the results (placeholder cell: no code yet)