In [159]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

In [160]:
# Read the heart dataset from the CSV file and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Data/heart.csv')
df.head(n=5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [161]:
# Names of every object-dtype column; these are re-typed to pandas' "string" dtype.
string_col = df.select_dtypes(include=["object"]).columns
df[string_col] = df.loc[:, string_col].astype("string")
# Show the resulting dtypes to confirm the conversion took effect.
print(df.dtypes)

Age                        int64
Sex               string[python]
ChestPainType     string[python]
RestingBP                  int64
Cholesterol                int64
FastingBS                  int64
RestingECG        string[python]
MaxHR                      int64
ExerciseAngina    string[python]
Oldpeak                  float64
ST_Slope          string[python]
HeartDisease               int64
dtype: object


In [162]:
# Print the frequency of each category for every string-typed column.
# (The former `df[string_col].head()` line was removed: as a non-final expression
# in the cell it displayed nothing and had no effect.)
# NOTE(review): "valeus" is a typo in the emitted text; it is left untouched here so
# the message stays identical to the output already recorded for this cell.
for col in string_col:
    print(f"The distribution of categorical valeus in the {col} is : ")
    print(df[col].value_counts())

The distribution of categorical valeus in the Sex is : 
Sex
M    725
F    193
Name: count, dtype: Int64
The distribution of categorical valeus in the ChestPainType is : 
ChestPainType
ASY    496
NAP    203
ATA    173
TA      46
Name: count, dtype: Int64
The distribution of categorical valeus in the RestingECG is : 
RestingECG
Normal    552
LVH       188
ST        178
Name: count, dtype: Int64
The distribution of categorical valeus in the ExerciseAngina is : 
ExerciseAngina
N    547
Y    371
Name: count, dtype: Int64
The distribution of categorical valeus in the ST_Slope is : 
ST_Slope
Flat    460
Up      395
Down     63
Name: count, dtype: Int64


In [163]:
# Integer-encode every categorical column, keeping one fitted LabelEncoder per column
# in `label_encoders` so the class -> code mapping can be inspected or reused later.
# Author's note: label encoding is considered fine for random forest.
# NOTE(review): these integer codes are also part of the feature matrix passed to
# KMeans further down, where they imply a distance between categories — confirm
# that is intended.
label_encoders = {col: LabelEncoder() for col in string_col}

for col, le in label_encoders.items():
    # fit_transform learns the unique classes of the column and replaces each value
    # by the integer code assigned to its class.
    df[col] = le.fit_transform(df[col])

In [164]:
# Show, for each encoded column, which integer code every original class received.
for col in string_col:
    le = label_encoders[col]
    # classes_ holds unique values, so a dict keeps every (class, code) pair in order.
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(f"Label encoding for {col}:")
    for class_, label in mapping.items():
        print(f"{class_} -> {label}")
    print()

Label encoding for Sex:
F -> 0
M -> 1

Label encoding for ChestPainType:
ASY -> 0
ATA -> 1
NAP -> 2
TA -> 3

Label encoding for RestingECG:
LVH -> 0
Normal -> 1
ST -> 2

Label encoding for ExerciseAngina:
N -> 0
Y -> 1

Label encoding for ST_Slope:
Down -> 0
Flat -> 1
Up -> 2



In [165]:
# Rows whose Cholesterol reading is exactly 0: print the per-column non-null counts,
# then preview the first few of those rows.
zero_cholesterol_rows = df.loc[df['Cholesterol'].eq(0)]
print(zero_cholesterol_rows.count())
zero_cholesterol_rows.head()

Age               172
Sex               172
ChestPainType     172
RestingBP         172
Cholesterol       172
FastingBS         172
RestingECG        172
MaxHR             172
ExerciseAngina    172
Oldpeak           172
ST_Slope          172
HeartDisease      172
dtype: int64


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
293,65,1,0,115,0,0,1,93,1,0.0,1,1
294,32,1,3,95,0,1,1,127,0,0.7,2,1
295,61,1,0,105,0,1,1,110,1,1.5,2,1
296,50,1,0,145,0,1,1,139,1,0.7,1,1
297,57,1,0,110,0,1,2,131,1,1.4,2,1


In [166]:
# Column whose 0 entries get imputed, and the outcome column kept out of the clustering features.
target_column, label_column = 'Cholesterol', 'HeartDisease'

In [167]:
# Boolean mask flagging the rows where Cholesterol is recorded as 0.
mask_missing = df['Cholesterol'].eq(0)

In [168]:
# Per-column non-null counts over the rows whose Cholesterol is 0.
df.loc[df['Cholesterol'].eq(0)].count()

Age               172
Sex               172
ChestPainType     172
RestingBP         172
Cholesterol       172
FastingBS         172
RestingECG        172
MaxHR             172
ExerciseAngina    172
Oldpeak           172
ST_Slope          172
HeartDisease      172
dtype: int64

In [169]:
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd

# Impute placeholder values (0) in `target_column` with the mean of the KMeans cluster
# each affected row falls into.
# Relies on df, target_column and label_column defined in earlier cells.

# Rows to impute: the target is the 0 placeholder. Rows already holding NaN are
# included as well so the cell can be re-run after a partially completed run.
mask_missing = df[target_column].eq(0) | df[target_column].isna()

# Cast to float first, then blank out the rows to impute. Replacing the whole column
# (instead of writing NaN into the int64 column through .loc) avoids pandas'
# "Setting an item of incompatible dtype" deprecation.
df[target_column] = df[target_column].astype(float).mask(mask_missing)

# Clustering features: every column except the target and the label,
# mean-imputed so KMeans receives no NaN.
features = df.drop(columns=[target_column, label_column])
imputer = SimpleImputer(strategy='mean')
features_imputed = pd.DataFrame(
    imputer.fit_transform(features),
    columns=features.columns,
    index=features.index,  # keep df's index so the assignment and masks below align by label
)

# Add the target as a clustering feature; rows to impute get the mean of the
# observed values as a stand-in so they can still be assigned to a cluster.
features_imputed[target_column] = df[target_column].fillna(df[target_column].mean())

# Fit KMeans only on rows with an observed target value
kmeans = KMeans(n_clusters=12, random_state=0)
kmeans.fit(features_imputed[~mask_missing])

# Cluster assignment for all rows, held in its own Series instead of a temporary
# df column, so df is never left with a stray 'Cluster' column if a later step fails.
clusters = pd.Series(kmeans.predict(features_imputed), index=df.index, name='Cluster')
print(clusters.unique())

# Impute the target with the mean of the observed values in the same cluster
for cluster in range(kmeans.n_clusters):
    in_cluster = clusters == cluster
    # .mean() skips NaN, so only observed target values of this cluster contribute
    cluster_mean = df.loc[in_cluster, target_column].mean()
    df.loc[in_cluster & mask_missing, target_column] = cluster_mean
    # Diagnostics: cluster mean, total rows to impute, rows to impute in this cluster
    print(cluster_mean)
    print(len(df.loc[mask_missing]))
    print(len(df.loc[in_cluster & mask_missing]))

# Check for any NaN values in the label_column
print(df[df[label_column].isna()])

[10  4  5  3 11  6  8  9  0  1  2  7]
225.69230769230768
172
15
265.0098039215686
172
17
401.05263157894734
172
0
227.7912087912088
172
112
184.671875
172
0
284.0133333333333
172
0
321.4
172
0
541.0
172
0
232.23076923076923
172
28
151.875
172
0
290.86842105263156
172
0
200.78125
172
0
Empty DataFrame
Columns: [Age, Sex, ChestPainType, RestingBP, Cholesterol, FastingBS, RestingECG, MaxHR, ExerciseAngina, Oldpeak, ST_Slope, HeartDisease]
Index: []


In [170]:
# Display any rows whose target column is still NaN.
df.loc[df[target_column].isna()]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


In [171]:
# Display any rows whose target column still equals 0.
df.loc[df[target_column].eq(0)]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease


In [172]:
# Write the current DataFrame to CSV, leaving out the index column.
df.to_csv(path_or_buf='Data/heart_kmeans.csv', index=False)