In [6]:
import pandas as pd
from data import data_processing
from utils import Distances, HyperparameterTuner, NormalizationScaler, MinMaxScaler

In [7]:
# Read the heart-disease CSV; literal '?' cells are parsed as missing values.
data = pd.read_csv(
    'heart_disease.csv',
    sep=',',
    na_values='?',
    low_memory=False,
)
# Preview the first rows as the cell's rich output.
data.head()


Unnamed: 0,age,sex(1-male 0-female),cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num(pred value)
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [8]:
# Print a structural summary of the frame: index range, per-column
# non-null counts and dtypes, and memory usage.
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   303 non-null    float64
 1   sex(1-male 0-female)  303 non-null    float64
 2   cp                    303 non-null    float64
 3   trestbps              303 non-null    float64
 4   chol                  303 non-null    float64
 5   fbs                   303 non-null    float64
 6   restecg               303 non-null    float64
 7   thalach               303 non-null    float64
 8   exang                 303 non-null    float64
 9   oldpeak               303 non-null    float64
 10  slope                 303 non-null    float64
 11  ca                    303 non-null    float64
 12  thal                  303 non-null    float64
 13  num(pred value)       303 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 33.3 KB


In [9]:
# Show summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

Unnamed: 0,age,sex(1-male 0-female),cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num(pred value)
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.686469,4.726073,0.458746
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.951149,1.93629,0.49912
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,3.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,3.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,3.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,7.0,1.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,7.0,1.0


In [10]:
def _print_tuning_summary(header, tuner, include_scaler=False):
    """Print the best hyperparameters stored on ``tuner`` under ``header``.

    Reads ``best_k``, ``best_distance_function`` and ``best_f1`` from the
    tuner; ``best_scaler`` is printed only when ``include_scaler`` is true.
    """
    print(header)
    print("k =", tuner.best_k)
    print("distance function =", tuner.best_distance_function)
    if include_scaler:
        print("scaler =", tuner.best_scaler)
    print("f1 score = %0.3f" % tuner.best_f1)


# Candidate distance functions handed to the tuner, keyed by display name.
distance_funcs = {
    'euclidean': Distances.euclidean_distance,
    'minkowski': Distances.minkowski_distance,
    'cosine_dist': Distances.cosine_similarity_distance,
}

# Candidate scaler classes handed to the tuner, keyed by display name.
scaling_classes = {
    'min_max_scale': MinMaxScaler,
    'normalize': NormalizationScaler,
}

# Train / validation / test splits; the test split is not used in this cell.
x_train, y_train, x_val, y_val, x_test, y_test = data_processing()

print('x_train shape = ', x_train.shape)
print('y_train shape = ', y_train.shape)

# First pass: tune on the raw features using the train and validation splits.
tuner_without_scaling_obj = HyperparameterTuner()
tuner_without_scaling_obj.tuning_without_scaling(
    distance_funcs, x_train, y_train, x_val, y_val
)
_print_tuning_summary("**Without Scaling**", tuner_without_scaling_obj)

# Second pass: same splits, but the tuner also receives the scaler candidates.
tuner_with_scaling_obj = HyperparameterTuner()
tuner_with_scaling_obj.tuning_with_scaling(
    distance_funcs, scaling_classes, x_train, y_train, x_val, y_val
)
_print_tuning_summary(
    "\n**With Scaling**", tuner_with_scaling_obj, include_scaler=True
)

x_train shape =  (242, 14)
y_train shape =  (242,)
**Without Scaling**
k = 15
distance function = cosine_dist
f1 score = 0.638

**With Scaling**
k = 21
distance function = euclidean
scaler = min_max_scale
f1 score = 0.667
