In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

In [None]:
# Read the heart-disease CSV from the mounted Drive folder and preview the first rows.
dataset_path = "/content/drive/MyDrive/Bootcamp/StartupCampus/dataset/heart.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


Berikut adalah detail kolom pada dataset penyakit jantung:

*  age - usia dalam tahun
*  sex - (1 = pria; 0 = wanita)
*  cp - jenis nyeri dada
    - 0: Angina khas: nyeri dada terkait penurunan pasokan darah ke jantung
    - 1: Angina atipikal: nyeri dada tidak terkait dengan jantung
    - 2: Nyeri non-angina: biasanya spasme esofagus (tidak terkait jantung)
    - 3: Asimptomatik: nyeri dada tidak menunjukkan tanda-tanda penyakit
*  trestbps - tekanan darah istirahat (dalam mm Hg saat masuk rumah sakit): apa pun di atas 130-140 biasanya menjadi penyebab kekhawatiran
*  chol - kolesterol serum dalam mg/dl
    - serum = LDL + HDL + .2 * trigliserida
    - di atas 200 adalah penyebab kekhawatiran
*  fbs - (gula darah puasa > 120 mg/dl) (1 = benar; 0 = salah)
    - ">126" mg/dL menandakan diabetes
*  restecg - hasil elektrokardiografi istirahat
    - 0: Normal (tidak ada kelainan yang perlu dicatat)
    - 1: Abnormalitas gelombang ST-T dapat berkisar dari gejala ringan hingga masalah parah menandakan detak jantung tidak normal
    - 2: Kemungkinan atau pasti hipertrofi ventrikel kiri
*  thalach - frekuensi jantung maksimum yang dicapai
*  exang - angina yang diinduksi olahraga (1 = ya; 0 = tidak)
*  oldpeak - depresi ST yang diinduksi oleh olahraga relatif terhadap kondisi istirahat; menggambarkan stres jantung selama olahraga (jantung yang tidak sehat akan mengalami stres lebih besar)
*  slope - kemiringan segmen ST puncak latihan
    - 0: Meningkat: frekuensi jantung lebih baik dengan olahraga (tidak umum)
    - 1: Datar: perubahan minimal (jantung sehat khas)
    - 2: Menurun: tanda-tanda jantung tidak sehat
*  ca - jumlah pembuluh utama yang diwarnai oleh fluoroskopi
    - pembuluh berwarna berarti dokter dapat melihat darah melewatinya
    - semakin banyak pergerakan darah semakin baik (tidak ada gumpalan)
*  thal - hasil stres talium
*  target - memiliki penyakit atau tidak (1=ya, 0=tidak) (=atribut yang diprediksi)

# **DATA PREPARATION**

In [None]:
# Print the index range, column dtypes, non-null counts and memory usage of the loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the loaded data.
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


## **Mencari Missing Value**

In [None]:
# Count missing values per column.
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Tidak ada missing value

## **Mencari Duplicated Value**

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated().sum()

723

terdapat banyak duplikasi data

In [None]:
# Show every column and keep wide frames on a single line when printing.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# keep=False marks every member of a duplicate set, not only the repeats,
# so each printed group contains all identical rows together.
duplicate_rows_data = df[df.duplicated(keep=False)]
grouped_duplicates = duplicate_rows_data.groupby(list(df.columns))
for group_key, identical_rows in grouped_duplicates:
  print(identical_rows, "\n\n")

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  ca  thal  target
60    29    1   1       130   204    0        0      202      0      0.0      2   0     2       1
64    29    1   1       130   204    0        0      202      0      0.0      2   0     2       1
118   29    1   1       130   204    0        0      202      0      0.0      2   0     2       1
668   29    1   1       130   204    0        0      202      0      0.0      2   0     2       1 


     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  ca  thal  target
12    34    0   1       118   210    0        1      192      0      0.7      2   0     2       1
15    34    0   1       118   210    0        1      192      0      0.7      2   0     2       1
779   34    0   1       118   210    0        1      192      0      0.7      2   0     2       1 


     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  ca  thal  target
143   34    1 

Terdapat data duplikat; penanganannya adalah dengan menghapus duplikasi tersebut

In [None]:
# Shape before dropping duplicate rows
df.shape

(1025, 14)

In [None]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# The result is reassigned instead of mutated with inplace=True, and the index
# is rebuilt so row labels run contiguously from 0 to len(df) - 1 rather than
# keeping gaps where duplicates were removed.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Shape after dropping duplicate rows
df.shape

(302, 14)

## **Memeriksa Outlier**

In [None]:
# Number of distinct values in each column.
nunique_values = df.nunique()
nunique_values

age          41
sex           2
cp            4
trestbps     49
chol        152
fbs           2
restecg       3
thalach      91
exang         2
oldpeak      40
slope         3
ca            5
thal          4
target        2
dtype: int64

In [None]:
# Features checked for outliers; each gets its own box plot in a single-row grid.
columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

fig = make_subplots(rows=1, cols=len(columns))
for position, feature in enumerate(columns, start=1):
  box_trace = go.Box(y=df[feature], name=feature)
  fig.add_trace(box_trace, row=1, col=position)

fig.show()

In [None]:
# Same features as violin plots, with the inner box and the mean line drawn.
fig = make_subplots(rows=1, cols=len(columns))
for position, feature in enumerate(columns, start=1):
  violin_trace = go.Violin(
      y=df[feature],
      name=feature,
      box_visible=True,
      meanline_visible=True,
  )
  fig.add_trace(violin_trace, row=1, col=position)

fig.show()

Terdapat Outlier

### Menangani Outlier

In [None]:
# Shape before handling outliers
df.shape

(302, 14)

**Menangani outlier dengan mengganti nilai-nilai outlier dengan batas yang diperbolehkan**

In [None]:
# Cap each feature at the 1.5 * IQR fences: values below the lower fence are
# raised to it and values above the upper fence are lowered to it, so no rows
# are removed. The column is written back as float because the fences are floats.
for column in columns:
  Q1, Q3 = df[column].quantile([0.25, 0.75])
  IQR = Q3 - Q1
  lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

  df[column] = np.clip(df[column].to_numpy(dtype=float), lower_bound, upper_bound)

In [None]:
# Shape after handling outliers
df.shape

(302, 14)

Tidak ada data yang dihapus

In [None]:
# Redraw the box plots after capping to compare against the earlier ones.
fig = make_subplots(rows=1, cols=len(columns))
for position, feature in enumerate(columns, start=1):
  box_trace = go.Box(y=df[feature], name=feature)
  fig.add_trace(box_trace, row=1, col=position)

fig.show()

## **Memeriksa Imbalance Data**

In [None]:
# Number of rows per target class.
df['target'].value_counts()

1    164
0    138
Name: target, dtype: int64

In [None]:
# Rows per target class and each class's share of the total, in percent.
class_counts = df['target'].value_counts()
class_percentage = class_counts / class_counts.sum() * 100

# Bar chart of the class counts, each bar labelled with its percentage.
percent_labels = [f'{x:.2f}%' for x in class_percentage]
balance_bars = go.Bar(
    x=class_counts.index,
    y=class_counts.values,
    text=percent_labels,
    textposition='auto',
)
fig = go.Figure(data=[balance_bars])
fig.update_layout(
    title_text='Imbalance Data',
    xaxis_title='Class',
    yaxis_title='Count',
    width=600,
    height=400,
)

fig.show()

**Terdapat Imbalance data yang tidak terlalu signifikan**

Di sini dilakukan penanganan imbalance data menggunakan **oversampling -> SMOTE** agar data seimbang, sebagai **pembelajaran dalam handling imbalance data**; hasilnya disimpan di variabel baru

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with SMOTE and keep the result in a separate
# frame (df itself is left untouched).
#
# The label is split off before resampling: passing the whole frame as the
# feature matrix would let `target` take part in the nearest-neighbour search
# that SMOTE uses to synthesize rows.
# NOTE(review): SMOTE interpolates every feature, so integer-coded categorical
# columns (cp, thal, ...) can receive fractional values in synthetic rows;
# imblearn's SMOTENC may be a better fit — confirm before modelling on df_smote.
smote_features = df.drop(columns='target')
smote_labels = df['target']

# Fixed seed so the synthetic rows are the same on every run.
smote = SMOTE(random_state=42)
features_resampled, labels_resampled = smote.fit_resample(smote_features, smote_labels)

# Re-attach the label as the last column so df_smote has the same layout as df.
df_smote = features_resampled.assign(target=np.asarray(labels_resampled))

In [None]:
# Number of rows per target class in the resampled frame.
df_smote['target'].value_counts()

0    164
1    164
Name: target, dtype: int64

# **Exploratory Data Analysis (EDA)**

In [None]:
# Print the index range, column dtypes and non-null counts of df at the start of the EDA.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 302 entries, 0 to 878
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       302 non-null    float64
 1   sex       302 non-null    int64  
 2   cp        302 non-null    int64  
 3   trestbps  302 non-null    float64
 4   chol      302 non-null    float64
 5   fbs       302 non-null    int64  
 6   restecg   302 non-null    int64  
 7   thalach   302 non-null    float64
 8   exang     302 non-null    int64  
 9   oldpeak   302 non-null    float64
 10  slope     302 non-null    int64  
 11  ca        302 non-null    int64  
 12  thal      302 non-null    int64  
 13  target    302 non-null    int64  
dtypes: float64(5), int64(9)
memory usage: 35.4 KB


* age - Numerical (Interval)
* sex - Nominal
* cp - Ordinal
* trestbps - Numerical (Interval)
* chol - Numerical (Interval)
* fbs - Nominal
* restecg - Ordinal
* thalach - Numerical (Interval)
* exang - Nominal
* oldpeak - Numerical (Interval)
* slope - Ordinal
* ca - Ordinal
* thal - Ordinal
* target - Nominal

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of df at the start of the EDA.
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0,302.0
mean,54.42053,0.682119,0.963576,131.258278,245.37707,0.149007,0.52649,149.612997,0.327815,1.027815,1.397351,0.718543,2.31457,0.543046
std,9.04797,0.466426,1.032044,16.605232,47.486683,0.356686,0.526027,22.765983,0.470196,1.110395,0.616274,1.006748,0.613026,0.49897
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,84.125,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,133.25,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.5,1.0,1.0,130.0,240.5,0.0,1.0,152.5,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.75,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,170.0,370.375,1.0,2.0,202.0,1.0,4.0,2.0,4.0,3.0,1.0


* **Mean (Rata-rata)**: Ini adalah jumlah semua nilai dalam kumpulan data dibagi oleh jumlah elemen dalam kumpulan data. Mean sangat dipengaruhi oleh outlier (nilai yang jauh lebih besar atau lebih kecil dari nilai lainnya), sehingga mungkin tidak selalu mewakili “pusat” data dengan baik jika outlier ada.
* **Median**: Ini adalah nilai tengah dalam kumpulan data yang telah diurutkan. Jika jumlah elemen dalam kumpulan data adalah genap, median adalah rata-rata dari dua nilai tengah. Median lebih tahan terhadap outlier dibandingkan mean.
* **Modus**: Ini adalah nilai yang paling sering muncul dalam kumpulan data. Kumpulan data dapat memiliki lebih dari satu modus jika ada beberapa nilai yang muncul dengan frekuensi yang sama dan paling sering. Modus bisa digunakan untuk data numerik dan kategorikal.


* **Minimum**: Nilai terkecil dalam dataset.
* **Kuartil Pertama (Q1)**: Nilai di mana 25% data berada di bawahnya. Ini juga dikenal sebagai kuartil bawah.
* **Median (Q2)**: Nilai tengah dataset. Jika jumlah data genap, median adalah rata-rata dari dua nilai tengah.
* **Kuartil Ketiga (Q3)**: Nilai di mana 75% data berada di bawahnya. Ini juga dikenal sebagai kuartil atas.
* **Maksimum**: Nilai terbesar dalam dataset

In [None]:
# For every column, print how many distinct values it has followed by the values themselves.
for column, series in df.items():
  print(f"{column}: {series.nunique()} -> {series.unique()}\n")

age: 41 -> [52. 53. 70. 61. 62. 58. 55. 46. 54. 71. 43. 34. 51. 50. 60. 67. 45. 63.
 42. 44. 56. 57. 59. 64. 65. 41. 66. 38. 49. 48. 29. 37. 47. 68. 76. 40.
 39. 77. 69. 35. 74.]

sex: 2 -> [1 0]

cp: 4 -> [0 1 2 3]

trestbps: 43 -> [125. 140. 145. 148. 138. 100. 114. 160. 120. 122. 112. 132. 118. 128.
 124. 106. 104. 135. 130. 136. 170. 129. 150. 146. 117. 152. 154. 134.
 144. 108. 123. 110. 142. 126. 115.  94. 165. 102. 105. 155. 164. 156.
 101.]

chol: 148 -> [212.    203.    174.    294.    248.    318.    289.    249.    286.
 149.    341.    210.    298.    204.    308.    266.    244.    211.
 185.    223.    208.    252.    209.    307.    233.    319.    256.
 327.    169.    131.    269.    196.    231.    213.    271.    263.
 229.    360.    258.    330.    342.    226.    228.    278.    230.
 283.    241.    175.    188.    217.    193.    245.    232.    299.
 288.    197.    315.    215.    164.    326.    207.    177.    257.
 255.    187.    201.    220.    268.    26

In [None]:
# Histogram (20 bins) of each listed feature, laid out on a grid five plots wide.
columns = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
plots_per_row = 5
n_rows = math.ceil(len(columns) / plots_per_row)

fig = make_subplots(rows=n_rows, cols=plots_per_row, subplot_titles=columns)
for i, column in enumerate(columns):
  # divmod maps the running index to a zero-based (row, col); plotly grids are one-based.
  grid_row, grid_col = divmod(i, plots_per_row)
  histogram = go.Histogram(x=df[column], nbinsx=20, name=column)
  fig.add_trace(histogram, row=grid_row + 1, col=grid_col + 1)
fig.show()

Pada oldpeak terlihat right skewed distribution

In [None]:
# Pairwise correlation of all columns, drawn as a heatmap with each cell
# annotated by its coefficient rounded to two decimals.
corr_matrix = df.corr()
heatmap_options = dict(
  z=corr_matrix.to_numpy(),
  x=corr_matrix.columns.tolist(),
  y=corr_matrix.index.tolist(),
  annotation_text=corr_matrix.round(2).to_numpy(),
  showscale=True,
)
fig = ff.create_annotated_heatmap(**heatmap_options)

fig.show()

In [None]:
# Counts of target (rows) by sex (columns). The crosstab rows are ordered by
# target value, so they line up with the 'No Disease' / 'Disease' labels.
cross_tab = pd.crosstab(df['target'], df['sex'])
disease_labels = ['No Disease', 'Disease']

fig = go.Figure()
# Column 0 of the crosstab is plotted as Female and column 1 as Male.
fig.add_trace(go.Bar(name='Female', x=disease_labels, y=cross_tab[0]))
fig.add_trace(go.Bar(name='Male', x=disease_labels, y=cross_tab[1]))
fig.update_layout(
  barmode='group',
  title='Heart Disease Frequency for Sex',
  width=800,
  height=600,
)

fig.show()


In [None]:
# Scatter of age against maximum heart rate, coloured by disease status.
has_disease = df.target == 1
no_disease = df.target == 0

fig = go.Figure()
fig.add_trace(go.Scatter(
  x=df.age[has_disease],
  y=df.thalach[has_disease],
  mode='markers',
  marker=dict(color="red"),
  name="Disease",
))
fig.add_trace(go.Scatter(
  x=df.age[no_disease],
  y=df.thalach[no_disease],
  mode='markers',
  marker=dict(color="blue"),
  name="No Disease",
))
fig.update_layout(
  title="Heart Disease in function of Age and Max Heart Rate",
  width=900,
  height=600,
  xaxis=dict(title="Age"),
  yaxis=dict(title="Max Heart Rate"),
)

fig.show()

In [None]:
# Counts of chest pain type (rows) by target (columns), shown as grouped bars.
crosstab_data = pd.crosstab(df.cp, df.target)
chest_pain_types = crosstab_data.index

no_disease_bars = go.Bar(
  x=chest_pain_types,
  y=crosstab_data[0],
  name='No Disease',
  marker_color='lightblue',
)
disease_bars = go.Bar(
  x=chest_pain_types,
  y=crosstab_data[1],
  name='Disease',
  marker_color='salmon',
)

fig = go.Figure()
fig.add_trace(no_disease_bars)
fig.add_trace(disease_bars)
fig.update_layout(
  title="Heart Disease Frequency Per Chest Pain Type",
  xaxis=dict(title="Chest Pain Type"),
  yaxis=dict(title="Amount"),
  legend=dict(title="Legend"),
  barmode='group',
  width=800,
  height=500,
)

fig.show()


# **Feature Engineering**

### **1. Membuat kelompok kategori usia**
* 1 untuk usia < 12 (Kids)
* 2 untuk usia 12-19 (Teenagers)
* 3 untuk usia 20-29 (Young Adults)
* 4 untuk usia 30-49 (Adults)
* 5 untuk usia 50+ (Elderly)


In [None]:
# Bin age into five ordered categories:
#   1: < 12, 2: 12-19, 3: 20-29, 4: 30-49, 5: 50+
# right=False makes every bin left-closed, [low, high), so the boundary ages
# 12, 20, 30 and 50 fall into the higher category as the ranges above state.
# pd.cut's default right-closed bins would put e.g. age 50 into category 4.
age_bin_edges = [0, 12, 20, 30, 50, np.inf]
df['age_category'] = pd.cut(df['age'], bins=age_bin_edges, labels=[1, 2, 3, 4, 5], right=False)
print(f"age_category: {df['age_category'].nunique()} -> {df['age_category'].unique()}")

age_category: 3 -> [5, 4, 3]
Categories (5, int64): [1 < 2 < 3 < 4 < 5]


In [None]:
# Preview the first rows of df after adding age_category.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5


### **2. Indikator jika tekanan darah istirahat di atas 140**

* 0 = False
* 1 = True

In [None]:
# 1 when resting blood pressure is strictly above 140, otherwise 0.
is_high_bp = df['trestbps'].gt(140)
df['high_resting_bp'] = is_high_bp.astype(int)
print(f"high_resting_bp: {df['high_resting_bp'].nunique()} -> {df['high_resting_bp'].unique()}")

high_resting_bp: 2 -> [0 1]


In [None]:
# Preview the first five rows of df after adding high_resting_bp.
df.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category,high_resting_bp
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5,0
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5,0
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5,1
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5,1
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5,0


### **3. Indikator jika kolesterol serum di atas 200**

* 0 = False
* 1 = True

In [None]:
# 1 when serum cholesterol is strictly above 200, otherwise 0.
is_high_chol = df['chol'].gt(200)
df['high_chol'] = is_high_chol.astype(int)
print(f"high_chol: {df['high_chol'].nunique()} -> {df['high_chol'].unique()}")

high_chol: 2 -> [1 0]


In [None]:
# Preview the first rows of df after adding high_chol.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category,high_resting_bp,high_chol
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5,0,1
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5,0,1
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5,1,0
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5,1,1
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5,0,1


### **4. Indikator jika frekuensi jantung maksimum dicapai di bawah 150**
* 0 = False
* 1 = True

In [None]:
# 1 when the maximum heart rate achieved is strictly below 150, otherwise 0.
# The comparison must be `< 150` to match the column name and the section
# heading; `> 200` flagged the opposite end of the range (very high rates).
df['low_max_hr'] = (df['thalach'] < 150).astype(int)
print(f"low_max_hr: {df['low_max_hr'].nunique()} -> {df['low_max_hr'].unique()}")

low_max_hr: 2 -> [0 1]


In [None]:
# Preview the first rows of df after adding low_max_hr.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category,high_resting_bp,high_chol,low_max_hr
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5,0,1,0
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5,0,1,0
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5,1,0,0
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5,1,1,0
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5,0,1,0


### **5. Rasio Kolesterol dan Tekanan Darah**
kolesterol dibagi tekanan darah

In [None]:
# Serum cholesterol divided by resting blood pressure, row by row.
df['chol_bp_ratio'] = df['chol'].div(df['trestbps'])
print(f"chol_bp_ratio: {df['chol_bp_ratio'].nunique()}")

chol_bp_ratio: 277


In [None]:
# Preview the first rows of df after adding chol_bp_ratio.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category,high_resting_bp,high_chol,low_max_hr,chol_bp_ratio
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5,0,1,0,1.696
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5,0,1,0,1.45
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5,1,0,0,1.2
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5,1,1,0,1.371622
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5,0,1,0,2.130435


### **6. Rasio Frekuensi Jantung Maksimum dan Usia**
Rasio ini bisa memberikan wawasan tentang seberapa baik jantung seseorang bekerja relatif terhadap usianya.

In [None]:
# Maximum heart rate achieved divided by age, row by row.
df['maxhr_age_ratio'] = df['thalach'].div(df['age'])
print(f"maxhr_age_ratio: {df['maxhr_age_ratio'].nunique()}")

maxhr_age_ratio: 285


In [None]:
# Preview the first rows of df after adding maxhr_age_ratio.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category,high_resting_bp,high_chol,low_max_hr,chol_bp_ratio,maxhr_age_ratio
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5,0,1,0,1.696,3.230769
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5,0,1,0,1.45,2.924528
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5,1,0,0,1.2,1.785714
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5,1,1,0,1.371622,2.639344
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5,0,1,0,2.130435,1.709677


### **7. Tekanan Darah dikurangi Usia**
Mengurangi usia dari tekanan darah bisa menjadi indikator yang baik tentang kesehatan jantung seseorang.

In [None]:
# Resting blood pressure minus age, row by row.
df['bp_minus_age'] = df['trestbps'].sub(df['age'])
print(f"bp_minus_age: {df['bp_minus_age'].nunique()}")

bp_minus_age: 75


In [None]:
# Preview the first rows of df after adding bp_minus_age.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category,high_resting_bp,high_chol,low_max_hr,chol_bp_ratio,maxhr_age_ratio,bp_minus_age
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5,0,1,0,1.696,3.230769,73.0
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5,0,1,0,1.45,2.924528,87.0
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5,1,0,0,1.2,1.785714,75.0
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5,1,1,0,1.371622,2.639344,87.0
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5,0,1,0,2.130435,1.709677,76.0


### **8. Rasio antara Frekuensi Jantung Maksimum dan Tekanan Darah Istirahat**
Rasio ini bisa memberikan wawasan tentang seberapa baik jantung seseorang bekerja relatif terhadap tekanan darahnya.

In [None]:
# Maximum heart rate achieved divided by resting blood pressure, row by row.
df['thalach_trestbps_ratio'] = df['thalach'].div(df['trestbps'])
print(f"thalach_trestbps_ratio: {df['thalach_trestbps_ratio'].nunique()}")

thalach_trestbps_ratio: 260


In [None]:
# Preview the first rows of df after adding thalach_trestbps_ratio.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category,high_resting_bp,high_chol,low_max_hr,chol_bp_ratio,maxhr_age_ratio,bp_minus_age,thalach_trestbps_ratio
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5,0,1,0,1.696,3.230769,73.0,1.344
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5,0,1,0,1.45,2.924528,87.0,1.107143
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5,1,0,0,1.2,1.785714,75.0,0.862069
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5,1,1,0,1.371622,2.639344,87.0,1.087838
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5,0,1,0,2.130435,1.709677,76.0,0.768116


### **9. Rasio antara Jumlah Pembuluh Utama yang Diwarnai oleh Fluoroskopi dan Usia**

Rasio ini bisa memberikan wawasan tentang seberapa baik sirkulasi darah seseorang relatif terhadap usianya.

In [None]:
# Number of coloured major vessels divided by age, row by row.
df['ca_age_ratio'] = df['ca'].div(df['age'])
print(f"ca_age_ratio: {df['ca_age_ratio'].nunique()}")

ca_age_ratio: 64


In [None]:
# Preview the first rows of df after adding ca_age_ratio.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category,high_resting_bp,high_chol,low_max_hr,chol_bp_ratio,maxhr_age_ratio,bp_minus_age,thalach_trestbps_ratio,ca_age_ratio
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5,0,1,0,1.696,3.230769,73.0,1.344,0.038462
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5,0,1,0,1.45,2.924528,87.0,1.107143,0.0
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5,1,0,0,1.2,1.785714,75.0,0.862069,0.0
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5,1,1,0,1.371622,2.639344,87.0,1.087838,0.016393
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5,0,1,0,2.130435,1.709677,76.0,0.768116,0.048387


### **10. rata-rata dari age, trestbps dan chol**

In [None]:
# Row-wise mean of age, resting blood pressure and cholesterol.
averaged_columns = ['age', 'trestbps', 'chol']
df['age_trestbps_chol_mean'] = df.loc[:, averaged_columns].mean(axis='columns')
print(f"age_trestbps_chol_mean: {df['age_trestbps_chol_mean'].nunique()}")

age_trestbps_chol_mean: 160


In [None]:
# Preview the first rows of df after adding age_trestbps_chol_mean.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category,high_resting_bp,high_chol,low_max_hr,chol_bp_ratio,maxhr_age_ratio,bp_minus_age,thalach_trestbps_ratio,ca_age_ratio,age_trestbps_chol_mean
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5,0,1,0,1.696,3.230769,73.0,1.344,0.038462,129.666667
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5,0,1,0,1.45,2.924528,87.0,1.107143,0.0,132.0
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5,1,0,0,1.2,1.785714,75.0,0.862069,0.0,129.666667
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5,1,1,0,1.371622,2.639344,87.0,1.087838,0.016393,137.333333
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5,0,1,0,2.130435,1.709677,76.0,0.768116,0.048387,164.666667


### **Membuat kolom baru dengan data yang di standarisasi menggunakan standard scaler**
* age -> age_scaled
* trestbps -> trestbps_scaled
* chol -> chol_scaled
* thalach -> thalach_scaled
* oldpeak -> oldpeak_scaled

In [None]:
from sklearn.preprocessing import StandardScaler

# Add a standardized copy of each listed column as '<name>_scaled'.
# The scaler is fit once on all five columns (StandardScaler standardizes each
# column independently, so the values equal a per-column fit). This leaves
# `scaler` holding the statistics of every column, so it can be reused to
# transform new data or to invert the scaling, rather than remembering only
# the last column it was fit on.
columns_to_scale = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[columns_to_scale])

# Assign one 1-D slice per new column (column order matches columns_to_scale).
for position, col in enumerate(columns_to_scale):
  df[col + '_scaled'] = scaled_values[:, position]

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,age_category,high_resting_bp,high_chol,low_max_hr,chol_bp_ratio,maxhr_age_ratio,bp_minus_age,thalach_trestbps_ratio,ca_age_ratio,age_trestbps_chol_mean,age_scaled,trestbps_scaled,chol_scaled,thalach_scaled,oldpeak_scaled
0,52.0,1,0,125.0,212.0,0,1,168.0,0,1.0,2,2,3,0,5,0,1,0,1.696,3.230769,73.0,1.344,0.038462,129.666667,-0.267966,-0.377511,-0.704039,0.808993,-0.025091
1,53.0,1,0,140.0,203.0,1,0,155.0,1,3.1,0,0,3,0,5,0,1,0,1.45,2.924528,87.0,1.107143,0.0,132.0,-0.15726,0.527318,-0.89388,0.237018,1.869266
2,70.0,1,0,145.0,174.0,0,1,125.0,1,2.6,0,0,3,0,5,1,0,0,1.2,1.785714,75.0,0.862069,0.0,129.666667,1.724733,0.828927,-1.505591,-1.082925,1.418229
3,61.0,1,0,148.0,203.0,0,1,161.0,0,0.0,2,1,3,0,5,1,1,0,1.371622,2.639344,87.0,1.087838,0.016393,137.333333,0.728383,1.009893,-0.89388,0.501006,-0.927166
4,62.0,0,0,138.0,294.0,1,1,106.0,0,1.9,1,3,2,0,5,0,1,0,2.130435,1.709677,76.0,0.768116,0.048387,164.666667,0.839089,0.406674,1.025627,-1.918889,0.786777
