## Import Data

In [1]:
import numpy as np
import pandas as pd
data = pd.read_csv("../data/heart.csv")
data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


## Explore Data

In [2]:
data.shape

(918, 12)

In [3]:
data.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [4]:
cont_feats =list(data.describe().columns)
cont_feats

['Age',
 'RestingBP',
 'Cholesterol',
 'FastingBS',
 'MaxHR',
 'Oldpeak',
 'HeartDisease']

In [5]:
for col in data.columns:
    print(col+": "+str(type(data.iloc[0][col])))

Age: <class 'numpy.int64'>
Sex: <class 'str'>
ChestPainType: <class 'str'>
RestingBP: <class 'numpy.int64'>
Cholesterol: <class 'numpy.int64'>
FastingBS: <class 'numpy.int64'>
RestingECG: <class 'str'>
MaxHR: <class 'numpy.int64'>
ExerciseAngina: <class 'str'>
Oldpeak: <class 'numpy.float64'>
ST_Slope: <class 'str'>
HeartDisease: <class 'numpy.int64'>


In [7]:
data.isna().any()

Age               False
Sex               False
ChestPainType     False
RestingBP         False
Cholesterol       False
FastingBS         False
RestingECG        False
MaxHR             False
ExerciseAngina    False
Oldpeak           False
ST_Slope          False
HeartDisease      False
dtype: bool

In [8]:
#importing required packages
import pandas as pd
import pandas_profiling
import numpy as np

#importing the data
df = pd.read_csv('../data/heart.csv')

#descriptive statistics
pandas_profiling.ProfileReport(df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



## Data Processing

In [10]:
## Remove Missing Values
data = data.dropna(how='any')
data.shape

(918, 12)

In [11]:
target = 'HeartDisease'
data[target].value_counts()

1    508
0    410
Name: HeartDisease, dtype: int64

In [12]:
feats = [feat for feat in data.columns if feat!=target]
feats

['Age',
 'Sex',
 'ChestPainType',
 'RestingBP',
 'Cholesterol',
 'FastingBS',
 'RestingECG',
 'MaxHR',
 'ExerciseAngina',
 'Oldpeak',
 'ST_Slope']

In [13]:
cont_feats = [col for col in feats if col in data.describe().columns]
cont_feats

['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

In [14]:
cat_feats = [feat for feat in feats if feat not in cont_feats]
cat_feats

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [15]:
## Onehot_encoding
data_onehot = pd.get_dummies(data[cat_feats], drop_first=True)
df = pd.concat([data[cont_feats], data_onehot, data[target]], axis=1)
df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,HeartDisease
0,40,140,289,0,172,0.0,1,1,0,0,1,0,0,0,1,0
1,49,160,180,0,156,1.0,0,0,1,0,1,0,0,1,0,1
2,37,130,283,0,98,0.0,1,1,0,0,0,1,0,0,1,0
3,48,138,214,0,108,1.5,0,0,0,0,1,0,1,1,0,1
4,54,150,195,0,122,0.0,1,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0,0,1,1,0,0,1,0,1
914,68,144,193,1,141,3.4,1,0,0,0,1,0,0,1,0,1
915,57,130,131,0,115,1.2,1,0,0,0,1,0,1,1,0,1
916,57,130,236,0,174,0.0,0,1,0,0,0,0,0,1,0,1


### Train/Test Split

In [16]:
## Train/test split
from sklearn.model_selection import train_test_split
feats = [col for fol in df.columns if col != target]
X = df.drop(target, axis=1)
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((734, 15), (184, 15), (734,), (184,))

### Outlier Detection

In [17]:
# train the COPOD detector
from pyod.models.copod import COPOD
clf = COPOD()
clf.fit(X_train)

# get outlier scores
sum(clf.predict(X_train))

74

In [18]:
# train the IForest detector
from pyod.models.iforest import IForest
clf = IForest()
clf.fit(X_train)

# get outlier scores
sum(clf.predict(X_train))

74

In [19]:
# train the COPOD detector
from pyod.models.knn import KNN
od = KNN()
od.fit(X_train)

# get outlier scores
sum(od.predict(X_train))

60

In [20]:
X_train['outlier'] = od.predict(X_train)
X_train = X_train[X_train['outlier']==0]
X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['outlier'] = od.predict(X_train)


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,outlier
852,43,120,177,0,120,2.5,1,0,0,0,0,0,1,1,0,0
121,52,125,272,0,139,0.0,0,0,1,0,1,0,0,0,1,0
664,65,150,225,0,114,1.0,0,0,0,0,0,0,0,1,0,0
187,41,120,237,1,138,1.0,1,0,0,0,1,0,1,1,0,0
108,50,140,129,0,135,0.0,1,0,0,0,1,0,0,0,1,0


In [21]:
train_inds = X_train.index
y_train = y_train.loc[train_inds]
X_train.shape, y_train.shape

((674, 16), (674,))

In [22]:
X_test['outlier'] = od.predict(X_test)
X_test = X_test[X_test['outlier']==0]
X_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['outlier'] = od.predict(X_test)


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up,outlier
900,58,114,318,0,140,4.4,1,0,0,0,0,1,0,0,0,0
570,56,128,223,0,119,2.0,1,0,0,0,0,1,1,0,0,0
791,51,140,298,0,122,4.2,1,0,0,0,1,0,1,1,0,0
191,50,170,209,0,116,0.0,1,1,0,0,0,1,0,0,1,0
643,58,112,230,0,165,2.5,1,0,1,0,0,0,0,1,0,0


In [23]:
test_inds = X_test.index
y_test = y_test.loc[test_inds]
X_test.shape, y_test.shape

((165, 16), (165,))

### Scaling

In [24]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
X_train.shape, X_test.shape

((674, 16), (165, 16))

## Modeling

### Logistic Regression

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
model = LogisticRegression()
cvs = [KFold(n_splits=5, random_state=1, shuffle=True), StratifiedKFold(n_splits=5, random_state=1, shuffle=True)]
for cv in cvs:
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    mean_score = np.mean(scores)
    print(str(cv) + ": " + str(mean_score))

KFold(n_splits=5, random_state=1, shuffle=True): 0.8627167973390355
StratifiedKFold(n_splits=5, random_state=1, shuffle=True): 0.8693276312663339


### KNN

In [49]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
cvs = [KFold(n_splits=5, random_state=1, shuffle=True), StratifiedKFold(n_splits=5, random_state=1, shuffle=True)]
for cv in cvs:
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    mean_score = np.mean(scores)
    print(str(cv) + ": " + str(mean_score))

KFold(n_splits=5, random_state=1, shuffle=True): 0.7059218341648847
StratifiedKFold(n_splits=5, random_state=1, shuffle=True): 0.6938702779757662


### LGBM


In [25]:
from lightgbm import LightGBMClassifier
model = LightGBMClassifier()
cvs = [KFold(n_splits=5, random_state=1, shuffle=True), StratifiedKFold(n_splits=5, random_state=1, shuffle=True)]
for cv in cvs:
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    mean_score = np.mean(scores)
    print(str(cv) + ": " + str(mean_score))

ModuleNotFoundError: No module named 'lightgbm'

In [26]:
!pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-3.3.2.tar.gz (1.5 MB)
Building wheels for collected packages: lightgbm
  Building wheel for lightgbm (setup.py) ... [?25lerror
[31m  ERROR: Command errored out with exit status 1:
   command: /Users/wlee/opt/anaconda3/bin/python -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/private/var/folders/k3/klldpxyd4jjg0yszf0qbsrmm0000gn/T/pip-install-kq88vyz8/lightgbm/setup.py'"'"'; __file__='"'"'/private/var/folders/k3/klldpxyd4jjg0yszf0qbsrmm0000gn/T/pip-install-kq88vyz8/lightgbm/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d /private/var/folders/k3/klldpxyd4jjg0yszf0qbsrmm0000gn/T/pip-wheel-io3h2u4p
       cwd: /private/var/folders/k3/klldpxyd4jjg0yszf0qbsrmm0000gn/T/pip-install-kq88vyz8/lightgbm/
  Complete output (76 lines):
  running bdist_wheel
  running build
  running build_py
  creatin

[?25h