In [47]:
# Import pandas, numpy
import pandas as pd
import numpy as np

# 분류문제를 위한 모델 import
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 분류모델 평가를 위한 accuracy_score import
from sklearn.metrics import accuracy_score
# 분류모델 평가를 위한 train/test set 분리를 위한 train_test_split import
from sklearn.model_selection import train_test_split
# 정규화를 위한 StandardScaler import
from sklearn.preprocessing import StandardScaler

# Fix every random source to a single seed so that a full re-run is repeatable
import os
import random

seed = 42

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only influences child processes — confirm.
os.environ["PYTHONHASHSEED"] = str(seed)

# Seed NumPy's global generator and the standard-library generator
np.random.seed(seed)
random.seed(seed)

In [48]:
# Fit several classifiers with (mostly) default settings to compare them.
# The data is split into train/test parts; each model's accuracy_score on the
# test part is appended to the `result` list.
def _classification(x, y, result:list, model_random_state=None):
    """Train baseline classifiers on a stratified 80/20 split and record test accuracy.

    Parameters
    ----------
    x
        Feature table passed to ``train_test_split`` and the estimators.
    y
        Class labels aligned with ``x``; also used to stratify the split.
    result : list
        Mutated in place. One ``{'model': str(model), 'score': accuracy}``
        dict is appended per classifier, in the order the models are listed.
    model_random_state : optional, default None
        Forwarded as ``random_state`` to the decision tree and the random
        forest. With the default ``None`` those estimators are built exactly
        as before; pass an int to make their scores independent of the global
        RNG state (and therefore of cell execution order).

    Returns
    -------
    list
        The same ``result`` object, returned for convenience.
    """
    # The split itself is always reproducible: fixed random_state, stratified on y.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1, stratify=y
    )

    models = [
        KNeighborsClassifier(),
        KNeighborsClassifier(weights='distance'),
        LogisticRegression(solver='newton-cg'),
        LinearDiscriminantAnalysis(),
        DecisionTreeClassifier(random_state=model_random_state),
        RandomForestClassifier(random_state=model_random_state),
    ]

    for model in models:
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        # accuracy_score is documented as (y_true, y_pred): ground truth goes first.
        result.append({'model': str(model), 'score': accuracy_score(y_test, y_pred)})

    return result

In [49]:
# Load the data; the training file's 'index' column is dropped right away
test = pd.read_csv('test.csv')
data = pd.read_csv('train.csv').drop(columns='index')

# Outlier removal: keep only the rows where neither thal nor ca equals -1
data = data[(data['thal'] != -1) & (data['ca'] != -1)].copy()

# Collapse every target value above 1 into 1
# (original note: whenever heart disease is present the label becomes 1)
data.loc[data['target'] > 1, 'target'] = 1

# Labels, training features, and the test features without their 'index' column
y = data['target']
x = data.drop(columns=['target'])
test = test.drop(columns=['index'])


In [51]:
# Gather one accuracy record per baseline model, then print them as a table
result = list()
_classification(x, y, result)
print(pd.DataFrame(data=result))

                                      model     score
0                    KNeighborsClassifier()  0.708333
1  KNeighborsClassifier(weights='distance')  0.666667
2    LogisticRegression(solver='newton-cg')  0.791667
3              LinearDiscriminantAnalysis()  0.770833
4                  DecisionTreeClassifier()  0.729167
5                  RandomForestClassifier()  0.729167


In [52]:
# Fit the final model on the whole training set and write the submission file.
# NOTE(review): the comparison cell scored LogisticRegression(solver='newton-cg'),
# while this cell fits solver='saga'; the printed hold-out accuracy was therefore
# not measured for this exact configuration — confirm the solver choice.
submit = pd.read_csv('submit.csv')

# random_state pins the data shuffling done by the 'saga' solver, so re-running
# this cell gives the same fit regardless of the global RNG state.
model = LogisticRegression(solver='saga', max_iter=10000, random_state=seed)
model.fit(x, y)
res = model.predict(test)

# presumably the rows of submit.csv are in the same order as test.csv — confirm
submit['target'] = res
submit.to_csv('submission.csv', index=False)