# 05-Logistic Regression: Problems

## 모듈 불러오기

In [None]:
# 데이터 전처리 패키지
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# 기계학습 모델 구축 및 평가 패키지
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, accuracy_score, confusion_matrix, recall_score, precision_score

# 데이터 시각화 패키지
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# # 한글 폰트 설정
# plt.rc('font', family='Malgun Gothic')

# !git clone https://github.com/KU-DIC/LG_time_series_day03.git

## 데이터 불러오기

### Parkinson 질병 예측
: This dataset is composed of a range of biomedical voice measurements from 31 people, 23 with Parkinson's disease (PD). Each column in the table is a particular voice measure, and each row corresponds to one of 195 voice recordings from these individuals ("name" column). The main aim of the data is to discriminate healthy people from those with PD, according to the "status" column, which is set to 0 for healthy and 1 for PD.

The data is in ASCII CSV format. Each row of the CSV file contains an instance corresponding to one voice recording. There are around six recordings per patient; the name of the patient is identified in the first column. For further information or to pass on comments, please contact Max Little (littlem '@' robots.ox.ac.uk).

Further details are contained in the following reference -- if you use this dataset, please cite:
Max A. Little, Patrick E. McSharry, Eric J. Hunter, Lorraine O. Ramig (2008), 'Suitability of dysphonia measurements for telemonitoring of Parkinson's disease', IEEE Transactions on Biomedical Engineering (to appear).

### 타겟변수(Y): status (1:Parkinson, 0:Healthy)

### 0: 정상 / 1: 환자

In [None]:
# Read the Parkinson's voice-measurement CSV into a DataFrame (fill in the file path).
data = pd.read_csv('''Answer''')
# Preview the first rows to confirm the file loaded as expected.
data.head()

## 데이터 전처리

### 데이터 행과 열 개수 확인

In [None]:
# Check the number of rows and columns of the loaded DataFrame (fill in the attribute).
data.'''Answer'''

### Column별 기초 통계량 확인

In [None]:
# Show basic summary statistics for each column (fill in the DataFrame method name).
data.'''Answer'''()

### 필요하지 않은 변수 제거

In [None]:
# Remove the variable(s) that are not needed for modelling (fill in the drop arguments).
# NOTE(review): presumably the "name" identifier column described above -- confirm.
data = data.drop('''Answer''')

### 설명변수(X), 반응변수(y) 나누기

In [None]:
# X: explanatory variables; y: response variable (the "status" target, 1 = Parkinson, 0 = healthy).
X = '''Answer'''
y = '''Answer'''

# Show the first three rows of the explanatory variables as a quick sanity check.
display(X.head(3))

### 변수별 target y 값 분포 확인

In [None]:
# Split the feature rows by class so the two distributions can be overlaid (fill in both subsets).
parkinson = '''Answer'''
healthy =  '''Answer'''

plt.figure(figsize=[20,15])

# One panel per column for the first 22 columns of X, laid out on a 6x4 grid.
for col in range(22):
    plt.subplot(6,4,col+1)
    # bins=20 divides each column's range into 20 bars; alpha (0-1) sets the
    # transparency so both classes stay visible where they overlap.
    # The label is attached to each histogram directly so the legend can never
    # disagree with the plotting order (the positional legend list previously
    # labelled the Parkinson bars as healthy and vice versa).
    plt.hist(parkinson.iloc[:,col], bins=20, alpha=0.3, label='Parkinson')
    plt.hist(healthy.iloc[:,col], bins=20, alpha=0.3, label='Healthy')
    plt.title(X.columns[col])
    plt.legend()
    plt.xticks([])

### Training 데이터 70% / Testing 데이터 30% 나누기

In [None]:
# Split into 70% training data and 30% testing data (fill in the arguments).
X_train, X_test, y_train, y_test = train_test_split('''Answer''')

### 데이터 전처리 (정규화)

In [None]:
# Scaler used to normalize the features (fill in).
# NOTE(review): presumably the StandardScaler imported at the top of the notebook -- confirm.
normalizer = '''Answer'''
# Fit the scaler and transform the data in a single step (fill in the data to fit on).
X_train = normalizer.fit_transform('''Answer''')
# Apply the scaler to produce the test features (fill in the method call).
X_test = normalizer.'''Answer'''

## 모델링

In [None]:
# Create the logistic regression model with its default settings
model = LogisticRegression()

# Train the model (fill in the training inputs)
model.fit('''Answer''')

## 모델 결과물 확인 (변수별 beta, exp(beta) 확인)

In [None]:
# Collect beta and exp(beta) for the intercept and for every column of X.
column_name = ["const"] + X.columns.tolist()
# Keep beta at full precision: it is reused in the next section to reproduce
# predict_proba by hand, and coefficients rounded to two decimals would make
# the manual probabilities drift away from the package result.
beta = np.concatenate([model.intercept_,model.coef_.reshape('''Answer''')])
odds = np.exp(beta)  # exp(beta), computed from the unrounded coefficients
# Classify on the sign of the real coefficient, so a small positive beta is
# not mislabelled just because it would round to 0.00.
interpret = np.where(beta>0,'risky','protective')

# Summary table of the beta interpretation; values are rounded to two decimals for display only.
beta_analysis = pd.DataFrame(np.c_[beta.round(2),odds.round(2),interpret],index=column_name, columns=['beta','exp(beta)','interpret'])
beta_analysis

### Numpy를 통해 계산한 결과와 model.predict_proba() 함수를 사용한 결과 비교

In [None]:
# Second column of predict_proba's output for the chosen inputs (fill in);
# presumably P(Y=1) -- confirm against model.classes_.
probs = model.predict_proba('''Answer''')[:,1]

## Compute X times beta; np.c_ is used to supply the constant (intercept) column (fill in)
Xbeta = np.matmul(np.c_['''Answer'''],beta.reshape(-1,1))

## Compute P(Y=1) by applying the logistic (sigmoid) function to X*beta
P_1 = 1 / (1+np.exp(-Xbeta))

## Compare the manual result ('직접') with the package result ('패키지') for the first 10 rows
pd.DataFrame(np.concatenate([P_1,probs.reshape(-1,1)],axis=1),columns=['직접','패키지'])[:10]

### 산출한 예측 결과값의 성능을 Cut off 값을 변경하며 Accuracy 관찰할 것.

In [None]:
# 11 evenly spaced cutoff values from 0 to 1 (step 0.1).
Cut_off = np.linspace(0,1,11)

## Evaluate the predictions at each of the prepared cutoff values
for cutoff in Cut_off:
    y_pred = np.where(probs.reshape(-1)>=cutoff,1,0) # 1 (Parkinson) when the probability is >= cutoff, otherwise 0 (healthy)
    # Fill in the metric functions: accuracy, recall (sensitivity) and precision.
    acc = '''Answer'''(y_true = y_test, y_pred =y_pred)
    recall = '''Answer'''(y_true = y_test, y_pred =y_pred)
    precision = '''Answer'''(y_true = y_test, y_pred =y_pred)
    
    print(f"정확도:{acc:0.2f}",f"민감도:{recall:0.2f}",f"정밀도:{precision:0.2f}",f"cut off:{cutoff:0.2f}", sep='  |  ')  

### 최종 Cut-off에 대해 Confusion Matrix 그려보기

In [None]:
# Cutoff chosen for the final evaluation.
best_cut_off = 0.5
# Label a sample 1 when its value is >= best_cut_off, otherwise 0 (fill in the array to threshold).
y_pred = np.where('''Answer'''.reshape(-1)>=best_cut_off,1,0)
# Confusion matrix of true vs. predicted test labels (fill in the metric function).
cm = '''Answer'''(y_true = y_test, y_pred =y_pred)

In [None]:
# Render the confusion matrix as an annotated heatmap: rows are the true
# classes, columns are the predicted classes, and each cell shows an integer count.
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=['Healthy', 'Parkinson'],
    yticklabels=['Healthy', 'Parkinson'],
)
# Put the cutoff that produced these predictions in the title.
plt.title(f"Cut_off : {best_cut_off:.2f}")
plt.xlabel('Predicted class')
plt.ylabel('True class')
plt.show()

### ROC Curve 그려보기

In [None]:
# Extract the ROC curve's x-axis (false positive rate) and y-axis (true positive rate) values
model_fpr, model_tpr, threshold1 = roc_curve(y_test, probs)
# Reference curve: every test sample is given the same constant score of 0
random_fpr, random_tpr, threshold2 = roc_curve(y_test, [0 for i in range(len(X_test))])

# plot the roc curve for the model and random
plt.figure('''Answer''')
plt.plot('''Answer''')
plt.plot('''Answer''')

# axis labels
plt.'''Answer'''('False Positive Rate',size=20)  # X axis labels
plt.'''Answer'''('True Positive Rate',size=20)  # Y axis labels
 
# show the legend
plt.'''Answer'''(fontsize=20)
 
# set the title and show the plot
plt.title("ROC curve",size=20)
plt.show()