# PIMA DATA

<img src='https://storage.googleapis.com/kaggle-datasets-images/228/482/a520351269b547c89afe790820a1087e/dataset-cover.jpeg'>
* ref : kaggle  : https://www.kaggle.com/uciml/pima-indians-diabetes-database

#### 환자가 당뇨병을 가지고 있는지 예측 --> 분류문제

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split   # def    # 데이터 분할 함수
from sklearn.tree import DecisionTreeClassifier        # class
from sklearn.ensemble import RandomForestClassifier    # class
from sklearn.metrics import accuracy_score             # def    # 평가 지표 함수
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve
from sklearn.metrics import roc_curve, roc_auc_score, plot_roc_curve
from sklearn.preprocessing import Binarizer

import warnings
warnings.filterwarnings(action='ignore')

sns.set()

## 1. Data Collection

In [None]:
# Load the dataset from pima.csv in the current working directory into a DataFrame.
pima = pd.read_csv("./pima.csv")

#### 데이터 컬럼
* Pregnancies(임신 횟수)

* Glucose(포도당)
: 경구 포도당 내성 검사에서 2시간 동안의 혈장 포도당 농도 / 포도당 부하 검사 수치
* BloodPressure(혈압)
: 이완기 혈압(mmHg)
* SkinThickness(피부두께)
: 삼두근 피부 주름 두께(mm) / 삼두근 뒤쪽의 피하지방 측정값(mm)
* Insulin(인슐린)
: 2시간 혈청 인슐린(mu U/ml)
* BMI(체질량지수)
: 체질량지수(체중(kg)/(키(m))^2)
* DiabetesPedigreeFunction(당뇨병 가계기능)
: 당뇨병 가계도 기능 / 당뇨 내력 가중치 값
* Age(나이)
: 나이
* Outcome(결과)
: 클래스 결정 값(0 또는 1)

## 2. EDA(Exploratory Data Analysis)

In [None]:
# Preview the first rows of the data
pima.head()

In [None]:
# Size of the data as (rows, columns)
pima.shape

In [None]:
# Data format: column dtypes and non-null counts
pima.info()

In [None]:
# Summary statistics of the data
pima.describe()

In [None]:
# Apply seaborn's default styling, then draw per-column histograms of the raw data.
sns.set()
pima.hist()
plt.show()

# 공통함수 

In [None]:
# def my_eval(y_val, pred, proba, th,  avg=None):  # avg='binary'
#     accuracy = accuracy_score(y_val, pred)
#     f1 = f1_score(y_val, pred, average=avg)
#     precision = precision_score(y_val, pred, average=avg)
#     recall = recall_score(y_val, pred, average=avg)
#     con_matrix = confusion_matrix(y_val, pred)
#     # cls_report = classification_report(y_val, pred)
#     print(f'정확도{accuracy:.4f} 정밀도:{precision:.4f} 재현률:{recall:.4f} f1:{f1:.4f}')
#     print("오차행렬:\n", con_matrix)

#     # plot_precision_recall_curve(rf_model, X_val, y_val)
#     precisions, recalls, ths = precision_recall_curve(y_val, proba[:, 1])
#     plt.title("precision recall curv %0.2f  %0.4f"%(th, f1))
#     plt.xlabel("threadhold")
#     plt.ylabel("value")
#     plt.grid()
#     plt.plot(ths, precisions[:ths.shape[0]], 'b', label="precision")
#     plt.plot(ths, recalls[:ths.shape[0]], 'r', linestyle="--", label="recall")
#     plt.legend()
#     plt.show()

#     auc_score = roc_auc_score(y_val, proba[:,1])
#     fprs, tprs, ths = roc_curve(y_val, proba[:,1])
#     plt.title("roc curv %0.4f" % auc_score)
#     plt.xlabel("FPR(1-specificity)")
#     plt.ylabel("TPR")
#     plt.grid()
#     plt.plot(fprs, tprs, 'b')
#     plt.plot([0,1],[0,1],'--',color="black")
#     plt.show()



# def split_fit_score(X, y, model=None, test_size=0.2):
#     X_train, X_val, y_train, y_val = train_test_split(X, y,
#                                                       test_size=test_size,
#                                                       random_state=1414,
#                                                       shuffle=False)
#     model.fit(X_train, y_train)
#     pred = model.predict(X_val)                   # [0 1] : 0 1 0 1 1 1
#     proba = model.predict_proba(X_val)            # [.7 >.3] [.4 <.6]  th:0.5-->[0 1] : 컴퓨터가 낸 답안
#     print(proba[:5])
#     proba_positive = proba[:, 1]  # 양성 중에 비정상인?
#     th_list = [0.35, 0.38, 0.42, 0.45, 0.5]
#     for th in th_list:
#          biz = Binarizer(threshold=th)            # [[.7 .3] [.4 .6]]  th:0.2
#          res = biz.fit_transform(proba_positive.reshape(-1,1)) # [ th<.3] [ th<.6]]  ->[1,1](컴퓨터가 예측)컴퓨터가 낸 답안=res
#          print(th)
#          my_eval(y_val, res, proba, th, avg='macro')

#          # plot_roc_curve(model, X_val, y_val)
#          # plt.show()


#     # print(proba[:5], pred[:5])
#     # my_eval(y_val, res, avg='macro')  # ----------점수def호출

# #-----------------------------------------------------------
# rf_model = RandomForestClassifier(n_estimators=500, random_state=1414)
# y = pima["Outcome"]
# X = pima.drop("Outcome", axis=1)   # 문제지: outcome 뺸 나머지

# split_fit_score(X, y, rf_model)

## 3. 학습 모델 선정 : 우선 점수부터 보자
* 숫자로만 이루어져있다
* 결측 X

### Confusion_matrix
<pre>
        예측
  실  TN | FP
  제  -------  
  값  FN | TP
 </pre>

| | |  |
|-|-|:-|
|정확도| $accuracy = \frac{TN+TP}{TP+FP+TN+FN}$|전체 샘플 중 맞게 예측한 비율|
|정밀도| $precision = \frac{TP}{TP+FP}$|True로 예측한 것 중 실제 True인 비율|
|재현율| $recall = \frac{TP}{TP+FN}$|실제 True 중 예측이 True인 비율|
|F1| $f1 = 2*\frac{precision * recall}{precision + recall}$| 정밀도 재현율 조화 평균 |


In [None]:
def my_eval(y_val, pred, avg=None, proba=None, th=None):  # avg=None -> 'binary'
    """Print classification metrics and optionally plot PR and ROC curves.

    Args:
        y_val: True labels of the validation set.
        pred: Predicted labels scored against ``y_val``.
        avg: ``average`` argument forwarded to f1/precision/recall. ``None``
            is treated as ``'binary'``; with ``average=None`` the metrics are
            per-class arrays, which the ``:.4f`` format below cannot render.
        proba: Optional predicted probabilities. A 2-D array is read at
            column 1 (positive class); a 1-D array is used as-is. When
            omitted, only the text metrics are printed and no curve is drawn.
        th: Optional decision threshold, shown only in the PR plot title.

    Returns:
        None. Results are printed and plotted.
    """
    if avg is None:
        avg = 'binary'

    accuracy = accuracy_score(y_val, pred)
    f1 = f1_score(y_val, pred, average=avg)
    precision = precision_score(y_val, pred, average=avg)
    recall = recall_score(y_val, pred, average=avg)
    con_matrix = confusion_matrix(y_val, pred)
    print(f'정확도{accuracy:.4f} 정밀도:{precision:.4f} 재현률:{recall:.4f} f1:{f1:.4f}')
    print("오차행렬:\n", con_matrix)

    # The curves need probability scores; previously the function read the
    # undefined names `proba` and `th`, so it failed unless they were globals.
    if proba is None:
        return

    scores = np.asarray(proba)
    if scores.ndim == 2:
        scores = scores[:, 1]

    # Precision and recall as functions of the decision threshold.
    precisions, recalls, pr_ths = precision_recall_curve(y_val, scores)
    if th is None:
        plt.title("precision recall curv %0.4f" % f1)
    else:
        plt.title("precision recall curv %0.2f  %0.4f" % (th, f1))
    plt.xlabel("threadhold")
    plt.ylabel("value")
    plt.grid()
    # precisions/recalls carry one more element than the thresholds; trim to align.
    plt.plot(pr_ths, precisions[:pr_ths.shape[0]], 'b', label="precision")
    plt.plot(pr_ths, recalls[:pr_ths.shape[0]], 'r', linestyle="--", label="recall")
    plt.legend()
    plt.show()

    # ROC curve with the chance diagonal as a reference.
    auc_score = roc_auc_score(y_val, scores)
    fprs, tprs, _ = roc_curve(y_val, scores)
    plt.title("roc curv %0.4f" % auc_score)
    plt.xlabel("FPR(1-specificity)")
    plt.ylabel("TPR")
    plt.grid()
    plt.plot(fprs, tprs, 'b')
    plt.plot([0, 1], [0, 1], '--', color="black")
    plt.show()

In [None]:
def my_eval(y_val, pred, avg=None, proba=None, th=None):  # avg=None -> 'binary'
    """Print classification metrics and optionally plot PR and ROC curves.

    Args:
        y_val: True labels of the validation set.
        pred: Predicted labels scored against ``y_val``.
        avg: ``average`` argument forwarded to f1/precision/recall. ``None``
            is treated as ``'binary'``; with ``average=None`` the metrics are
            per-class arrays, which the ``:.4f`` format below cannot render.
        proba: Optional predicted probabilities. A 2-D array is read at
            column 1 (positive class); a 1-D array is used as-is. When
            omitted, only the text metrics are printed and no curve is drawn.
        th: Optional decision threshold, shown only in the PR plot title.

    Returns:
        None. Results are printed and plotted.
    """
    if avg is None:
        avg = 'binary'

    accuracy = accuracy_score(y_val, pred)
    f1 = f1_score(y_val, pred, average=avg)
    precision = precision_score(y_val, pred, average=avg)
    recall = recall_score(y_val, pred, average=avg)
    con_matrix = confusion_matrix(y_val, pred)
    print(f'정확도{accuracy:.4f} 정밀도:{precision:.4f} 재현률:{recall:.4f} f1:{f1:.4f}')
    print("오차행렬:\n", con_matrix)

    # The curves need probability scores; previously the function read the
    # undefined names `proba` and `th`, so it failed unless they were globals.
    if proba is None:
        return

    scores = np.asarray(proba)
    if scores.ndim == 2:
        scores = scores[:, 1]

    # Precision and recall as functions of the decision threshold.
    precisions, recalls, pr_ths = precision_recall_curve(y_val, scores)
    if th is None:
        plt.title("precision recall curv %0.4f" % f1)
    else:
        plt.title("precision recall curv %0.2f  %0.4f" % (th, f1))
    plt.xlabel("threadhold")
    plt.ylabel("value")
    plt.grid()
    # precisions/recalls carry one more element than the thresholds; trim to align.
    plt.plot(pr_ths, precisions[:pr_ths.shape[0]], 'b', label="precision")
    plt.plot(pr_ths, recalls[:pr_ths.shape[0]], 'r', linestyle="--", label="recall")
    plt.legend()
    plt.show()

    # ROC curve with the chance diagonal as a reference.
    auc_score = roc_auc_score(y_val, scores)
    fprs, tprs, _ = roc_curve(y_val, scores)
    plt.title("roc curv %0.4f" % auc_score)
    plt.xlabel("FPR(1-specificity)")
    plt.ylabel("TPR")
    plt.grid()
    plt.plot(fprs, tprs, 'b')
    plt.plot([0, 1], [0, 1], '--', color="black")
    plt.show()

In [None]:
def my_eval(y_val, pred, avg=None, proba=None, th=None):  # avg=None -> 'binary'
    """Print classification metrics and optionally plot PR and ROC curves.

    Args:
        y_val: True labels of the validation set.
        pred: Predicted labels scored against ``y_val``.
        avg: ``average`` argument forwarded to f1/precision/recall. ``None``
            is treated as ``'binary'``; with ``average=None`` the metrics are
            per-class arrays, which the ``:.4f`` format below cannot render.
        proba: Optional predicted probabilities. A 2-D array is read at
            column 1 (positive class); a 1-D array is used as-is. When
            omitted, only the text metrics are printed and no curve is drawn.
        th: Optional decision threshold, shown only in the PR plot title.

    Returns:
        None. Results are printed and plotted.
    """
    if avg is None:
        avg = 'binary'

    accuracy = accuracy_score(y_val, pred)
    f1 = f1_score(y_val, pred, average=avg)
    precision = precision_score(y_val, pred, average=avg)
    recall = recall_score(y_val, pred, average=avg)
    con_matrix = confusion_matrix(y_val, pred)
    print(f'정확도{accuracy:.4f} 정밀도:{precision:.4f} 재현률:{recall:.4f} f1:{f1:.4f}')
    print("오차행렬:\n", con_matrix)

    # The curves need probability scores; previously the function read the
    # undefined names `proba` and `th`, so it failed unless they were globals.
    if proba is None:
        return

    scores = np.asarray(proba)
    if scores.ndim == 2:
        scores = scores[:, 1]

    # Precision and recall as functions of the decision threshold.
    precisions, recalls, pr_ths = precision_recall_curve(y_val, scores)
    if th is None:
        plt.title("precision recall curv %0.4f" % f1)
    else:
        plt.title("precision recall curv %0.2f  %0.4f" % (th, f1))
    plt.xlabel("threadhold")
    plt.ylabel("value")
    plt.grid()
    # precisions/recalls carry one more element than the thresholds; trim to align.
    plt.plot(pr_ths, precisions[:pr_ths.shape[0]], 'b', label="precision")
    plt.plot(pr_ths, recalls[:pr_ths.shape[0]], 'r', linestyle="--", label="recall")
    plt.legend()
    plt.show()

    # ROC curve with the chance diagonal as a reference.
    auc_score = roc_auc_score(y_val, scores)
    fprs, tprs, _ = roc_curve(y_val, scores)
    plt.title("roc curv %0.4f" % auc_score)
    plt.xlabel("FPR(1-specificity)")
    plt.ylabel("TPR")
    plt.grid()
    plt.plot(fprs, tprs, 'b')
    plt.plot([0, 1], [0, 1], '--', color="black")
    plt.show()

In [None]:
# rf_모델점수 0.7597402597402597
# dt_모델점수 0.7142857142857143

In [None]:
from sklearn.metrics import precision_recall_curve

# NOTE(review): y_val and proba are not assigned anywhere in the visible part
# of this file -- confirm they exist in the session before running this cell.
precision, recall, th = precision_recall_curve(y_val, proba[:,1])

# precision/recall carry one more element than th, so the last point is
# dropped to line the curves up with the thresholds.
plt.plot(th, precision[:-1], 'b', label="precision")
plt.plot(th, recall[:-1], 'r', linestyle="--", label="recall")

# Was `pp.title(...)`: `pp` is never defined, so the call raised NameError.
plt.title("Precision-Recall curv")
plt.legend()
plt.show()


In [None]:
## f1_score() 변경을 위한 임계치 조정

In [None]:
from sklearn.preprocessing import Binarizer

# Small toy matrix used to show how thresholding turns values into 0/1.
X = [
    [-1, 1, 0],
    [1, 2, 3],
    [2, 3, 4],
]
# The threshold controls the 0/1 split: values greater than it map to 1.
binz = Binarizer(threshold=1)
print(binz.fit(X).transform(X))

In [None]:
## recall(재현율) 점수 올리기
# th2 --> th:1[임계치를 낮추면]
# [0 1 1] --> [1 1 1] 즉, 양성 데이터가 늘었다 -->24수치를 줄인다.


## 4. 결측 확인(Missing Value)

In [None]:
## 4. 결측 확인(Missing Value)

In [None]:
# A value of 0 is treated as a missing value: count the zeros in each column.
(pima == 0).sum()

In [None]:
## 5. EDA & Engineering
* 이상치 데이터 : 0
* 왜도(skew) : 정규분포화, 표준화

In [None]:
# List the column labels of the DataFrame.
pima.columns

In [None]:
# Columns whose 0 entries are inspected below as placeholder (missing) values.
zero_cols = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]

In [None]:
# Number of rows whose Glucose reading is exactly 0.
pima.loc[pima["Glucose"] == 0, "Glucose"].count()

In [None]:
# For each inspected column, print how many entries are 0 and how many are not.
for col in zero_cols:
    zero = pima.loc[pima[col] == 0, col].count()
    zero_n = pima.loc[pima[col] != 0, col].count()
    print(col, zero, zero_n)

In [None]:
# Ratio: percentage of zero and non-zero entries per column (3 decimals).
n_rows = len(pima)
for col in zero_cols:
    z_rate = round(pima.loc[pima[col] == 0, col].count() / n_rows * 100, 3)
    z_n_rate = round(pima.loc[pima[col] != 0, col].count() / n_rows * 100, 3)
    print(col, z_rate, z_n_rate)

####  zero_cols : (정상/비정상(0/1)   &  나이구간별) 평균 --> 0 채우기

In [None]:
# Overall mean of BloodPressure; zeros have not been replaced at this point,
# so they are included in the average.
pima['BloodPressure'].mean()

In [None]:
# Summary statistics of Age, used to choose the age-band edges.
pima['Age'].describe()

In [None]:
# Selection pattern: df[condition][column]
#
# Ages were noted as ranging from min 21 to max 81. A simpler alternative
# that was considered: pima['Age_band'] = pima['Age'] // 20
#
# Bucket Age into labelled bands. pd.cut bins are right-inclusive by default,
# i.e. (0,20], (20,24], (24,29], (29,41], (41,81], (81,150].
age_edges = [0, 20, 24, 29, 41, 81, 150]
pima['Age_band'] = pd.cut(
    pima['Age'],
    bins=age_edges,
    labels=list(range(len(age_edges) - 1)),
)

pima[['Age', 'Age_band']].head()
pima['Age_band'].unique()

In [None]:
# Mean blood pressure per Outcome (normal / abnormal) and age band,
# using only rows whose BloodPressure is greater than 0.
# SQL analogue: select Outcome, Age_band, avg(BloodPressure) from pima
#               where BloodPressure > 0 group by Outcome, Age_band;
# pandas pattern: df[condition].groupby(keys)[column]
(
    pima.loc[pima['BloodPressure'] > 0]
    .groupby(['Outcome', 'Age_band'])['BloodPressure']
    .mean()
)

In [None]:
# Per-Outcome, per-age-band mean of every column in zero_cols
# ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'),
# computed over rows where that column is greater than 0.
for col in zero_cols:
    positive_rows = pima.loc[pima[col] > 0]
    print(col, positive_rows.groupby(['Outcome', 'Age_band'])[col].mean())
    print('---' * 15)

In [None]:
## 6. 이상값(0) 채우기
=> 구간화, df.fillna(호칭별 그룹 평균값, inplace=True)
<pre>
BMI             : Outcome
SkinThickness   : Outcome  
-------------------------------------
Insulin         : Outcome  Age_band
BloodPressure   : Outcome  Age_band
Glucose         : Outcome  Age_band
</pre>

In [None]:
# Turn the 0 placeholders into NaN so that fillna can impute them afterwards.
pima[zero_cols] = pima[zero_cols].replace(to_replace=0, value=np.nan)

# Handy check once this has run: df.isna().sum()

In [None]:
# Impute each column's NaNs with the mean of the group the row belongs to.
# BMI and SkinThickness are grouped by Outcome only; Insulin, BloodPressure
# and Glucose are grouped by Outcome and Age_band.
fill_group_keys = {
    "BMI": "Outcome",
    "SkinThickness": "Outcome",
    "Insulin": ["Outcome", "Age_band"],
    "BloodPressure": ["Outcome", "Age_band"],
    "Glucose": ["Outcome", "Age_band"],
}
for fill_col, group_keys in fill_group_keys.items():
    # transform('mean') returns a Series aligned to pima's index, so fillna
    # picks, for every missing row, the mean of that row's own group.
    group_mean = pima.groupby(by=group_keys)[fill_col].transform('mean')
    pima[fill_col] = pima[fill_col].fillna(group_mean)

In [None]:
# Count the NaN values remaining in each column after imputation.
pima.isna().sum()

In [None]:
# Re-draw the per-column histograms to inspect the distributions after imputation.
pima.hist()
plt.show()

## 7. 2차 점수 확인 : scaling

In [None]:
# pima is now the data with all zero placeholders handled.
# NOTE(review): split_fit_score and rf_model appear only in commented-out code
# in the visible part of this file -- confirm both are defined before running.
# NOTE(review): X still contains the derived Age_band column; confirm intended.
X = pima.drop(columns="Outcome")
y = pima["Outcome"]
print("2차")
split_fit_score(X, y, rf_model)  # recorded score: 0.8441558441558441