# imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import warnings; warnings.filterwarnings(action='ignore')

# load datasets
from sklearn import datasets

# model_selection.splitter
from sklearn.model_selection import train_test_split # function
from sklearn.model_selection import KFold, StratifiedKFold

# model_selection.hyper_parameter_optimizer
from sklearn.model_selection import GridSearchCV

# model_selection.model_validation
from sklearn.model_selection import cross_val_predict

# models/estimators
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# preprocessing.encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # pd.get_dummies()

# preprocessing.scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# preprocessing.binarizing
from sklearn.preprocessing import Binarizer

# metrics.scores
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score

# metrics.curves
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve
from sklearn.metrics import roc_curve, plot_roc_curve

# metrics.reports
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.metrics import classification_report

# read pima diabetes

In [None]:
# Load the Pima diabetes CSV into a DataFrame and print its column summary.
csv_path = './kaggle/pima/diabetes.csv'
df = pd.read_csv(csv_path)
df.info()

# X, y

In [None]:
# Target is the 'Outcome' column; every other column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Random forest with a fixed seed so repeated runs build the same trees.
model = RandomForestClassifier(random_state=0, n_estimators=500)

# preprocessing

전처리 ; 모두 fit/transform 있음, 아래 6개만 사용

1. encoder   ; LabelEncoder, OneHotEncoder
2. scaler    ; MinMaxScaler, StandardScaler, RobustScaler
3. binarizer ; Binarizer

## encoder

In [None]:
# LabelEncoder demo
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

data = ['a', 'b', 'b', 'c', 'd']

# fit() builds the labels (sorted in dictionary order) and returns the encoder
encoder = LabelEncoder().fit(data)
encoder.transform(data)      # applies the learned labels, returns an array
encoder.fit_transform(data)  # both steps in a single call

In [None]:
# Series.map(dict): recode specific values through an explicit lookup table
# (note that 'd' has no entry in the mapping)
mapping = {'a':1, 'b':0, 'c':2}
map_df = pd.DataFrame({'chars': data})
map_df['chars_map'] = map_df['chars'].map(mapping)
map_df.loc[:, ['chars', 'chars_map']]

## scaler

In [None]:
# MinMaxScaler   ; rescales based on the minimum and maximum values
# StandardScaler ; standardizes to mean=0 and unit variance (=1)
# RobustScaler   ; scales relative to the median, using quartiles

# tree-based classification models are only slightly affected by scaling
# regression models are strongly affected by scaling

s_train = np.arange(10).reshape(-1, 1)
s_test = np.arange(5).reshape(-1, 1)

# train scaling: fit() takes an array and returns the scaler itself,
# transform() applies the fitted parameters and returns an array
scaler_f = MinMaxScaler().fit(s_train)
scaler_t = MinMaxScaler().fit(s_train)
train_f = scaler_f.transform(s_train)
train_t = scaler_t.transform(s_train)

# test scaling ; the test set must NOT be fitted.
# scaler_f is deliberately re-fitted on the test data to show the difference,
# while scaler_t keeps the parameters learned from the train data.
test_f = scaler_f.fit(s_test).transform(s_test)
test_t = scaler_t.transform(s_test)

for banner, scaled in (
    ('----- train scaling  -----\n', train_f),
    ('----- test fit & trf -----\n', test_f),
    ('----- test transform -----\n', test_t),
):
    print(banner, scaled.T)


## binarize

- threshold ; N/P를 나누는 기준 확률점 (임계치, 임계점), predict의 default는 0.5 초과
- binarize ; threshold를 기준으로 N=0, P=1로 이진화
- binarizing ; threshold를 조정하여, N/P의 비율을 조정
- 일반적으로 P의 비율이 적기 때문에, threshold를 낮춰서 점수 개선
- oversampling ; P가 적을 경우, 인위적으로 P 데이터의 절대적인 양을 늘리는 것

In [None]:
# How the N/P split changes as the threshold is adjusted

# with threshold=1: P=5, N=4 (only values greater than the threshold become 1)
T = [[-1, 1, 0],
     [ 1, 2, 3],
     [ 2, 3, 4]]
# Binarize the demo matrix T (not the diabetes features X);
# in real use the input would be predict_proba output. Returns an array.
Binarizer(threshold=1).fit_transform(T)

In [None]:
# with threshold=3: P=1, N=8 (binarize the demo matrix T, not the features X)
Binarizer(threshold=3).fit_transform(T)

# scoring & plotting

In [None]:
def pr_curve(y_val, probas_pred):
    """Plot precision, recall and F1 against the decision threshold.

    Parameters
    ----------
    y_val : array-like
        True binary labels of the validation set.
    probas_pred : ndarray
        2-D array of class probabilities; column 1 is used as the score.

    Returns
    -------
    tuple
        ``(thresholds, precisions, recalls, f1_scores)``. ``precisions``,
        ``recalls`` and ``f1_scores`` are returned at full length; only their
        first ``len(thresholds)`` entries are plotted.
    """
    # plot_precision_recall_curve(model, X_val, y_val) # x=recall, y=precision
    precisions, recalls, thresholds = precision_recall_curve(y_val, probas_pred[:, 1])

    # F1 is 0/0 when precision and recall are both zero; report 0 there
    # instead of NaN so a later max/argmax over f1_scores stays meaningful.
    denom = precisions + recalls
    f1_scores = np.divide(2 * precisions * recalls, denom,
                          out=np.zeros_like(denom, dtype=float),
                          where=denom > 0)

    # settings
    plt.title('precision recall f1 curve')
    plt.gray()
    plt.xlabel('threshold')
    plt.ylabel('score')

    # x, y values: trim each score array to the number of thresholds
    n_thr = thresholds.shape[0]
    plt.plot(thresholds, precisions[:n_thr], label='precision', linestyle=':')
    plt.plot(thresholds, recalls[:n_thr],    label='recall',    linestyle='--')
    plt.plot(thresholds, f1_scores[:n_thr],  label='f1',        linestyle='solid')
    # valid linestyle = '-', '--', '-.', ':', 'None', ' ', '', 'solid', 'dashed', 'dashdot', 'dotted'

    plt.legend()
    plt.show()

    return thresholds, precisions, recalls, f1_scores

In [None]:
def ra_curve(y_val, probas_pred):
    """Draw the ROC curve for the column-1 scores and return its points.

    Parameters
    ----------
    y_val : array-like
        True binary labels of the validation set.
    probas_pred : ndarray
        2-D array of class probabilities; column 1 is used as the score.

    Returns
    -------
    tuple
        ``(FPRS, TPRS)`` as produced by ``roc_curve``.
    """
    positive_scores = probas_pred[:, 1]
    fpr_values, tpr_values, _ = roc_curve(y_val, positive_scores)

    # figure settings
    plt.title('ROC curve')
    plt.gray()
    plt.xlabel('FPR(1- specificity)')
    plt.ylabel('TPR')

    # the curve itself, then the diagonal as a reference line
    plt.plot(fpr_values, tpr_values, label='ROC', linestyle='solid')
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, label='50%', color='gray', linestyle=':')

    plt.legend()
    plt.show()

    return fpr_values, tpr_values

In [None]:
def max_eval(y_val, y_pred, probas_pred,
             thresholds, f1_scores, precisions, recalls, FPRS, TPRS):
    """Print the scores obtained at the threshold that maximizes F1.

    Parameters
    ----------
    y_val : array-like
        True binary labels. FPR/TPR are computed treating label ``1`` as the
        positive class (assumes 0/1 labels -- confirm for other encodings).
    y_pred : array-like
        Unused; kept so existing callers keep working.
    probas_pred : ndarray
        2-D array of class probabilities; column 1 is used as the score.
    thresholds, f1_scores, precisions, recalls : ndarray
        Outputs of ``pr_curve``. The score arrays may be longer than
        ``thresholds``; only entries that have a threshold are searched.
    FPRS, TPRS : ndarray
        Unused; kept so existing callers keep working. ROC points are not
        index-aligned with the precision-recall thresholds, so FPR and TPR
        are recomputed from the thresholded predictions instead.

    Raises
    ------
    ValueError
        If ``thresholds`` is empty or every candidate F1 value is NaN.
    """
    print('-' * 35, 'max_eval', '-' * 35)

    scores = np.asarray(probas_pred)[:, 1]
    y_true = np.ravel(np.asarray(y_val))
    thresholds = np.asarray(thresholds)

    n_thr = thresholds.shape[0]
    if n_thr == 0:
        raise ValueError('thresholds is empty')

    # nanargmax returns exactly one index, even when the maximum is tied or
    # some F1 entries are NaN (0/0).
    f1_at_thr = np.asarray(f1_scores, dtype=float)[:n_thr]
    best = int(np.nanargmax(f1_at_thr))

    m_threshold = float(thresholds[best])
    max_f1      = float(f1_at_thr[best])
    m_precision = float(precisions[best])
    m_recall    = float(recalls[best])

    # precision_recall_curve scores a sample as positive when
    # score >= threshold, so use the same rule here; a strict '>' would
    # evaluate a different set of predictions than precision/recall above.
    m_pred = (scores >= m_threshold).astype(int)

    m_accuracy = accuracy_score(y_true, m_pred)
    m_AUC      = roc_auc_score(y_true, scores)

    # FPR / TPR of the thresholded predictions
    is_pos = y_true == 1
    n_pos = int(is_pos.sum())
    n_neg = int(is_pos.size - n_pos)
    m_TPR = float(m_pred[is_pos].sum()) / n_pos if n_pos else float('nan')
    m_FPR = float(m_pred[~is_pos].sum()) / n_neg if n_neg else float('nan')

    print(f'threshold = {m_threshold:.4f}   '
          f'max f1   = {max_f1:.4f}   '
          f'accuracy = {m_accuracy:.4f}   '
          f'AUC      = {m_AUC:.4f}\n'
          f'precision = {m_precision:.4f}   '
          f'recall   = {m_recall:.4f}   '
          f'FPR      = {m_FPR:.4f}   '
          f'TPR      = {m_TPR:.4f}')

In [None]:
def my_eval(th, y_val, y_pred, probas_pred, model, X_val):
    """Print the scores and plot the confusion matrix for one threshold.

    Parameters
    ----------
    th : float
        Threshold that produced ``y_pred``; printed and used as plot title.
    y_val : array-like
        True binary labels.
    y_pred : array-like
        Predictions already binarized at ``th``; flattened to 1-D here.
    probas_pred : ndarray
        2-D array of class probabilities; column 1 is used for the AUC.
    model, X_val
        Unused; kept so existing callers keep working. The confusion matrix
        is built from ``y_val`` and ``y_pred`` directly, because plotting
        from the estimator would show its default predictions rather than
        the thresholded ones.
    """
    # Local import: the display class is not among the notebook's imports.
    from sklearn.metrics import ConfusionMatrixDisplay

    print('-' * 29, 'threshold adjusting ', '-' * 29)

    y_pred = np.ravel(y_pred)

    f1        = f1_score        (y_val, y_pred)
    accuracy  = accuracy_score  (y_val, y_pred)
    AUC       = roc_auc_score   (y_val, probas_pred[:, 1])
    precision = precision_score (y_val, y_pred)
    recall    = recall_score    (y_val, y_pred)

    print(f'threshold = {th:.4f}   '
          f'f1       = {f1:.4f}   '
          f'accuracy = {accuracy:.4f}   '
          f'AUC      = {AUC:.4f}\n'
          f'precision = {precision:.4f}   '
          f'recall   = {recall:.4f}   \n')

    # Confusion matrix of true labels vs. thresholded predictions,
    # normalized over all samples.
    conf_matx = confusion_matrix(y_val, y_pred, normalize='all')
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matx)
    disp.plot(cmap=plt.cm.Blues)
    disp.ax_.set_title(th)
    plt.show()

In [None]:
def split_fit_score(X, y, model=None,
                    test_size=0.2, th_list=(0.5,), random_state=0):
    """Split the data, fit the model, then plot and print its evaluation.

    Parameters
    ----------
    X, y
        Features and target, passed to ``train_test_split``.
    model : estimator, optional
        Classifier providing ``fit``, ``predict`` and ``predict_proba``.
        When omitted, a new ``RandomForestClassifier()`` is created for this
        call, so separate calls never share a fitted estimator.
    test_size : float
        Validation share passed to ``train_test_split``.
    th_list : iterable of float
        Thresholds at which the column-1 probabilities are binarized and
        scored with ``my_eval``.
    random_state : int
        Seed for the train/validation split.
    """
    if model is None:
        model = RandomForestClassifier()

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    probas_pred = model.predict_proba(X_val)

    # precision_recall_curve & ROC_curve & max_eval
    thresholds, precisions, recalls, f1_scores = pr_curve(y_val, probas_pred)
    FPRS, TPRS = ra_curve(y_val, probas_pred)
    max_eval(y_val, y_pred, probas_pred, thresholds, f1_scores, precisions, recalls, FPRS, TPRS)

    # binarize loop: the column vector of scores is the same for every
    # threshold, so build it once.
    pos_scores = probas_pred[:, 1].reshape(-1, 1)
    for th in th_list:
        bin_probas = Binarizer(threshold=th).fit_transform(pos_scores)
        my_eval(th, y_val, bin_probas, probas_pred, model, X_val)

In [None]:
# Run the full evaluation, reporting scores at thresholds 0.375 and 0.4.
# NOTE(review): no model is passed, so the function's default estimator is
# used rather than the `model` configured earlier -- confirm this is intended.
split_fit_score(X=X, y=y, th_list=[0.375, 0.4])