# 정확도 (Accuracy)

## Titanic 엉터리 분류기

In [1]:
import numpy as np
from sklearn.base import BaseEstimator

In [2]:
class MyDummyClassifier(BaseEstimator):
    # fit(), predict() method만 재정의(overriding)
    def fit(self, X, y):
        pass
    def predict(self, X):
        pred = np.zeros((X.shape[0], 1))
        # for i in range(X.shape[0]):
        for i in np.arange(X.shape[0]):
            # 여성인 경우
            if X['Sex'].iloc[i] == 0:
                pred[i, 0] = 1
        return pred

## 전처리를 하지 않고 실행하는 경우

In [3]:
import pandas as pd
titanic_df = pd.read_csv('../00.data/kaggle.com/titanic/train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=123
)

In [5]:
my_clf = MyDummyClassifier()
my_clf.fit(X_train, y_train)

In [6]:
my_pred = my_clf.predict(X_test)

In [7]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, my_pred)

0.6368715083798883

## 전처리를 수행한 후 실행하는 경우

In [8]:
from sklearn.preprocessing import LabelEncoder

# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# 머신러닝 알고리즘에 불필요한 속성 제거
def drop_features(df):
    df.drop(['PassengerId','Name','Ticket'], axis=1, inplace=True)
    return df

# 레이블 인코딩 수행. 
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        df[feature] = le.fit_transform(df[feature])
    return df

# 앞에서 설정한 Data Preprocessing 함수 호출
def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [9]:
import pandas as pd
titanic_df = pd.read_csv('../00.data/kaggle.com/titanic/train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)
X_titanic_df = transform_features(X_titanic_df)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=123
)

In [11]:
my_clf = MyDummyClassifier()
my_clf.fit(X_train, y_train)

In [12]:
my_pred = my_clf.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, my_pred)

0.7988826815642458

## 오차 행렬(Confusion Matrix)

In [14]:
class MyFakeClassifier(BaseEstimator):
    def fit(self, X, y):
        pass
    # 입력값으로 들어오는 X 데이터 셋의 크기만큼 모두 0으로 만들어서 반환
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)

In [15]:
from sklearn.datasets import load_digits

# 사이킷런의 내장 데이터 셋인 load_digits()를 이용하여 MNIST 데이터 로딩
digits = load_digits()

# digits 번호가 7번이면 True이고 이를 astype(int)로 1로 변환
# 7번이 아니면 False이고 0으로 변환
y = (digits.target == 7).astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, y, random_state=1
)

In [16]:
# 불균형한 레이블 데이터 분포도 확인
print('레이블 테스트 세트 크기', y_test.shape)
print('테스트 세트 레이블 0과 1의 분포도')
print(pd.Series(y_test).value_counts())

레이블 테스트 세트 크기 (450,)
테스트 세트 레이블 0과 1의 분포도
0    402
1     48
dtype: int64


In [17]:
# Dummy Classifier로 학습/예측/정확도 평가
fakeclf = MyFakeClassifier()
fakeclf.fit(X_train, y_train)
fakepred = fakeclf.predict(X_test)
score = accuracy_score(y_test, fakepred)
print('모든 예측을 0으로 하여도 정확도는: {:.3f}'.format(score))

모든 예측을 0으로 하여도 정확도는: 0.893


## 오차 행렬(Confusion Matrix)

MyFakeClassifier 사례

In [18]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, fakepred)

array([[402,   0],
       [ 48,   0]])

## 정밀도(Precision) 와 재현율(Recall)

In [22]:
import pandas as pd

titanic_df = pd.read_csv('../00.data/kaggle.com/titanic/train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)
X_titanic_df = transform_features(X_titanic_df)

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df, y_titanic_df, test_size=0.2, random_state=123
)

In [24]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix
from sklearn.linear_model import LogisticRegression

def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print('오차 행렬')
    print(confusion)
    print('정확도: {:.4f}'.format(accuracy))
    print('정밀도: {:.4f}'.format(precision))
    print('재현율: {:.4f}'.format(recall))

lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

get_clf_eval(y_test, pred)

오차 행렬
[[97 17]
 [16 49]]
정확도: 0.8156
정밀도: 0.7424
재현율: 0.7538


Precision/Recall Trade-off

In [25]:
pred_proba = lr_clf.predict_proba(X_test)
pred_proba[:10, :]

array([[0.23778817, 0.76221183],
       [0.85698221, 0.14301779],
       [0.43255423, 0.56744577],
       [0.72614888, 0.27385112],
       [0.88233389, 0.11766611],
       [0.8571772 , 0.1428228 ],
       [0.27238463, 0.72761537],
       [0.2470994 , 0.7529006 ],
       [0.23593651, 0.76406349],
       [0.39937463, 0.60062537]])

In [26]:
from sklearn.preprocessing import Binarizer

X = [[ 1, -1, 2],
     [ 2, 0, 0],
     [ 0, 1.1, 1.2]]

# threshold 기준값보다 같거나 작으면 0을, 크면 1을 반환
binarizer = Binarizer(threshold=1.1)
print(binarizer.fit_transform(X))

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [27]:
binarizer = Binarizer(threshold=1)
print(binarizer.fit_transform(X))

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 1.]]


In [28]:
# Binarizer의 threshold 설정값. 분류 결정 임갯값음.
custom_threshold = 0.5

# predict_proba() 반환값의
# 즉
pred_proba_1 = pred_proba[:,1].reshape(-1,1)

binarizer = Binarizer(threshold=custom_threshold)
custom_predict = binarizer.fit_transform(pred_proba_1)

get_clf_eval(y_test, custom_predict)

오차 행렬
[[97 17]
 [16 49]]
정확도: 0.8156
정밀도: 0.7424
재현율: 0.7538


## F1 스코어

In [30]:
from sklearn.metrics import f1_score

f1 = f1_score(y_test, pred)
print( 'F1 스코어: {:.4f}'.format(f1) )

F1 스코어: 0.7481


## ROC AUC 스코어

In [31]:
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(y_test, pred)
print( 'ROC AUC 스코어: {:.4f}'.format(roc_auc) )

ROC AUC 스코어: 0.8024
