## 혼동 행렬(Confusion Matrix)

In [1]:
import numpy as np 
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split

In [2]:
class MyFakeClassifier(BaseEstimator):
    """Baseline classifier that learns nothing and always predicts class 0.

    Useful for showing how a trivial model scores on an imbalanced target.
    """

    def fit(self, X, y):
        """No-op training step.

        Parameters
        ----------
        X, y : ignored
            Accepted only so the estimator can be called like any other.

        Returns
        -------
        self
            Returned (instead of None) so that calls can be chained, e.g.
            ``MyFakeClassifier().fit(X, y).predict(X)``.
        """
        return self

    def predict(self, X):
        """Return an all-zero integer array of shape ``(len(X), 1)``."""
        return np.zeros((len(X), 1), dtype=int)

In [3]:
from sklearn.datasets import load_digits

# Load the digits dataset and build a binary target: 1 where the label is 7, otherwise 0.
digits = load_digits()
y = (digits.target == 7).astype(int)
# Hold out 20% of the samples for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, y, test_size=0.2, random_state=2021
)

In [4]:
# Fit the dummy classifier and predict on the test split; its predict() returns all zeros.
my_clf = MyFakeClassifier()
my_clf.fit(X_train, y_train)
my_pred = my_clf.predict(X_test)

### Confusion Matrix

In [5]:
from sklearn.metrics import confusion_matrix

# Every prediction is 0, so the second column (predicted 1) of the matrix is empty.
confusion_matrix(y_test, my_pred)

array([[326,   0],
       [ 34,   0]], dtype=int64)

### 정밀도(Precision), 재현율(Recall), F1 Score

In [7]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder

In [8]:
# Fill missing values
def proc_nan(df):
    """Fill missing values in the 'Age' and 'Embarked' columns of ``df``.

    Missing 'Age' entries are replaced with the column mean and missing
    'Embarked' entries with the placeholder 'N'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Age' and 'Embarked' columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient chaining.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[...]: the chained in-place form acts on an intermediate object, which
    # pandas warns about and which does not update df under copy-on-write.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna('N')
    return df

# Drop columns that are not used as features
def drop_features(df):
    """Remove the 'PassengerId', 'Name', 'Ticket' and 'Cabin' columns from ``df``.

    The frame is modified in place and the same object is returned.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    df.drop(columns=unused_columns, inplace=True)
    return df

# Convert string-valued columns to integer codes
def transform_feature(df):
    """Label-encode the 'Sex' and 'Embarked' columns of ``df`` and return it.

    Each column is overwritten with the output of ``LabelEncoder.fit_transform``.
    """
    categorical_columns = ('Sex', 'Embarked')
    for column in categorical_columns:
        # fit_transform refits on every call, so a fresh encoder per column
        # yields the same codes as reusing a single instance.
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

# Run the preprocessing helpers defined above, in order
def pre_process(df):
    """Apply proc_nan, then drop_features, then transform_feature to ``df``.

    Returns the frame produced by the final step.
    """
    return transform_feature(drop_features(proc_nan(df)))

In [10]:
# Read the Titanic training CSV, separate the 'Survived' target, and preprocess the features.
# NOTE: this rebinds y, which previously held the digits target.
titanic_df = pd.read_csv('../00.data/titanic/train.csv')
y = titanic_df['Survived']
X = titanic_df.drop(['Survived'], axis=1)
X = pre_process(X)

In [11]:
# 80/20 train/test split of the Titanic data with a fixed seed.
# These names previously held the digits split and are overwritten here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2021
)

In [20]:
from sklearn.linear_model import LogisticRegression         # called "logistic" regression, but it essentially outputs class probabilities

# Default settings on unscaled features: this fit stops at the iteration limit and
# emits the warning shown below, which suggests raising max_iter or scaling the data.
lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


- 피처 스케일링을 하지 않으면 위와 같은 수렴 경고(에러가 아닌 경고)가 발생함. 경고 메시지의 안내대로 `max_iter`를 늘리거나 데이터를 스케일링하면 해결할 수 있음.

In [13]:
# Predicted probabilities for the test split; show the first five rows (one column per class).
pred_proba = lr_clf.predict_proba(X_test)
pred_proba[:5, :]

array([[0.90089846, 0.09910154],
       [0.88406205, 0.11593795],
       [0.75385906, 0.24614094],
       [0.92304482, 0.07695518],
       [0.64290152, 0.35709848]])

In [14]:
# Predicted labels for the first five test rows.
pred[:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [15]:
# Confusion matrix of the logistic-regression predictions on the test split
confusion_matrix(y_test, pred)

array([[92, 16],
       [28, 43]], dtype=int64)

In [16]:
# Precision = TP / (FP + TP) = 43 / (16 + 43)
# If Type I errors (false positives) dropped from 16 to 10 with everything else fixed,
# those 6 samples would become true negatives (TP stays 43), so precision would be
# 43 / (10 + 43) ≈ 0.81
from sklearn.metrics import precision_score
precision_score(y_test, pred)

0.7288135593220338

In [17]:
# Recall = TP / (FN + TP) = 43 / (28 + 43)
# If Type II errors (false negatives) dropped from 28 to 15, the 13 recovered samples
# would become true positives, so recall would be 56 / (15 + 56) ≈ 0.79
from sklearn.metrics import recall_score
recall_score(y_test, pred)

0.6056338028169014

In [18]:
# F1 score: harmonic mean of the precision and recall computed above
from sklearn.metrics import f1_score
f1_score(y_test, pred)

0.6615384615384615

In [19]:
# Accuracy = (TN + TP) / total = (92 + 43) / 179
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred)

0.7541899441340782