## 평가(Evaluation)

### 정확도(Accuracy)

In [3]:
import numpy as np 
import pandas as pd

In [4]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Missing-value handling
def proc_nan(df):
    """Fill missing values in the ``Age`` and ``Embarked`` columns.

    ``Age`` gaps are replaced with the column mean and ``Embarked`` gaps with
    the placeholder category ``'N'``.

    Args:
        df: DataFrame containing ``Age`` and ``Embarked`` columns. It is
            modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    # Assign the filled column back rather than calling
    # df['Age'].fillna(..., inplace=True): that chained form operates on a
    # temporary object under pandas copy-on-write and leaves df untouched.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna('N')
    return df

# Removal of columns that are not used as features
def drop_features(df):
    """Drop the ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` columns.

    The DataFrame is modified in place and the same object is returned.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    df.drop(columns=unused_columns, inplace=True)
    return df

# Conversion of string categories to integer codes
def transform_feature(df):
    """Label-encode the ``Sex`` and ``Embarked`` columns.

    Each column is overwritten with the integer codes produced by
    ``LabelEncoder.fit_transform``. The DataFrame is modified in place and
    the same object is returned.
    """
    encoder = LabelEncoder()
    for column in ('Sex', 'Embarked'):
        # fit_transform refits the encoder on every call, so each column
        # gets its own independent mapping.
        codes = encoder.fit_transform(df[column])
        df[column] = codes
    return df

# Entry point that applies the preprocessing steps defined above, in order
def pre_process(df):
    """Run the full preprocessing pipeline on ``df`` and return the result.

    Steps, in order: fill missing values (``proc_nan``), drop unused columns
    (``drop_features``), then encode string columns (``transform_feature``).
    """
    return transform_feature(drop_features(proc_nan(df)))

In [7]:
# Load the Titanic training data, split off the 'Survived' label column,
# and preprocess the remaining columns into the feature matrix X.
titanic_df = pd.read_csv('../00.data/titanic/train.csv')
y = titanic_df['Survived']
X = pre_process(titanic_df.drop(columns=['Survived']))

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2021, test_size=0.2)

In [9]:
# Peek at the first five rows of the held-out features.
X_test.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
210,3,1,24.0,0,0,7.05,3
876,3,1,20.0,0,0,9.8458,3
666,2,1,25.0,0,0,13.0,3
819,3,1,10.0,3,2,27.9,3
736,3,0,48.0,1,3,34.375,3


### 타이타닉 엉터리 분류기

- 여성이면 무조건 생존했다고 예측하는 분류기

In [11]:
from sklearn.base import BaseEstimator

class MyTitanicClassifier(BaseEstimator):
    """Rule-based baseline that predicts survival for every female passenger.

    Only ``fit`` and ``predict`` are overridden; nothing is learned from the
    training data.
    """

    def fit(self, X, y):
        """Learn nothing; return ``self`` as scikit-learn estimators do."""
        return self

    def predict(self, X):
        """Predict 1 (survived) where ``Sex == 0`` and 0 elsewhere.

        Args:
            X: DataFrame with a ``Sex`` column in which female is encoded
                as 0.

        Returns:
            Float array of shape ``(n_rows, 1)`` holding 0.0 / 1.0.
        """
        # Compare the whole column at once instead of looping over rows;
        # X.shape is an attribute (rows, cols), so no per-row indexing is needed.
        is_female = np.asarray(X['Sex']) == 0
        return is_female.astype(float).reshape(-1, 1)

In [12]:
# Run the rule-based baseline on the held-out rows. fit() learns nothing;
# the predictions depend only on the 'Sex' column of X_test.
my_clf = MyTitanicClassifier()
my_clf.fit(X_train, y_train)
my_pred = my_clf.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score
# Fraction of held-out passengers whose label matches the baseline prediction.
accuracy_score(y_true=y_test, y_pred=my_pred)

0.7206703910614525

### 손글씨 숫자(sklearn `load_digits`, 8×8) "Is it seven?"

In [15]:
class MyFakeClassifier(BaseEstimator):
    """Dummy classifier that always predicts the negative class (0)."""

    def fit(self, X, y):
        """Learn nothing; return ``self`` as scikit-learn estimators do."""
        return self

    def predict(self, X):
        """Return an integer array of zeros with shape ``(len(X), 1)``."""
        return np.zeros((len(X), 1), dtype=int)

In [16]:
from sklearn.datasets import load_digits

digits = load_digits()
digits.data[0]

array([ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
       15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
       12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
        0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
       10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.])

In [17]:
digits.target[0]

0

In [18]:
# Binary target: 1 where the digit label is 7, 0 otherwise (1-D integer array).
y = (digits.target == 7).astype(int)
# The string literal below is a disabled loop-based variant kept for reference.
# Because it is the last expression in the cell, the notebook echoes it as the
# cell output. Note that it would build an (n, 1) float array, not the 1-D
# integer array produced above.
'''
y = np.zeros((len(digits.target),1))
for i in range(len(digits.target)):
    if digits.target[i] == 7:
        y[i] = 1
'''

'\ny = np.zeros((len(digits.target),1))\nfor i in range(len(digits.target)):\n    if digits.target[i] == 7:\n        y[i] = 1\n'

In [20]:
# Hold out 20% of the digit images as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=2021, test_size=0.2)

In [21]:
# Run the always-zero classifier: every test image is predicted as "not seven".
my_clf = MyFakeClassifier()
my_clf.fit(X_train, y_train)
my_pred = my_clf.predict(X_test)

In [22]:
# Accuracy of the always-zero classifier. It comes out high only because most
# digits are not 7, which is why accuracy alone misleads on imbalanced labels.
accuracy_score(y_test, my_pred)

0.9055555555555556