### 머신러닝 단계

1. 데이터 가공 / 변환
2. 모델 학습/예측
3. 평가

### 성능 평가

1. 지도학습
    - 1) 회귀 - 정확도, MSE, RMSE, R square => (실제값 - 예측값) 를 이용한 평가
    - 2) 분류 - 정확도, 오차행렬(Confusion Matrix), 정밀도(Precision), 재현율(Recall), F1 스코어, ROC AUC

### 정확도

accuracy = 예측 결과가 실제 레이블과 동일한 데이터 건수 / 전체 예측 데이터 건수

In [1]:
import numpy as np

from sklearn.base import BaseEstimator

class MyDummyClassifier(BaseEstimator):
    """Rule-based baseline that predicts from the ``Sex`` column alone.

    Nothing is learned from the training data: a row whose ``Sex`` value
    equals 1 is predicted as 0, and every other row is predicted as 1.
    """

    def fit(self, X, y=None):
        """Accept training data without learning anything.

        Args:
            X: Training features (ignored).
            y: Training labels (ignored).

        Returns:
            The estimator itself, following the scikit-learn convention so
            that ``clf.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X):
        """Predict 0 where ``Sex == 1`` and 1 everywhere else.

        Args:
            X: Table-like object that supports ``X['Sex']`` and yields one
                value per row (e.g. a pandas DataFrame).

        Returns:
            A float NumPy array of shape ``(n_rows, 1)`` holding 0.0 or 1.0.
        """
        # Vectorised equivalent of the per-row check: any value that is not
        # exactly 1 (including missing values) maps to the prediction 1.
        not_one = np.asarray(X['Sex'] != 1)
        return not_one.astype(float).reshape(-1, 1)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the training CSV, then separate the feature columns from the
# 'Survived' target column.
df1 = pd.read_csv("./dataset/train.csv")
X = df1.drop(columns='Survived')
y = df1['Survived']

In [4]:
# Print a summary of the feature frame: columns, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [5]:
# Replace missing ages with the mean age.
# The filled column is assigned back instead of calling
# ``fillna(..., inplace=True)`` on the selected column: that chained in-place
# form is deprecated in pandas 2.x and leaves the frame unchanged when
# Copy-on-Write is enabled.
X['Age'] = X['Age'].fillna(X['Age'].mean())

# Replace missing Cabin values with the placeholder 'N'.
X['Cabin'] = X['Cabin'].fillna('N')

# Replace missing Embarked values with the placeholder 'N'.
X['Embarked'] = X['Embarked'].fillna('N')

In [6]:
# Remove the columns that are not used as features, modifying X in place.
X.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)

In [7]:
# Display the current state of the feature frame.
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.000000,1,0,7.2500,N,S
1,1,female,38.000000,1,0,71.2833,C85,C
2,3,female,26.000000,0,0,7.9250,N,S
3,1,female,35.000000,1,0,53.1000,C123,S
4,3,male,35.000000,0,0,8.0500,N,S
...,...,...,...,...,...,...,...,...
886,2,male,27.000000,0,0,13.0000,N,S
887,1,female,19.000000,0,0,30.0000,B42,S
888,3,female,29.699118,1,2,23.4500,N,S
889,1,male,26.000000,0,0,30.0000,C148,C


In [8]:
from sklearn.preprocessing import LabelEncoder

def format_features(df):
    """Shorten ``Cabin`` and label-encode the categorical columns.

    ``Cabin`` is reduced to its first character, then ``Cabin``, ``Sex`` and
    ``Embarked`` are each replaced by integer codes from a ``LabelEncoder``
    fitted on that column alone.

    Args:
        df: DataFrame containing the ``Cabin``, ``Sex`` and ``Embarked``
            columns. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Keep only the leading character of each cabin value.
    df['Cabin'] = df['Cabin'].str[:1]

    # A fresh encoder per column, so the codes of one column never depend on
    # the values of another.
    for column in ('Cabin', 'Sex', 'Embarked'):
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

In [9]:
# Apply the Cabin truncation and label encoding to the feature frame.
X = format_features(X)

In [10]:
# Display the feature frame to inspect the encoded columns.
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,22.000000,1,0,7.2500,7,3
1,1,0,38.000000,1,0,71.2833,2,0
2,3,0,26.000000,0,0,7.9250,7,3
3,1,0,35.000000,1,0,53.1000,2,3
4,3,1,35.000000,0,0,8.0500,7,3
...,...,...,...,...,...,...,...,...
886,2,1,27.000000,0,0,13.0000,7,3
887,1,0,19.000000,0,0,30.0000,1,3
888,3,0,29.699118,1,2,23.4500,7,3
889,1,1,26.000000,0,0,30.0000,2,0


In [11]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [12]:
# Instantiate the rule-based baseline and call fit on the training split
# (this classifier's fit performs no learning).
myclf = MyDummyClassifier()
myclf.fit(X_train,y_train)

In [13]:
# Predict on the held-out rows with the rule-based baseline and report the
# accuracy to four decimal places.
mypred = myclf.predict(X_test)
print(f"정확도 {accuracy_score(y_test, mypred):.4f}")

정확도 0.7877


- 데이터의 구성(레이블 분포 등)에 따라서는 아무것도 학습하지 않는 단순한 규칙만으로도 정확도가 높게 나올 수 있음
- 정확도만 평가 지표로 볼 수 없음

### Confusion Matrix

**손글씨 숫자 데이터 셋 (scikit-learn `load_digits`, MNIST와 유사한 8x8 이미지)**

In [14]:
from sklearn.datasets import load_digits
import numpy as np

In [15]:
class MyFakeClassifier(BaseEstimator):
    """Baseline that predicts ``False`` for every sample, ignoring the input."""

    def fit(self, X, y=None):
        """Accept training data without learning anything.

        Args:
            X: Training features (ignored).
            y: Training labels (ignored).

        Returns:
            The estimator itself, following the scikit-learn convention so
            that ``clf.fit(X, y).predict(X)`` can be chained.
        """
        return self

    def predict(self, X):
        """Return an all-``False`` prediction for each row of ``X``.

        Args:
            X: Any sized collection of samples; only ``len(X)`` is used.

        Returns:
            A boolean NumPy array of shape ``(len(X), 1)`` filled with False.
        """
        return np.zeros((len(X), 1), dtype=bool)

In [16]:
# Load the digits dataset bundled with scikit-learn.
digits = load_digits()

# Display the feature matrix.
digits.data

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [17]:
# Binary target: 1 where the digit label is 7, 0 otherwise.

y = (digits.target == 7).astype(int) 

In [18]:
# Split the digits features and the binary target; no test_size is given, so
# the library default is used, and random_state fixes the shuffle.
X_train,X_test,y_train,y_test = train_test_split(digits.data, y, random_state=11)

In [20]:
# Check the sizes of the test and training label arrays.
y_test.shape, y_train.shape

((450,), (1347,))

In [22]:
# Label distribution of the imbalanced data: count of each class in the
# training labels, then in the test labels.

print(pd.Series(y_train).value_counts())
print(pd.Series(y_test).value_counts())

0    1213
1     134
dtype: int64
0    405
1     45
dtype: int64


In [23]:
# Instantiate the always-negative baseline and call fit on the training split
# (this classifier's fit performs no learning).
fake = MyFakeClassifier()
fake.fit(X_train,y_train)

In [24]:
# Predict on the test split; every prediction from this classifier is False.
pred_fake = fake.predict(X_test)

In [25]:
# Accuracy of the constant predictions against the true test labels.
accuracy_score(y_test,pred_fake)

0.9

In [26]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the true test labels versus the constant predictions.
confusion_matrix(y_test,pred_fake)

array([[405,   0],
       [ 45,   0]], dtype=int64)

### 정밀도

In [28]:
import warnings
# NOTE(review): with no category or message given, this filter ignores every
# warning raised from here on, not just one. Presumably it was added to hide
# the undefined-precision warning from the call below — confirm, and consider
# narrowing it.
warnings.filterwarnings('ignore')

from sklearn.metrics import precision_score, recall_score

# Precision of the constant-negative predictions: the classifier never
# predicts the positive class, so there are no true positives.
precision_score(y_test,pred_fake)  # TP / (FP + TP)

0.0

### 재현율

In [29]:
# Recall of the constant-negative predictions against the true test labels.
recall_score(y_test,pred_fake) # TP / (FN + TP)

0.0