# 머신러닝 프로젝트

# 1.성능평가 기법

## 1.1 회귀분석

### MAE(Mean Absolute Error)
- 특징  
    - 에러의 크기 그대로 반영
    - 이상치에 영향을 받음

In [None]:
# MAE: mean of the absolute differences between y_test and y_pred.
# The function is spelled `mean_absolute_error`; the previous
# `maen_absolute_error` spelling does not exist and raised ImportError.
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)

### MSE(Mean Squared Error)
- 특징
    - 실젯값과 예측값 차이를 제곱한 값(면적)의 평균
    - 특이값이 존재하면 수치 증가

In [None]:
# MSE: mean of the squared differences between y_test and y_pred.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

### RMSE(Root Mean Squared Error)

- 특징 
    - 에러를 제곱하여 계산할때 에러가 클수록 더 크게 반영되는 문제 발생
    - 이를 방지하기 위해 Root를 사용하여 오류값이 더 커지지 않도록 장치

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

# RMSE is obtained by taking the square root of the MSE.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

### MSLE(Mean Squared Log Error)
- 특징
    - RMSE와 같이 손실이 기하급수적으로 증가하는 상황에서 실제 오류평균보다 값이 더 커지지 않도록 상쇄하기 위해 사용

In [None]:
# MSLE: mean squared error computed on log-transformed values.
from sklearn.metrics import mean_squared_log_error

msle = mean_squared_log_error(y_true=y_test, y_pred=y_pred)

### MAPE(Mean Absolute Percentage Error)
- 특징
    - 오차가 실젯값에서 차지하는 비율(%)을 나타냄

In [None]:
import numpy as np
def MAPE(y_test, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|(y_test - y_pred) / y_test|) * 100`` element-wise.

    Args:
        y_test: Actual target values. Any array-like is accepted
            (list, tuple, NumPy array, pandas Series).
        y_pred: Predicted values, paired with ``y_test`` by position.

    Returns:
        The MAPE as a percentage (NumPy float). If ``y_test`` contains
        zeros, the division yields ``inf``/``nan`` and so does the result.
    """
    # Coerce to float arrays so plain Python sequences work and so values
    # are paired by position rather than by pandas index label.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape

# Evaluate the MAPE helper on y_test and y_pred.
mape = MAPE(y_test=y_test, y_pred=y_pred)

## 1.2 분류분석

### 정확도(Accuracy)

Accuracy = (TP+TN) / (TP+TN+FP+FN)

In [None]:
# Accuracy: fraction of predictions in y_pred that match y_test.
from sklearn.metrics import accuracy_score

acc = accuracy_score(y_true=y_test, y_pred=y_pred)

### 혼동행렬(Confusion Matrix)

In [None]:
# Confusion matrix of actual labels (y_test) against predictions (y_pred).
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

### 정밀도(Precision)와 재현율(Recall)

#### 정밀도(Precision)
설명: Positive로 예측한 것들 중 실제로도 Positive인 비율

Precision = TP / (TP+FP)

In [None]:
# Precision: TP / (TP + FP).
from sklearn.metrics import precision_score

precision = precision_score(y_true=y_test, y_pred=y_pred)

#### 재현율(Recall)
설명: 실제로 Positive인 것들 중 Positive로 예측한 비율

Recall = TP / (FN+TP)

In [None]:
# Recall: TP / (FN + TP).
from sklearn.metrics import recall_score

recall = recall_score(y_true=y_test, y_pred=y_pred)

#### F1 스코어

설명: 정밀도(Precision)와 재현율(Recall)의 조화평균

F1 = 2*(Precision * Recall) / (Precision + Recall)

In [None]:
# F1: 2 * (precision * recall) / (precision + recall).
from sklearn.metrics import f1_score

f1 = f1_score(y_true=y_test, y_pred=y_pred)

#### ROC-AUC

##### ROC곡선

설명: FPR(False Positive Rate)이 변할 때 TPR(True Positive Rate)이 변하는 것을 나타내는 곡선 (ROC)

TNR = TN / (FP + TN)

FPR = FP / (FP + TN) = 1 - TNR

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

# NOTE(review): roc_curve expects continuous scores (e.g. the positive-class
# probability) as its second argument; confirm y_pred is not hard 0/1 labels.
fpr, tpr, thres = roc_curve(y_test, y_pred, pos_label=1)

# Plot the curve: FPR on the x-axis, TPR on the y-axis.
plt.plot(fpr, tpr)

##### AUC(Area Under the Curve)
설명: ROC곡선 아래의 면적이며, 1에 가까울수록 성능이 높은 것을 나타냄

In [None]:
# Import the AUC function under an alias: the result below is stored in a
# variable named `auc`, which would otherwise overwrite the function itself
# and make it unusable for the rest of the session.
from sklearn.metrics import auc as compute_auc
from sklearn.metrics import roc_curve

# NOTE(review): roc_curve expects continuous scores (e.g. the positive-class
# probability) as its second argument; confirm y_pred is not hard 0/1 labels.
fpr, tpr, thres = roc_curve(y_test, y_pred, pos_label=1)

# AUC value: area under the (fpr, tpr) curve.
auc = compute_auc(fpr, tpr)

# 2.분석과정 빠르게 맛보기

## 2.1 회귀분석

In [4]:
# Boston housing prices.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older scikit-learn versions.
import pandas as pd
from sklearn.datasets import load_boston

boston = load_boston()
boston_dt = boston.data
price = boston.target

# Feature matrix as a DataFrame, with the target appended as the PRICE column.
df = pd.DataFrame(boston_dt, columns=boston.feature_names).assign(PRICE=price)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [6]:
# Dimensions of the DataFrame as (rows, columns).
df.shape

(506, 14)

In [7]:
# Number of missing values in each column.
df.isna().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
PRICE      0
dtype: int64

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

