# Multiple Linear Regression
### 원-핫 인코딩

In [1]:
import pandas as pd

In [2]:
# Load the CSV into a DataFrame, then split it by column position:
# every column except the last becomes the feature matrix X, and the last
# column becomes the target vector y. `.values` converts each to a NumPy array.
dataset = pd.read_csv('MultipleLinearRegressionData.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [4]:
# One-hot encode the column at index 2 and pass all other columns through
# unchanged. drop='first' omits the indicator column of the first category,
# so rows of that category are encoded as all zeros; this avoids a redundant
# indicator column that would be perfectly collinear with the others.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'), [2])],
                       remainder='passthrough')

In [5]:
# Fit the transformer on X and overwrite X with the transformed array.
# The encoded indicator columns come first, followed by the passthrough columns.
X = ct.fit_transform(X)

In [6]:
X
# Legend for the first two columns (the one-hot indicators), written as
# "first column  second column":
# 1 0: Home
# 0 1: Library
# 0 0: Cafe (the category dropped by drop='first', so it has no column of its own)

array([[1.0, 0.0, 0.5, 3],
       [0.0, 1.0, 1.2, 4],
       [0.0, 0.0, 1.8, 2],
       [0.0, 0.0, 2.4, 0],
       [1.0, 0.0, 2.6, 2],
       [1.0, 0.0, 3.2, 0],
       [0.0, 1.0, 3.9, 0],
       [0.0, 1.0, 4.4, 0],
       [1.0, 0.0, 4.5, 5],
       [0.0, 0.0, 5.0, 1],
       [0.0, 0.0, 5.3, 2],
       [0.0, 0.0, 5.8, 0],
       [0.0, 1.0, 6.0, 3],
       [0.0, 0.0, 6.1, 1],
       [0.0, 1.0, 6.2, 1],
       [1.0, 0.0, 6.9, 4],
       [0.0, 0.0, 7.2, 2],
       [1.0, 0.0, 8.4, 1],
       [0.0, 1.0, 8.6, 1],
       [0.0, 1.0, 10.0, 0]], dtype=object)

### 데이터셋 분리

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the rows as the test set and train on the remaining 80%.
# random_state=0 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### 학습

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
# Create a linear regression model with default settings.
reg = LinearRegression()

In [11]:
# Fit the model on the training split only; the test split stays unseen.
reg.fit(X_train, y_train)

LinearRegression()

### 테스트

In [12]:
# Predict the target for the held-out test rows and display the predictions.
y_pred = reg.predict(X_test)
y_pred

array([ 92.15457859,  10.23753043, 108.36245302,  38.14675204])

In [13]:
# Display the actual test targets for comparison with the predictions.
y_test

array([ 90,   8, 100,  38], dtype=int64)

In [14]:
# Learned coefficients, one per column of the encoded X:
# the two one-hot indicator columns first, then the two passthrough columns.
reg.coef_

array([-5.82712824, -1.04450647, 10.40419528, -1.64200104])

In [15]:
# Learned intercept (the constant term of the fitted linear model).
reg.intercept_

5.365006706544804

### 모델 평가

In [16]:
# Model score on the training dataset.
reg.score(X_train, y_train)

0.9623352565265527

In [17]:
# Model score on the test dataset.
reg.score(X_test, y_test)

0.9859956178877447

### 다양한 평가지표
1. MAE(Mean Absolute Error): 실제값과 예측값 간 차이의 절대값의 평균
2. MSE(Mean Squared Error): 실제값과 예측값 간 차이의 제곱의 평균
3. RMSE(Root Mean Squared Error): MSE의 제곱근 (차이의 제곱의 평균에 제곱근을 취한 값)
4. R2: 결정 계수  

R2 score는 1에 가까울수록, 나머지 지표는 0에 가까울수록 좋습니다.

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [19]:
# MAE (mean absolute error).
# Arguments are passed in the order (actual values, predicted values).
mean_absolute_error(y_test, y_pred)

3.2253285188287792

In [20]:
# MSE (mean squared error) between the actual and predicted test targets.
mean_squared_error(y_test, y_pred)

19.900226981514813

In [21]:
# RMSE (root mean squared error): the square root of the MSE.
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly here.
# This works on every scikit-learn version and, for a single-target y like
# this one, gives the same value as the old `squared=False` call.
mean_squared_error(y_test, y_pred) ** 0.5

4.460967045553555

In [22]:
# R2 score (coefficient of determination) on the test set.
# Arguments are passed in the order (actual values, predicted values).
r2_score(y_test, y_pred)

0.9859956178877447