# 선형회귀모델 실습 2: 예측 및 평가

## 1. 모듈 불러오기

In [None]:
from IPython.display import display, HTML
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as stats

import statsmodels.api as sm
import statsmodels.formula.api as smf

import pylab

from sklearn.model_selection import train_test_split

from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Select a Hangul-capable font so the Korean titles and axis labels used in
# the plots below render as text.
# NOTE(review): 'Malgun Gothic' is a Windows system font; on other platforms
# a different installed Korean font name is presumably needed - confirm.
plt.rc('font', family='Malgun Gothic')
# Draw negative tick labels with the ASCII hyphen rather than the Unicode
# minus sign (U+2212), which Korean fonts commonly lack; otherwise the
# negative ticks of the -2..8 axes below can show up as empty boxes.
plt.rc('axes', unicode_minus=False)

## 2. 데이터 불러오기: California Housing Data

In [None]:
# Load the California Housing dataset bundle and print its bundled
# description text (DESCR).
california = fetch_california_housing()
print(california.DESCR)

In [None]:
# Wrap the raw feature matrix in a DataFrame labelled with the dataset's
# feature names, and the target in a single-column DataFrame named
# 'House Value'.
X = pd.DataFrame(california.data, columns=california.feature_names)
y = pd.DataFrame(california.target, columns=['House Value'])

## 3. 데이터 전처리

In [None]:
# Preview the leading rows of the feature table.
X.head()

In [None]:
# Preview the leading rows of the target column.
y.head()

In [None]:
# Join features and target side by side (column-wise) so that one split
# below keeps each row's features and target together.
data = pd.concat([X, y], axis=1)

In [None]:
# Preview the leading rows of the combined features + target table.
data.head()

### 데이터를 학습(train) 및 테스트(test) 데이터로 분리
<br>

![alt text](Figures/Train_Test.png)
<br>

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
train_data, test_data = train_test_split(data, test_size=0.3, random_state=55)

In [None]:
# Preview the leading rows of the training split.
train_data.head()

In [None]:
# Preview the leading rows of the test split.
test_data.head()

## 4. 모델링

In [None]:
# Set up an ordinary least squares model: the response is 'House Value' and
# the regressors are every remaining column of the training split.
# NOTE(review): sm.OLS does not add an intercept on its own and no constant
# column is added here, so this fits a regression through the origin and the
# summary's R-squared is the uncentered one - confirm this is intended.
# Adding sm.add_constant(...) here would require the same change to the
# frames passed to predict() further down.
lm = sm.OLS(train_data['House Value'], train_data.drop(['House Value'], axis=1))

In [None]:
# Estimate the coefficients; the fitted results object is reused below for
# the summary table and for predictions.
lm_trained = lm.fit()

## 5. 모델 해석

In [None]:
# Render the fitted model's summary report in the notebook output.
display(lm_trained.summary())

## 6. 예측 값 도출 및 예측 성능 평가

### 학습 데이터 (Training Data)에 대한 예측 성능 평가

In [None]:
# In-sample predictions: feed the training regressors (target column
# dropped, matching the columns used to build the model) back through the
# fitted model.
train_pred = lm_trained.predict(train_data.drop(['House Value'], axis=1))

In [None]:
# Actual vs. predicted house value on the training split. A square canvas
# with identical x/y limits puts perfect predictions on the diagonal.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('실제값 vs. 예측값')
ax.scatter(train_data['House Value'], train_pred)
ax.set_xlabel('실제값', size=16)
ax.set_ylabel('예측값', size=16)
ax.set_xlim(-2, 8)
ax.set_ylim(-2, 8)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Computes ``mean(|(y_true - y_pred) / y_true|) * 100``.

    Both inputs are flattened to 1-D first, so a column vector of shape
    ``(n, 1)`` (for example a single-column DataFrame) paired with a flat
    series of shape ``(n,)`` is compared element by element instead of being
    silently broadcast into an ``(n, n)`` matrix, which would give a wrong
    score.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values holding the same number of
            elements as ``y_true`` (a single value is broadcast).

    Returns:
        The MAPE as a percentage. Zero entries in ``y_true`` divide by zero,
        so the result is ``inf`` or ``nan`` in that case.

    Raises:
        ValueError: If the flattened inputs have incompatible lengths.
    """
    # ravel() is what prevents the (n, 1) vs (n,) broadcasting trap.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [None]:
# Error metrics on the training split. RMSE is the square root of the MSE,
# so the MSE is computed once and reused.
y_train = train_data['House Value']
train_mse = mean_squared_error(y_train, train_pred)
train_rmse = np.sqrt(train_mse)
train_mae = mean_absolute_error(y_train, train_pred)
train_mape = mean_absolute_percentage_error(y_train, train_pred)

print('Training MSE: {:.3f}'.format(train_mse))
print('Training RMSE: {:.3f}'.format(train_rmse))
print('Training MAE: {:.3f}'.format(train_mae))
print('Training MAPE: {:.3f}'.format(train_mape))

### 테스트 데이터 (Testing Data)에 대한 예측 성능 평가

In [None]:
# Show the held-out test split.
test_data

In [None]:
# Out-of-sample predictions: apply the model fitted on the training split to
# the test regressors (target column dropped).
test_pred = lm_trained.predict(test_data.drop(['House Value'], axis=1))

In [None]:
# Show the predicted values for the test split.
display(test_pred)

In [None]:
# Actual vs. predicted house value on the held-out test split, drawn with
# the same square canvas and axis limits as the training plot so the two
# figures can be compared directly.
fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('실제값 vs. 예측값')
ax.scatter(test_data['House Value'], test_pred)
ax.set_xlabel('실제값', size=16)
ax.set_ylabel('예측값', size=16)
ax.set_xlim(-2, 8)
ax.set_ylim(-2, 8)
plt.show()

# Error metrics on the test split. RMSE is the square root of the MSE, so
# the MSE is computed once and reused.
y_test = test_data['House Value']
test_mse = mean_squared_error(y_test, test_pred)
test_rmse = np.sqrt(test_mse)
test_mae = mean_absolute_error(y_test, test_pred)
test_mape = mean_absolute_percentage_error(y_test, test_pred)

print('Testing MSE: {:.3f}'.format(test_mse))
print('Testing RMSE: {:.3f}'.format(test_rmse))
print('Testing MAE: {:.3f}'.format(test_mae))
print('Testing MAPE: {:.3f}'.format(test_mape))