# Model Validation

In [None]:
import warnings
# Install a global 'ignore' filter: every warning raised after this cell is hidden.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries used below — confirm that silencing everything is intended.
warnings.filterwarnings('ignore')

# I. Model Capacity

* import Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

* Colab File Upload
 - Electric.csv & Cars.csv

In [None]:
# List the working directory so the uploaded CSV files can be checked by eye.
!ls -l

* pandas DataFrame

In [None]:
# Read the electricity dataset from the CSV file in the working directory.
Elec = pd.read_csv(filepath_or_buffer = 'Electric.csv')

# Print column names, dtypes and non-null counts.
Elec.info()

* 산점도(surface_area vs. electricity)

In [None]:
# Raw relationship between surface area and electricity, before any model is fitted.
plt.scatter(Elec.surface_area, Elec.electricity)
# Label both axes so the figure can be read on its own, like the regplot
# figures below, which are labelled with the column names.
plt.xlabel('surface_area')
plt.ylabel('electricity')
plt.show()

> ## 1) 1차 모델 시각화

In [None]:
# First-order (straight-line) fit drawn in red over the scatter.
sns.regplot(data = Elec, x = 'surface_area', y = 'electricity',
            line_kws = dict(color = 'red'))
plt.show()

> ## 2) 2차 모델 시각화

In [None]:
# Second-order polynomial fit drawn in blue over the scatter.
sns.regplot(data = Elec, x = 'surface_area', y = 'electricity',
            order = 2,
            line_kws = dict(color = 'blue'))
plt.show()

> ## 3) 5차 모델 시각화

In [None]:
# Fifth-order polynomial fit drawn in green over the scatter.
sns.regplot(data = Elec, x = 'surface_area', y = 'electricity',
            order = 5,
            line_kws = dict(color = 'green'))
plt.show()

> ## 4) 9차 모델 시각화

In [None]:
# Ninth-order polynomial fit drawn in orange over the scatter.
sns.regplot(data = Elec, x = 'surface_area', y = 'electricity',
            order = 9,
            line_kws = dict(color = 'orange'))
# Pin the y-axis to 50-450.
# NOTE(review): presumably this keeps the high-order curve from stretching the axis — confirm.
plt.ylim(50, 450)
plt.show()

> ## 5) 4개 모델 비교 시각화

In [None]:
# Overlay the 1st-, 2nd-, 5th- and 9th-order fits on a single figure.
# Each entry is (polynomial order, line color, extra regplot keywords);
# only the last (9th-order) call overrides the scatter color.
curves = [
    (1, 'red',    {}),
    (2, 'blue',   {}),
    (5, 'green',  {}),
    (9, 'orange', {'scatter_kws': {"color":"gray"}}),
]
for degree, line_color, extra_kws in curves:
    sns.regplot(x = 'surface_area', y = 'electricity', data = Elec,
                line_kws = {'color': line_color}, order = degree, **extra_kws)

# Same fixed window for every curve.
plt.ylim(50, 450)
plt.xlim(500, 800)
plt.show()

# II. Training Error
* import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

* pandas DataFrame

In [None]:
# Load the electricity dataset again at the start of this section.
Elec = pd.read_csv(filepath_or_buffer = 'Electric.csv')

# Print column names, dtypes and non-null counts.
Elec.info()

> ## 1) 1차 모델 Training Error

* Reshape X_train & y_train

In [None]:
# Turn each column into an (n_samples, 1) NumPy array.
X_train = Elec['surface_area'].to_numpy().reshape(-1, 1)
y_train = Elec['electricity'].to_numpy().reshape(-1, 1)

# Show both shapes as a check.
X_train.shape, y_train.shape

* 모델 생성

In [None]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so creation and training are chained.
Model_1 = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell output.
Model_1

* 모델 정보(학습결과) 확인

In [None]:
# Learned slope(s) of the first-order model.
print(Model_1.coef_)
# Learned intercept of the first-order model.
print(Model_1.intercept_)

* y(실제값) 지정

In [None]:
# Actual target values as an (n_samples, 1) array, used by the MSE cells below.
y = Elec['electricity'].to_numpy().reshape(-1, 1)

y.shape

* y_hat(예측값) 생성

In [None]:
# Predict on the same rows the model was trained on (training-error setting).
y_hat_1 = Model_1.predict(X_train)

# Show the prediction shape so it can be compared with y.
y_hat_1.shape

* MSE(Mean Squared Error) 계산

In [None]:
# Mean squared error: average of the squared residuals (actual - predicted).
TR_Err_1 = np.square(y - y_hat_1).mean()

TR_Err_1

> ## 2) 5차 모델 Training Error

* 모델 생성

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the single feature into its powers 1..5 (no constant column).
poly = PolynomialFeatures(degree = 5, include_bias = False)
PX_5 = poly.fit(X_train).transform(X_train)

# Display the expanded feature matrix.
PX_5

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression on the 5th-order polynomial features.
Model_5 = LinearRegression().fit(PX_5, y_train)

# Display the fitted estimator as the cell output.
Model_5

* 모델 정보(학습결과) 확인

In [None]:
# Global NumPy print setting: fixed-point notation with 8 decimals.
# It stays in effect for every array printed later in this kernel session.
np.set_printoptions(suppress = True, precision = 8)

# Learned coefficients, one per polynomial term.
print(Model_5.coef_)
# Learned intercept.
print(Model_5.intercept_)

* y_hat(예측값) 생성

In [None]:
# Reuse the transformer that was fitted when PX_5 was built. At prediction time
# only transform() is needed; calling fit_transform() again would refit it.
PX_5_pred = poly.transform(X_train)

# Predictions of the 5th-order model on the training rows.
y_hat_5 = Model_5.predict(PX_5_pred)

y_hat_5.shape

* MSE(Mean Squared Error) 계산

In [None]:
# Mean squared error of the 5th-order model on the training rows.
TR_Err_5 = np.square(y - y_hat_5).mean()

TR_Err_5

> ## 3) 9차 모델 Training Error

* 모델 생성

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the single feature into its powers 1..9 (no constant column).
# This rebinds `poly`, replacing the 5th-order transformer.
poly = PolynomialFeatures(degree = 9, include_bias = False)
PX_9 = poly.fit(X_train).transform(X_train)

PX_9.shape

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression on the 9th-order polynomial features.
Model_9 = LinearRegression().fit(PX_9, y_train)

# Display the fitted estimator as the cell output.
Model_9

* 모델 정보(학습결과) 확인

In [None]:
# Learned coefficients, one per polynomial term.
print(Model_9.coef_)
# Learned intercept.
print(Model_9.intercept_)

* y_hat(예측값) 생성

In [None]:
# Reuse the transformer that was fitted when PX_9 was built. At prediction time
# only transform() is needed; calling fit_transform() again would refit it.
PX_9_pred = poly.transform(X_train)

# Predictions of the 9th-order model on the training rows.
y_hat_9 = Model_9.predict(PX_9_pred)

y_hat_9.shape

* MSE(Mean Squared Error) 계산

In [None]:
# Mean squared error of the 9th-order model on the training rows.
TR_Err_9 = np.square(y - y_hat_9).mean()

TR_Err_9

> ## 4) 3개 모델 Training Error 비교

In [None]:
# Print the three training errors side by side, one model per line.
for label, train_err in (('1차 모델 : ', TR_Err_1),
                         ('5차 모델 : ', TR_Err_5),
                         ('9차 모델 : ', TR_Err_9)):
    print(label, train_err)

# III. Testing Error
* import Packages

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

* pandas DataFrame

In [8]:
# Load the electricity dataset for the train/test experiments.
Elec = pd.read_csv(filepath_or_buffer = 'Electric.csv')

# (rows, columns) of the full dataset.
Elec.shape

(768, 9)

In [9]:
# Preview the first ten rows.
Elec.head(10)

Unnamed: 0,compactness,surface_area,wall_area,roof_area,height,orientation,glazing_area,glazing_area_distribution,electricity
0,0.98,514.5,294.0,110.25,7.0,2,0.0,0,155.5
1,0.98,514.5,294.0,110.25,7.0,3,0.0,0,155.5
2,0.98,514.5,294.0,110.25,7.0,4,0.0,0,155.5
3,0.98,514.5,294.0,110.25,7.0,5,0.0,0,155.5
4,0.9,563.5,318.5,122.5,7.0,2,0.0,0,208.4
5,0.9,563.5,318.5,122.5,7.0,3,0.0,0,214.6
6,0.9,563.5,318.5,122.5,7.0,4,0.0,0,207.1
7,0.9,563.5,318.5,122.5,7.0,5,0.0,0,196.8
8,0.86,588.0,294.0,147.0,7.0,2,0.0,0,195.0
9,0.86,588.0,294.0,147.0,7.0,3,0.0,0,199.5


> ## Train_Data vs. Test_Data

### (1) DataFrame Split
* 8:2 Split(614:154)
* 80% Train_DF & 20% Test_DF

In [16]:
from sklearn.model_selection import train_test_split

# Split the whole DataFrame 80/20; the fixed random_state makes the split repeatable.
TR_Elec, TE_Elec = train_test_split(Elec, test_size = 0.2, random_state = 2045)

In [7]:
# Row/column counts of the train and test DataFrames.
TR_Elec.shape, TE_Elec.shape

((614, 9), (154, 9))

* 80% TR_Elec DataFrame

In [17]:
# Preview the training part; the index shows the original (shuffled) row labels.
TR_Elec.head()

Unnamed: 0,compactness,surface_area,wall_area,roof_area,height,orientation,glazing_area,glazing_area_distribution,electricity
555,0.74,686.0,245.0,220.5,3.5,5,0.4,1,145.5
355,0.79,637.0,343.0,147.0,7.0,5,0.25,2,389.8
200,0.86,588.0,294.0,147.0,7.0,2,0.1,4,264.4
669,0.62,808.5,367.5,220.5,3.5,3,0.4,3,163.5
561,0.69,735.0,294.0,220.5,3.5,3,0.4,1,147.0


* 20% TE_Elec DataFrame

In [11]:
# Preview the test part; the index shows the original (shuffled) row labels.
TE_Elec.head()

Unnamed: 0,compactness,surface_area,wall_area,roof_area,height,orientation,glazing_area,glazing_area_distribution,electricity
414,0.71,710.5,269.5,220.5,3.5,4,0.25,3,121.0
475,0.64,784.0,343.0,220.5,3.5,5,0.25,4,166.2
511,0.71,710.5,269.5,220.5,3.5,5,0.25,5,122.7
213,0.76,661.5,416.5,122.5,7.0,3,0.1,4,323.8
339,0.98,514.5,294.0,110.25,7.0,5,0.25,2,286.0


### (2) Array Split
* X_train, X_test & y_train, y_test

In [18]:
from sklearn.model_selection import train_test_split

# Split features and target together with the same 80/20 ratio and seed.
# Double brackets keep X as a one-column DataFrame; y stays a Series.
X_train, X_test, y_train, y_test = train_test_split(Elec[['surface_area']], Elec['electricity'], 
                                                    test_size = 0.2, random_state = 2045)

In [19]:
# Shapes of the four pieces produced by the split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((614, 1), (614,), (154, 1), (154,))

* 80% X_train Array

In [20]:
# Preview the training features.
X_train.head()

Unnamed: 0,surface_area
555,686.0
355,637.0
200,588.0
669,808.5
561,735.0


* 80% y_train Array

In [21]:
# Preview the training targets.
y_train.head()

555    145.5
355    389.8
200    264.4
669    163.5
561    147.0
Name: electricity, dtype: float64

> ## 1) 1차 모델 Testing Error

* Train_Data로 모델 생성

In [22]:
from sklearn.linear_model import LinearRegression

# Train the first-order model on the training split only.
Model_1 = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell output.
Model_1

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [32]:
# Learned slope of the first-order model.
print(Model_1.coef_)
# Learned intercept of the first-order model.
print(Model_1.intercept_)

[-0.74323216]
721.7548044195229


* Test_Data로 y_hat(예측값) 생성

In [23]:
# Predict on the held-out test features.
y_hat_1 = Model_1.predict(X_test)

# One prediction per test row.
y_hat_1.shape

(154,)

In [42]:
# Predict for a single surface_area value, passed as a 1x1 array.
# NOTE(review): 85 is far below the surface_area values shown in the previews
# above (roughly 500-800), so this is an extrapolation — confirm the value is intended.
Model_1.predict(np.array([[85]]))

array([658.58007084])

In [24]:
# The test targets must have the same length as the predictions.
y_test.shape

(154,)

* Test_Data로 MSE(Mean Squared Error) 계산

In [25]:
from sklearn.metrics import mean_squared_error

# Test-set MSE of the first-order model.
TE_Err_1 = mean_squared_error(y_true = y_test, y_pred = y_hat_1)
TE_Err_1

6044.176547629271

> ## 2) 5차 모델 Testing Error

* Train_Data로 모델 생성

In [29]:
from sklearn.preprocessing import PolynomialFeatures

# Fit the 5th-order expansion on the training features, then expand them.
poly = PolynomialFeatures(degree = 5, include_bias = False)
PX_5_TR = poly.fit(X_train).transform(X_train)

# One input column becomes five polynomial columns.
X_train.shape, PX_5_TR.shape

((614, 1), (614, 5))

In [30]:
from sklearn.linear_model import LinearRegression

# Train the 5th-order model on the expanded training features.
Model_5 = LinearRegression().fit(PX_5_TR, y_train)

# Display the fitted estimator as the cell output.
Model_5

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [31]:
# Learned coefficients, one per polynomial term.
print(Model_5.coef_)
# Learned intercept.
print(Model_5.intercept_)

[-3.07056749e-04 -1.00141539e-01  3.69529793e-04 -4.91890831e-07
  2.24941149e-10]
2806.9743467624226


* Test_Data로 y_hat(예측값) 생성

In [33]:
# Expand the test features with the transformer fitted on the training data.
# Test data must only be transformed, never used to (re)fit a preprocessing step.
# PolynomialFeatures gives the same values either way, but fit_transform() on
# test data leaks information with any transformer that learns from its input.
PX_5_TE = poly.transform(X_test)

# Predictions of the 5th-order model on the test rows.
y_hat_5 = Model_5.predict(PX_5_TE)

* Test_Data로 MSE(Mean Squared Error) 계산

In [34]:
from sklearn.metrics import mean_squared_error

# Test-set MSE of the 5th-order model.
TE_Err_5 = mean_squared_error(y_true = y_test, y_pred = y_hat_5)
TE_Err_5

4330.604566409588

> ## 3) 9차 모델 Testing Error

* Train_Data로 모델 생성

In [35]:
from sklearn.preprocessing import PolynomialFeatures

# Fit the 9th-order expansion on the training features, then expand them.
# This rebinds `poly`, replacing the 5th-order transformer.
poly = PolynomialFeatures(degree = 9, include_bias = False)
PX_9_TR = poly.fit(X_train).transform(X_train)

# One input column becomes nine polynomial columns.
X_train.shape, PX_9_TR.shape

((614, 1), (614, 9))

In [36]:
from sklearn.linear_model import LinearRegression

# Train the 9th-order model on the expanded training features.
Model_9 = LinearRegression().fit(PX_9_TR, y_train)

# Display the fitted estimator as the cell output.
Model_9

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [37]:
# Learned coefficients, one per polynomial term.
# NOTE(review): the recorded values are extremely small (down to ~1e-26), which
# suggests the raw high-order powers are badly scaled — worth verifying.
print(Model_9.coef_)
# Learned intercept.
print(Model_9.intercept_)

[ 4.76264641e-26 -1.70469344e-19  2.17816857e-20  9.54163210e-18
  3.15504805e-15  6.29094463e-13 -2.27204154e-15  2.75642039e-18
 -1.11981996e-21]
-435.349494746201


* Test_Data로 y_hat(예측값) 생성

In [38]:
# Expand the test features with the transformer fitted on the training data.
# Test data must only be transformed, never used to (re)fit a preprocessing step.
# PolynomialFeatures gives the same values either way, but fit_transform() on
# test data leaks information with any transformer that learns from its input.
PX_9_TE = poly.transform(X_test)

# Predictions of the 9th-order model on the test rows.
y_hat_9 = Model_9.predict(PX_9_TE)

* Test_Data로 MSE(Mean Squared Error) 계산

In [39]:
from sklearn.metrics import mean_squared_error

# Test-set MSE of the 9th-order model.
TE_Err_9 = mean_squared_error(y_true = y_test, y_pred = y_hat_9)
TE_Err_9

4238.689067137622

> ## 4) 3개 모델 Testing Error 비교

In [40]:
# Print the three test errors side by side, one model per line.
for label, test_err in (('1차 모델 : ', TE_Err_1),
                        ('5차 모델 : ', TE_Err_5),
                        ('9차 모델 : ', TE_Err_9)):
    print(label, test_err)

1차 모델 :  6044.176547629271
5차 모델 :  4330.604566409588
9차 모델 :  4238.689067137622


# IV. Validation Approach
* import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

* pandas DataFrame

In [None]:
# Read the cars dataset from the CSV file in the working directory.
CARS = pd.read_csv(filepath_or_buffer = 'Cars.csv')

# Print column names, dtypes and non-null counts.
CARS.info()

* 산점도(weight vs. mpg)

In [None]:
# Raw relationship between weight and mpg, drawn with small markers (s = 3).
plt.scatter(CARS.weight, CARS.mpg, s = 3)
# Label both axes so the figure can be read on its own.
plt.xlabel('weight')
plt.ylabel('mpg')
plt.show()

> ## Train vs. Validation vs. Test
* 6:2:2 Split(234:79:79)

> ## sklearn Package 사용
* train_test_split( )

* 20% Test_Data(79)

In [None]:
from sklearn.model_selection import train_test_split

# First cut: set aside 20% of the rows as the final test set.
# The remaining 80% is split again into train/validation in the next cell.
X_remain, X_test, y_remain, y_test = train_test_split(CARS[['weight']], CARS['mpg'], 
                                                      test_size = 0.2, random_state = 2045)

# Shapes of the remaining and test pieces.
X_remain.shape, y_remain.shape, X_test.shape, y_test.shape

* 60% Train_Data(234) & 20% Validation_Data(79)

In [None]:
# Second cut: test_size is an integer here, i.e. an absolute count of 79 rows
# held out for validation, matching the 6:2:2 split stated above.
X_train, X_valid, y_train, y_valid = train_test_split(X_remain, y_remain, 
                                                      test_size = 79, random_state = 2045)

# Shapes of the train and validation pieces.
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

> ## 1) 1차 모델 Validation Error

* Train_Data로 모델 생성

In [None]:
from sklearn.linear_model import LinearRegression

# Train the first-order model on the training split only.
Model_1 = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell output.
Model_1

* Validation_Data로 y_hat(예측값) 생성 및 MSE 계산

In [None]:
from sklearn.metrics import mean_squared_error

# Validation-set MSE of the first-order model.
MSE_1 = mean_squared_error(y_true = y_valid, y_pred = Model_1.predict(X_valid))
MSE_1

> ## 2) 9차 모델 Validation Error

* Train_Data로 모델 생성

In [None]:
# Imported here so this section does not depend on an earlier section having
# been run; the rest of Section IV already re-imports what it uses.
from sklearn.preprocessing import PolynomialFeatures

# Fit the 9th-order expansion on the training features and expand them.
poly = PolynomialFeatures(degree = 9, include_bias = False)
PX_9_TR = poly.fit_transform(X_train)

# Train the 9th-order model on the expanded training features.
Model_9 = LinearRegression()
Model_9.fit(PX_9_TR, y_train)

* Validation_Data로 y_hat(예측값) 생성 및 MSE 계산

In [None]:
from sklearn.metrics import mean_squared_error

# Expand the validation features with the transformer fitted on the training
# data; held-out data is only transformed, never used to refit preprocessing.
PX9_valid = poly.transform(X_valid)

# Validation-set MSE of the 9th-order model.
MSE_9 = mean_squared_error(y_valid, Model_9.predict(PX9_valid))
MSE_9

> ## 3) 2개 모델 Validation Error 비교

In [None]:
# Print the two validation errors side by side, one model per line.
for label, valid_err in (('1차 모델 MSE_1  : ', MSE_1),
                         ('9차 모델 MSE_9  : ', MSE_9)):
    print(label, valid_err)

> ## 4) 최종 9차 모델을 Test_Data에 적용

* Test_Data로 y_hat(예측값) 생성 및 MSE 계산

In [None]:
# Expand the test features with the transformer fitted on the training data;
# the test set is only transformed, never used to refit preprocessing.
PX9_TE = poly.transform(X_test)

# Final test-set MSE of the selected 9th-order model.
mean_squared_error(y_test, Model_9.predict(PX9_TE))

# 
# 
# 
# The End
# 
# 
# 