In [None]:
import warnings 
# Silence every warning for the rest of the session. Note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

## Model Validation
- 학습하고 생성한 모델을 사용하기에 적합한지 검증하려는 것
- 캐패시티(파라미터의 개수)가 높아질수록 모델의 데이터에 대한 설명력도 증가한다.
- 검증 단계에선 모델을 여러개 만들어 가장 적합한 모델을 선택한다. -> 그러나 크게 의미는 없다. 이것이 미래의 성능을 담보할 수 없기 때문이다.
- 모델링의 목적은 일반화된 모델을 생성하는 것.
> 일반화된 모델이란? -> 학습에 사용되지 않은 데이터(미래 데이터)에서도 비슷한 성능을 제공해 줄 수 있어야 한다.
- (부작용)과거 데이터에 최적화 하면 결국 오버피팅을 피할 수 없다.
### Generalization Error
과거 데이터를 쪼개서 쓴다. 
#### Testing Error
학습 데이터를 과거로, 검증 데이터를 미래로 가정하여 성능을 측정한다. 분할 비율(split size)은 하이퍼파라미터가 된다.
- 이때 train과 test를 유사하게 뽑기 위해 랜덤하게 뽑는다.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Electric.csv file hosted on GitHub into a DataFrame.
url='https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'
elec = pd.read_csv(url)

# Print a summary of the frame (columns, dtypes, non-null counts).
elec.info()

In [None]:
# Scatter of electricity against surface_area for the full dataset.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x='surface_area', y='electricity', data=elec)
plt.show()

In [None]:
# Straight-line (default first-order) fit of electricity on surface_area,
# drawn in red on top of the white-edged scatter points.
sns.regplot(
    data=elec,
    x='surface_area',
    y='electricity',
    scatter_kws=dict(edgecolor='white'),
    line_kws=dict(color='red'),
)
plt.xlim(505, 820)  # fixed horizontal range for the figure
plt.show()

### 2차 모델 시각화

In [None]:
# Second-order polynomial fit of electricity on surface_area, drawn in blue.
sns.regplot(
    data=elec,
    x='surface_area',
    y='electricity',
    order=2,
    scatter_kws=dict(edgecolor='white'),
    line_kws=dict(color='blue'),
)
plt.xlim(505, 820)  # fixed horizontal range for the figure
plt.show()

### 5차 모델 시각화

In [None]:
# Fifth-order polynomial fit of electricity on surface_area, drawn in green.
sns.regplot(
    data=elec,
    x='surface_area',
    y='electricity',
    order=5,
    scatter_kws=dict(edgecolor='white'),
    line_kws=dict(color='green'),
)
plt.xlim(505, 820)  # fixed horizontal range for the figure
plt.show()

### 9차 모델 시각화

In [None]:
# Ninth-order polynomial fit of electricity on surface_area, drawn in orange.
sns.regplot(
    data=elec,
    x='surface_area',
    y='electricity',
    order=9,
    scatter_kws=dict(edgecolor='white'),
    line_kws=dict(color='orange'),
)
plt.xlim(505, 820)  # fixed horizontal range for the figure
plt.ylim(50, 450)   # clip the vertical range of the figure
plt.show()

### 시각화 비교

In [None]:
# Overlay the 1st-, 2nd-, 5th- and 9th-order fits on a single set of axes so
# the curves can be compared directly. Each call also redraws the scatter.
for poly_order, line_color in ((1, 'red'), (2, 'blue'), (5, 'green'), (9, 'orange')):
    sns.regplot(
        data=elec,
        x='surface_area',
        y='electricity',
        order=poly_order,
        scatter_kws=dict(edgecolor='white'),
        line_kws=dict(color=line_color),
    )
plt.xlim(505, 820)
plt.ylim(50, 450)
plt.show()

## Training Error

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Electric.csv file hosted on GitHub into a DataFrame.
url='https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'
elec = pd.read_csv(url)

### 1차

In [None]:
# Use the whole dataset as the training set: a one-column feature frame
# (double brackets keep it 2-D) and the target series.
X_train = elec[['surface_area']]
y_train = elec['electricity']

# Display both shapes as a quick sanity check.
X_train.shape,y_train.shape

In [None]:
from sklearn.linear_model import LinearRegression

# First-order model: linear regression of electricity on surface_area.
# fit() returns the estimator itself, so construction and fitting are chained.
Model1 = LinearRegression().fit(X_train, y_train)
Model1

In [None]:
# Show the fitted slope(s) and intercept of the first-order model.
print(Model1.coef_)
print(Model1.intercept_)

### y_hat 생성

In [None]:
# Predict on the same data the model was fitted on (used for the training error).
y_hat_1 = Model1.predict(X_train)

In [None]:
# Training mean squared error of the first-order model:
# the average of the squared residuals.
TR_ERR_1 = np.mean(np.square(y_train - y_hat_1))
TR_ERR_1

### 5차

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the single input column into polynomial terms up to degree 5.
# include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=5,include_bias = False)
px_5 = poly.fit_transform(X_train)

In [None]:
# Show the shapes of the raw inputs and of the degree-5 expanded features.
X_train.shape,px_5.shape

- 5차 모델 생성

In [None]:
from sklearn.linear_model import LinearRegression

# Fifth-order model: linear regression on the degree-5 polynomial features.
# fit() returns the estimator itself, so construction and fitting are chained.
model5 = LinearRegression().fit(px_5, y_train)
model5

In [None]:
# Show the fitted coefficients and intercept of the fifth-order model.
print(model5.coef_)
print(model5.intercept_)

In [None]:
# Build the prediction features with the transformer that was already fitted
# when px_5 was created. transform() applies it without refitting, which is
# the correct call at prediction time.
px_5_pred = poly.transform(X_train)
y_hat_5=model5.predict(px_5_pred)

In [None]:
# Check the shape of the fifth-order predictions.
y_hat_5.shape

In [None]:
# Training mean squared error of the fifth-order model:
# the average of the squared residuals.
TR_ERR_5 = np.mean(np.square(y_train - y_hat_5))
TR_ERR_5

### 9차

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the single input column into polynomial terms up to degree 9.
# Note that this rebinds `poly`, replacing the degree-5 transformer.
poly = PolynomialFeatures(degree=9,include_bias = False)
px_9 = poly.fit_transform(X_train)
# Display the shape of the expanded feature matrix.
px_9.shape

In [None]:
from sklearn.linear_model import LinearRegression

# Ninth-order model: linear regression on the degree-9 polynomial features.
model9 = LinearRegression()
model9.fit(px_9,y_train)

# Report the parameters of the model fitted in this cell. The earlier version
# printed model5's parameters here, a copy-paste slip.
print(model9.coef_)
print(model9.intercept_)

In [None]:
# Build the prediction features with the transformer that was already fitted
# when px_9 was created. transform() applies it without refitting, which is
# the correct call at prediction time.
px_9_pred = poly.transform(X_train)
y_hat_9=model9.predict(px_9_pred)

In [None]:
# Check the shape of the ninth-order predictions.
y_hat_9.shape

In [None]:
# Training mean squared error of the ninth-order model:
# the average of the squared residuals.
TR_ERR_9 = np.mean(np.square(y_train - y_hat_9))
TR_ERR_9

### 모델 비교

In [None]:
# Print the training MSE of the 1st-, 5th- and 9th-order models side by side.
print('1차 : ',TR_ERR_1)
print('5차 : ',TR_ERR_5)
print('9차 : ',TR_ERR_9)

### 잔차 시각화
- 1차

In [None]:
# Residuals of the first-order fit of electricity on surface_area.
sns.residplot(
    data=elec,
    x='surface_area',
    y='electricity',
    order=1,
    scatter_kws=dict(edgecolor='white'),
)
plt.ylim(-300, 300)  # fixed vertical range for the residual plot
plt.show()

In [None]:
# Residuals of the fifth-order fit of electricity on surface_area.
sns.residplot(
    data=elec,
    x='surface_area',
    y='electricity',
    order=5,
    scatter_kws=dict(edgecolor='white'),
)
plt.ylim(-300, 300)  # fixed vertical range for the residual plot
plt.show()

In [None]:
# Residuals of the ninth-order fit of electricity on surface_area.
sns.residplot(
    data=elec,
    x='surface_area',
    y='electricity',
    order=9,
    scatter_kws=dict(edgecolor='white'),
)
plt.ylim(-300, 300)  # fixed vertical range for the residual plot
plt.show()

## Testing Error

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the Electric.csv file hosted on GitHub into a DataFrame.
url='https://raw.githubusercontent.com/rusita-ai/pyData/master/Electric.csv'
elec = pd.read_csv(url)

# Display (rows, columns) of the loaded frame.
elec.shape

In [None]:
from sklearn.model_selection import train_test_split

# Randomly hold out 20% of the rows as a test frame.
# The fixed seed makes the split reproducible.
TR_elec, TE_elec = train_test_split(
    elec,
    random_state=2045,
    test_size=0.2,
)

In [None]:
# Show the shapes of the train and test frames.
TR_elec.shape,TE_elec.shape

In [None]:
# Preview the first rows of the training frame.
TR_elec.head()

In [None]:
from sklearn.model_selection import train_test_split

# Split features and target together, with the same test_size and
# random_state as the DataFrame-level split (TR_elec / TE_elec).
X_train, X_test, y_train, y_test = train_test_split(
    elec[['surface_area']],
    elec['electricity'],
    random_state=2045,
    test_size=0.2,
)

In [None]:
# Scatter of electricity against surface_area for the training frame.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x='surface_area', y='electricity', data=TR_elec)
plt.show()

In [None]:
# Scatter of electricity against surface_area for the held-out test frame.
# x and y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x='surface_area', y='electricity', data=TE_elec)
plt.show()

### 1차 모델 Testing Error

In [None]:
from sklearn.linear_model import LinearRegression

# First-order model fitted on the training split only.
# (Fixes the misspelled `X_traiin`, which raised a NameError.)
model1=LinearRegression()
model1.fit(X_train,y_train)

- predict

In [None]:
# Predict on the held-out test inputs and check the shape of the result.
y_hat_1 = model1.predict(X_test)
y_hat_1.shape

In [None]:
from sklearn.metrics import mean_squared_error
# Test mean squared error of the first-order model.
TE_Err_1 =mean_squared_error(y_test,y_hat_1)
TE_Err_1

### 5차 모델 

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Fit the degree-5 expansion on the training inputs and transform them.
# include_bias=False omits the constant column.
poly = PolynomialFeatures(degree=5,include_bias=False)
px_5_TR = poly.fit_transform(X_train)

In [None]:
# Fresh, unfitted estimator for the fifth-order model.
model5=LinearRegression()

In [None]:
# Fit the fifth-order model on the expanded training features.
model5.fit(px_5_TR,y_train)

In [None]:
# Expand the held-out inputs with the transformer already fitted on the
# training split. transform() only applies it, so preprocessing is never
# refitted on test data.
px_5_TE = poly.transform(X_test)
y_hat_5=model5.predict(px_5_TE)

In [None]:
# Test mean squared error of the fifth-order model.
TE_Err_5 = mean_squared_error(y_test,y_hat_5)
TE_Err_5

### 9차 모델

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Fit the degree-9 expansion on the training inputs and transform them.
# Note that this rebinds `poly`, replacing the degree-5 transformer.
poly = PolynomialFeatures(degree=9,include_bias=False)
px_9_TR = poly.fit_transform(X_train)


In [None]:
# Ninth-order model fitted on the expanded training features.
# fit() returns the estimator itself, so construction and fitting are chained.
model9 = LinearRegression().fit(px_9_TR, y_train)
model9

In [None]:
# Expand the held-out inputs with the transformer already fitted on the
# training split. transform() only applies it, so preprocessing is never
# refitted on test data.
px_9_TE = poly.transform(X_test)
y_hat_9=model9.predict(px_9_TE)
# Test mean squared error of the ninth-order model.
TE_Err_9 = mean_squared_error(y_test,y_hat_9)
TE_Err_9

### 3개 모델 비교

In [None]:
# Print the test MSE of the 1st-, 5th- and 9th-order models side by side.
print('1차 : ',TE_Err_1)
print('5차 : ',TE_Err_5)
print('9차 : ',TE_Err_9)

## Validation approach 
엄격하게 접근하기 위해서 사용함.
- 데이터를 3개로 쪼갠다.
- training, test, validation

## Regression Analysis - 수치예측
y~ w*X+b \
y: Output(종속변수,목표변수) , X: Input(독립변수)
### Scaling
- 범위가 다른 변수를 맞추기 위해 스케일링을 사용한다.
- MinMaxScaler(정규화),StandardScaler(표준화)