# 과적합 

In [1]:
from sklearn.preprocessing import PolynomialFeatures # 다항속성을 만들어 주는 툴
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from math import sqrt

import numpy as np
import pandas as pd

In [3]:
# 불필요한 열 삭제한 데이터셋 만들기
admission_df = pd.read_csv("admission_data.csv").drop('Serial No.', axis =1)

In [4]:
admission_df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [7]:
# Chance of Admit: 목표변수 , 그 외 변수들: 입력변수
# 입력변수 정의
X = admission_df.drop(['Chance of Admit '], axis=1)

In [13]:
polynomial_transformer = PolynomialFeatures(6) # 가설함수 6차 
polynomial_features = polynomial_transformer.fit_transform(X.values)
features = polynomial_transformer.get_feature_names(X.columns)

In [14]:
X = pd.DataFrame(polynomial_features, columns=features)
X.head()

Unnamed: 0,1,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,GRE Score^2,GRE Score TOEFL Score,...,LOR CGPA^2 Research^3,LOR CGPA Research^4,LOR Research^5,CGPA^6,CGPA^5 Research,CGPA^4 Research^2,CGPA^3 Research^3,CGPA^2 Research^4,CGPA Research^5,Research^6
0,1.0,337.0,118.0,4.0,4.5,4.5,9.65,1.0,113569.0,39766.0,...,419.05125,43.425,4.5,807539.696082,83682.87006,8671.800006,898.632125,93.1225,9.65,1.0
1,1.0,324.0,107.0,4.0,4.0,4.5,8.87,1.0,104976.0,34668.0,...,354.04605,39.915,4.5,487014.306256,54905.784245,6190.054594,697.864103,78.6769,8.87,1.0
2,1.0,316.0,104.0,3.0,3.0,3.5,8.0,1.0,99856.0,32864.0,...,224.0,28.0,3.5,262144.0,32768.0,4096.0,512.0,64.0,8.0,1.0
3,1.0,322.0,110.0,3.0,3.5,2.5,8.67,1.0,103684.0,35420.0,...,187.92225,21.675,2.5,424731.61094,48988.651781,5650.363527,651.714363,75.1689,8.67,1.0
4,1.0,314.0,103.0,2.0,2.0,3.0,8.21,0.0,98596.0,32342.0,...,0.0,0.0,0.0,306237.903347,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
y = admission_df[['Chance of Admit ']]
y.head()

Unnamed: 0,Chance of Admit
0,0.92
1,0.76
2,0.72
3,0.8
4,0.65


In [19]:
x_train, x_test, y_train, y_test =train_test_split(X,y,test_size=0.3,random_state=5)

In [28]:
model = LinearRegression()
model.fit(x_train,y_train) # 학습시키기

LinearRegression()

In [22]:
y_train_predicton = model.predict(x_train)
y_test_predicton = model.predict(x_test)

In [23]:
mse = mean_squared_error(y_train, y_train_predicton)
print("training set에서 성능")
print(sqrt(mse))

training set에서 성능
0.001504798774237806


In [24]:
mse = mean_squared_error(y_test, y_test_predicton)
print("test set에서 성능")
print(sqrt(mse))

test set에서 성능
5.090720284226617


In [25]:
# training set에서만 성능 좋음
# test set에서 성능 안좋음
# -> 과적합

# 과적합 예방하기 -> 정규화
* L1 정규화(Lasso 모델), L2 정규화(Ridge 모델)

## Lasso 모델

In [26]:
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso ## 이부분만 바꿈
from sklearn.metrics import mean_squared_error

from math import sqrt

import numpy as np
import pandas as pd

In [29]:
# 불필요한 열 삭제한 데이터셋 만들기
admission_df = pd.read_csv("admission_data.csv").drop('Serial No.', axis =1)

# Chance of Admit: 목표변수 , 그 외 변수들: 입력변수
# 입력변수 정의
X = admission_df.drop(['Chance of Admit '], axis=1)

polynomial_transformer = PolynomialFeatures(6) # 가설함수 6차 
polynomial_features = polynomial_transformer.fit_transform(X.values)
features = polynomial_transformer.get_feature_names(X.columns)

X = pd.DataFrame(polynomial_features, columns=features)

y = admission_df[['Chance of Admit ']]

x_train, x_test, y_train, y_test =train_test_split(X,y,test_size=0.3,random_state=5)

In [30]:
# 이부분 변함
# 옵션 설정 : alpha는 람다, max_iter은 경사하강법 횟수 최대지정, 
# normalize는 freature scaling 자동실행(값들을 0과 1 사이로 변경)
model = Lasso(alpha = 0.001, max_iter = 1000,normalize=True )
model.fit(x_train,y_train) # 학습시키기

Lasso(alpha=0.001, normalize=True)

In [31]:
y_train_predicton = model.predict(x_train)
y_test_predicton = model.predict(x_test)

In [32]:
mse = mean_squared_error(y_train, y_train_predicton)
print("training set에서 성능")
print(sqrt(mse))

training set에서 성능
0.06336620966147144


In [33]:
mse = mean_squared_error(y_test, y_test_predicton)
print("test set에서 성능")
print(sqrt(mse))

test set에서 성능
0.06007719092689258


In [None]:
# test set의 평균제곱 오차도 월등히 줄어듦을 알 수 있음