In [1]:
# 모델 과적합 관찰
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

from math import sqrt

In [2]:
ADMISSION_FILE_PATH = "../data/admission_data.csv"
admission_df = pd.read_csv(ADMISSION_FILE_PATH).drop(['Serial No.'], axis ='columns')
admission_df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [3]:
X = admission_df.drop(['Chance of Admit '], axis='columns')

polynomial_transformer = PolynomialFeatures(6)
polynomial_features = polynomial_transformer.fit_transform(X.values)
features = polynomial_transformer.get_feature_names_out(X.columns)

X = pd.DataFrame(data = polynomial_features, columns=features)
X.head()

Unnamed: 0,1,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,GRE Score^2,GRE Score TOEFL Score,...,LOR CGPA^2 Research^3,LOR CGPA Research^4,LOR Research^5,CGPA^6,CGPA^5 Research,CGPA^4 Research^2,CGPA^3 Research^3,CGPA^2 Research^4,CGPA Research^5,Research^6
0,1.0,337.0,118.0,4.0,4.5,4.5,9.65,1.0,113569.0,39766.0,...,419.05125,43.425,4.5,807539.696082,83682.87006,8671.800006,898.632125,93.1225,9.65,1.0
1,1.0,324.0,107.0,4.0,4.0,4.5,8.87,1.0,104976.0,34668.0,...,354.04605,39.915,4.5,487014.306256,54905.784245,6190.054594,697.864103,78.6769,8.87,1.0
2,1.0,316.0,104.0,3.0,3.0,3.5,8.0,1.0,99856.0,32864.0,...,224.0,28.0,3.5,262144.0,32768.0,4096.0,512.0,64.0,8.0,1.0
3,1.0,322.0,110.0,3.0,3.5,2.5,8.67,1.0,103684.0,35420.0,...,187.92225,21.675,2.5,424731.61094,48988.651781,5650.363527,651.714363,75.1689,8.67,1.0
4,1.0,314.0,103.0,2.0,2.0,3.0,8.21,0.0,98596.0,32342.0,...,0.0,0.0,0.0,306237.903347,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
y = admission_df[['Chance of Admit ']]
y.head()

Unnamed: 0,Chance of Admit
0,0.92
1,0.76
2,0.72
3,0.8
4,0.65


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [6]:
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [7]:
y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

In [8]:
mse = mean_squared_error(y_train, y_train_predict)
print("training_set 성능 : {}".format(sqrt(mse)))
mse = mean_squared_error(y_test, y_test_predict)
print("test_set 성능 : {}".format(sqrt(mse)))

training_set 성능 : 0.0015048321591021147
test_set 성능 : 5.090589291875944


In [9]:
# Lasso model
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.001, max_iter=1000, normalize=True) # lambda 역할 -> alpha, normalize -> 자동 feature-scaling normalize
model.fit(X_train, y_train)

y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


In [10]:
mse = mean_squared_error(y_train, y_train_predict)
print("training_set 성능 : {}".format(sqrt(mse)))
mse = mean_squared_error(y_test, y_test_predict)
print("test_set 성능 : {}".format(sqrt(mse)))

training_set 성능 : 0.06336620966147144
test_set 성능 : 0.06007719092689259


In [11]:
# Ridge model
from sklearn.linear_model import Ridge
model = Ridge(alpha=0.001, max_iter=1000, normalize=True) # lambda 역할 -> alpha, normalize -> 자동 feature-scaling normalize
model.fit(X_train, y_train)

y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


In [12]:
mse = mean_squared_error(y_train, y_train_predict)
print("training_set 성능 : {}".format(sqrt(mse)))
mse = mean_squared_error(y_test, y_test_predict)
print("test_set 성능 : {}".format(sqrt(mse)))

training_set 성능 : 0.05327825805894524
test_set 성능 : 0.06669588064649669


In [13]:
# 실습과제_L1 정규화 직접 해보기
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from math import sqrt
import numpy as np
import pandas as pd

INSURANCE_FILE_PATH = '../data/insurance.csv'
insurance_df = pd.read_csv(INSURANCE_FILE_PATH)

insurance_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [14]:
insurance_df = pd.get_dummies(data=insurance_df, columns=['sex', 'smoker', 'region'])

X = insurance_df.drop(['charges'], axis='columns')

polynomial_transformer = PolynomialFeatures(4)
polynomial_features = polynomial_transformer.fit_transform(X.values)
features = polynomial_transformer.get_feature_names(X.columns)

X = pd.DataFrame(polynomial_features, columns=features)
y = insurance_df[['charges']]



In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

model = Lasso(alpha=1, max_iter=2000, normalize=True)
model.fit(X_train, y_train)

y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

print("training set에서 성능")
print("-------------------------")
print("오차: {}\n".format(sqrt(mean_squared_error(y_train, y_train_predict))))
print("testing set에서 성능")
print("-------------------------")
print("오차: {}".format(sqrt(mean_squared_error(y_test, y_test_predict))))

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Lasso())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


training set에서 성능
-------------------------
오차: 4726.636439607449

testing set에서 성능
-------------------------
오차: 4692.232442526969


In [16]:
# 실습과제_L2 정규화 직접 해보기
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from math import sqrt

import numpy as np
import pandas as pd

INSURANCE_FILE_PATH = '../data/insurance.csv'
insurance_df = pd.read_csv(INSURANCE_FILE_PATH)

insurance_df = pd.get_dummies(data=insurance_df, columns=['sex', 'smoker', 'region'])

X = insurance_df.drop(['charges'], axis='columns')

polynomial_transformer = PolynomialFeatures(4)
polynomial_features = polynomial_transformer.fit_transform(X.values)

features = polynomial_transformer.get_feature_names_out(X.columns)

X = pd.DataFrame(polynomial_features, columns=features)
y = insurance_df[['charges']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

model = Ridge(alpha=0.01, max_iter=2000, normalize=True)
model.fit(X_train, y_train)

y_train_predict = model.predict(X_train)
y_test_predict = model.predict(X_test)

print("training set에서 성능")
print("-------------------------")
print("오차: {}\n".format(sqrt(mean_squared_error(y_train, y_train_predict))))
print("testing set에서 성능")
print("-------------------------")
print("오차: {}".format(sqrt(mean_squared_error(y_test, y_test_predict))))

training set에서 성능
-------------------------
오차: 4561.6650975238645

testing set에서 성능
-------------------------
오차: 4692.427560339882


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), Ridge())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * n_samples. 


## Logistic Regression에서 L1, L2?

LogisticRegression(penalty='none')  
LogisticRegression(penalty='l1')  
LogisticRegression(penalty='l2') or LogisticRegression()  
  
## L1, L2 차이
L1 정규화는 여러 thetaθ값들을 0으로 만들어 줍니다. 모델에 중요하지 않다고 생각되는 속성들을 아예 없애주는 거죠. column이 너무 많을 때 사용 
L2 정규화는 thetaθ값들을 0으로 만들기보다는 조금씩 줄여 줍니다. 모델에 사용되는 속성들을 L1처럼 없애지는 않는 거죠. column 갯수 줄일 필요 없을 때 사용