# feature scaling example code

In [1]:
import os

import numpy as np
import pandas as pd

from sklearn import preprocessing

In [2]:
# Directory holding the example CSVs. Set the AI_DATASET_DIR environment
# variable to point somewhere else; the fallback is the original hard-coded
# location, so existing setups behave exactly as before.
DATASET_DIR = os.environ.get('AI_DATASET_DIR', 'C:/Users/kth25/ai_dataset')
NBA_FILE_PATH = DATASET_DIR + '/NBA_player_of_the_week.csv'
# Load the NBA "player of the week" table with pandas' default CSV parsing.
nba_player_of_the_week_df = pd.read_csv(NBA_FILE_PATH)

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
nba_player_of_the_week_df.head()

Unnamed: 0,Player,Team,Conference,Date,Position,Height,Weight,Age,Draft Year,Seasons in league,Season,Season short,Pre-draft Team,Real_value,Height CM,Weight KG,Last Season
0,Jayson Tatum,Boston Celtics,East,"Feb 10, 2020",SF,6'8,208,21,2017,2,2019-2020,2020,Duke,0.5,203,94,1
1,Nikola Jokic,Denver Nuggets,West,"Feb 10, 2020",C,7'0,250,25,2014,4,2019-2020,2020,KK Mega Bemax (Serbia),0.5,213,113,1
2,Jaylen Brown,Boston Celtics,East,"Feb 3, 2020",SF,6'7,220,23,2016,3,2019-2020,2020,California,0.5,201,99,1
3,Damian Lillard,Portland Trail Blazers,West,"Feb 3, 2020",G,6'3,195,29,2012,7,2019-2020,2020,Weber State,0.5,190,88,1
4,Pascal Siakam,Toronto Raptors,East,"Jan 27, 2020",F,6'9,230,25,2016,3,2019-2020,2020,New Mexico State,0.5,206,104,1


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns; note how different the value ranges are before any scaling.
nba_player_of_the_week_df.describe()

Unnamed: 0,Weight,Age,Draft Year,Seasons in league,Season short,Real_value,Height CM,Weight KG,Last Season
count,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0
mean,224.567164,26.73806,1996.287313,5.740299,2003.156716,0.68694,201.071642,101.384328,0.023881
std,30.798885,3.400683,11.253558,3.293421,11.470164,0.242007,9.36797,14.011226,0.152734
min,150.0,19.0,1965.0,0.0,1980.0,0.5,175.0,68.0,0.0
25%,205.0,24.0,1987.0,3.0,1994.0,0.5,193.0,93.0,0.0
50%,220.0,26.0,1998.0,5.0,2005.0,0.5,201.0,99.0,0.0
75%,250.0,29.0,2005.0,8.0,2013.0,1.0,208.0,113.0,0.0
max,325.0,40.0,2018.0,17.0,2020.0,1.0,229.0,147.0,1.0


In [5]:
# Keep only the three numeric columns that the scaling examples operate on.
columns_to_scale = ['Height CM', 'Weight KG', 'Age']
height_weight_age_df = nba_player_of_the_week_df[columns_to_scale]
height_weight_age_df.head()

Unnamed: 0,Height CM,Weight KG,Age
0,203,94,21
1,213,113,25
2,201,99,23
3,190,88,29
4,206,104,25


### MinMaxScaler를 이용한 Normalization

MinMaxNormalization이란 data의 최솟값과 최댓값을 이용하여 data를 0~1사이의 범위로 scaling 해주는 방법을 말한다!

x_new = (x_old - x_min) / (x_max - x_min)

In [6]:
# Learn each column's min and max first, then rescale every column to [0, 1].
scaler = preprocessing.MinMaxScaler()
scaler.fit(height_weight_age_df)
normalized_data = scaler.transform(height_weight_age_df)
normalized_data

array([[0.51851852, 0.32911392, 0.0952381 ],
       [0.7037037 , 0.56962025, 0.28571429],
       [0.48148148, 0.39240506, 0.19047619],
       ...,
       [0.48148148, 0.37974684, 0.23809524],
       [0.38888889, 0.21518987, 0.23809524],
       [0.42592593, 0.27848101, 0.52380952]])

In [7]:
# The scaler returns a plain ndarray; wrap it in a DataFrame with short labels.
scaled_column_names = ['Height', 'Weight', 'Age']
normalized_df = pd.DataFrame(data=normalized_data, columns=scaled_column_names)

In [8]:
# Sanity check: after min-max scaling every column should show min 0 and max 1.
normalized_df.describe()

Unnamed: 0,Height,Weight,Age
count,1340.0,1340.0,1340.0
mean,0.482808,0.422586,0.368479
std,0.173481,0.177357,0.161937
min,0.0,0.0,0.0
25%,0.333333,0.316456,0.238095
50%,0.481481,0.392405,0.333333
75%,0.611111,0.56962,0.47619
max,1.0,1.0,1.0


### StandardScaler를 이용한 Standardization

Standardization은 data를 평균이 0, 표준 편차가 1이 되도록 표준화 시켜주는 방법을 말한다!

x_new = (x_old - x_mean) / x_std  (x_mean: 평균, x_std: 표준편차)

In [9]:
# Learn each column's mean and standard deviation, then convert to z-scores.
# NOTE: this rebinds `scaler`, replacing the MinMaxScaler created earlier.
scaler = preprocessing.StandardScaler()
scaler.fit(height_weight_age_df)
standardized_data = scaler.transform(height_weight_age_df)
standardized_data

array([[ 0.20592274, -0.52722617, -1.68795564],
       [ 1.27378837,  0.82933556, -0.51128218],
       [-0.00765038, -0.17023624, -1.09961891],
       ...,
       [-0.00765038, -0.24163423, -0.80545055],
       [-0.54158319, -1.16980804, -0.80545055],
       [-0.32801007, -0.81281811,  0.95955965]])

In [10]:
# The scaler returns a plain ndarray; wrap it in a DataFrame with short labels.
standardized_column_names = ['Height', 'Weight', 'Age']
standardized_df = pd.DataFrame(data=standardized_data, columns=standardized_column_names)

In [11]:
# Sanity check: each column should have mean ~0 and std ~1. describe() reports
# the sample standard deviation (ddof=1) while StandardScaler divides by the
# population one (ddof=0), which is why std shows 1.000373 rather than exactly 1.
standardized_df.describe()

Unnamed: 0,Height,Weight,Age
count,1340.0,1340.0,1340.0
mean,-1.070288e-15,-7.049088e-16,-3.557685e-16
std,1.000373,1.000373,1.000373
min,-2.784101,-2.383574,-2.276292
25%,-0.8619429,-0.5986242,-0.8054505
50%,-0.007650381,-0.1702362,-0.2171138
75%,0.7398556,0.8293356,0.6653913
max,2.982373,3.256867,3.901243


# One-hot Encoding Example

In [12]:
# Directory holding the example CSVs. Set the AI_DATASET_DIR environment
# variable to point somewhere else; the fallback is the original hard-coded
# location, so existing setups behave exactly as before.
DATASET_DIR = os.environ.get('AI_DATASET_DIR', 'C:/Users/kth25/ai_dataset')
TITANIC_FILE_PATH = DATASET_DIR + '/titanic.csv'
# Load the Titanic passenger table and preview its first rows.
titanic_df = pd.read_csv(TITANIC_FILE_PATH)
titanic_df.head()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
# Isolate the two categorical columns used in the one-hot encoding demo.
categorical_columns = ['Sex', 'Embarked']
titanic_sex_embarked = titanic_df[categorical_columns]

In [14]:
# Preview the raw categorical values before encoding.
titanic_sex_embarked.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [15]:
# One-hot encode every column of the frame: each distinct category becomes its
# own indicator column named "<column>_<category>".
one_hot_encoded_df = pd.get_dummies(titanic_sex_embarked)

In [16]:
# Preview the indicator columns produced by the encoding.
one_hot_encoded_df.head()

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1
1,1,0,1,0,0
2,1,0,0,0,1
3,1,0,0,0,1
4,0,1,0,0,1


In [17]:
# To one-hot encode only selected columns of a larger frame, name them in
# `columns`; every other column is passed through unchanged.
# NOTE: this rebinds `one_hot_encoded_df`, replacing the two-column result
# produced by the earlier get_dummies cell.
one_hot_encoded_df = pd.get_dummies(data=titanic_df,columns=['Sex','Embarked'])
one_hot_encoded_df.head()

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,0,3,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,0,1,0,0,1
1,1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,0,1,0,0
2,2,1,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,1,0,0,0,1
3,3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,0,0,0,1
4,4,0,3,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,1,0,0,1


# Overfitting Example

In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

from math import sqrt

In [20]:
# Directory holding the example CSVs. Set the AI_DATASET_DIR environment
# variable to point somewhere else; the fallback is the original hard-coded
# location, so existing setups behave exactly as before.
DATASET_DIR = os.environ.get('AI_DATASET_DIR', 'C:/Users/kth25/ai_dataset')
ADMISSION_FILE_PATH = DATASET_DIR + '/admission_data.csv'
# Load the admissions table without the 'Serial No.' column
# (presumably just a row counter rather than a predictor -- confirm against the data).
admission_df = pd.read_csv(ADMISSION_FILE_PATH).drop(columns='Serial No.')

In [21]:
# Preview the predictors and the target ('Chance of Admit ') after loading.
admission_df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [24]:
# Feature matrix: every column except the target. The target's column name
# really does end with a space in this dataset ('Chance of Admit ').
X = admission_df.drop(['Chance of Admit '],axis=1)

# Expand the inputs into all polynomial terms up to degree 6. The resulting
# very wide feature matrix is what lets the unregularized model overfit.
polynomial_transformer = PolynomialFeatures(6)
polynomial_features = polynomial_transformer.fit_transform(X.values)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); prefer the new method and fall back to the
# old one so the cell runs on either side of that change.
if hasattr(polynomial_transformer, 'get_feature_names_out'):
    features = polynomial_transformer.get_feature_names_out(X.columns)
else:
    features = polynomial_transformer.get_feature_names(X.columns)

# Rebind X to the expanded features, labelled with the generated term names.
X = pd.DataFrame(polynomial_features, columns=features)
X.head()

Unnamed: 0,1,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,GRE Score^2,GRE Score TOEFL Score,...,LOR CGPA^2 Research^3,LOR CGPA Research^4,LOR Research^5,CGPA^6,CGPA^5 Research,CGPA^4 Research^2,CGPA^3 Research^3,CGPA^2 Research^4,CGPA Research^5,Research^6
0,1.0,337.0,118.0,4.0,4.5,4.5,9.65,1.0,113569.0,39766.0,...,419.05125,43.425,4.5,807539.696082,83682.87006,8671.800006,898.632125,93.1225,9.65,1.0
1,1.0,324.0,107.0,4.0,4.0,4.5,8.87,1.0,104976.0,34668.0,...,354.04605,39.915,4.5,487014.306256,54905.784245,6190.054594,697.864103,78.6769,8.87,1.0
2,1.0,316.0,104.0,3.0,3.0,3.5,8.0,1.0,99856.0,32864.0,...,224.0,28.0,3.5,262144.0,32768.0,4096.0,512.0,64.0,8.0,1.0
3,1.0,322.0,110.0,3.0,3.5,2.5,8.67,1.0,103684.0,35420.0,...,187.92225,21.675,2.5,424731.61094,48988.651781,5650.363527,651.714363,75.1689,8.67,1.0
4,1.0,314.0,103.0,2.0,2.0,3.0,8.21,0.0,98596.0,32342.0,...,0.0,0.0,0.0,306237.903347,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Target kept as a one-column DataFrame (note the trailing space in the name).
target_columns = ['Chance of Admit ']
y = admission_df[target_columns]
y.head()

Unnamed: 0,Chance of Admit
0,0.92
1,0.76
2,0.72
3,0.8
4,0.65


In [26]:
# Hold out 30% of the rows as a test set; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [27]:
# Plain least-squares regression with no regularization, fitted on the
# degree-6 polynomial features.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [28]:
# Predict on both splits so the train and test errors can be compared.
y_train_predict, y_test_predict = (
    model.predict(split_features) for split_features in (X_train, X_test)
)

In [30]:
# Print the RMSE (square root of the mean squared error) for the training
# split and then the test split, separated by one blank line.
evaluation_splits = (
    ('training set 에서의 성능', y_train, y_train_predict),
    ('test set 에서의 성능', y_test, y_test_predict),
)

for split_index, (heading, y_true, y_predicted) in enumerate(evaluation_splits):
    if split_index > 0:
        print()
    mse = mean_squared_error(y_true, y_predicted)
    print(heading)
    print('==========================')
    print(sqrt(mse))

training set 에서의 성능
0.0015048194399234639

test set 에서의 성능
5.090721961126577


# overfitting 해결하기 _ by L1 정규화

In [32]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures

from math import sqrt

In [20]:
# Directory holding the example CSVs. Set the AI_DATASET_DIR environment
# variable to point somewhere else; the fallback is the original hard-coded
# location, so existing setups behave exactly as before.
DATASET_DIR = os.environ.get('AI_DATASET_DIR', 'C:/Users/kth25/ai_dataset')
ADMISSION_FILE_PATH = DATASET_DIR + '/admission_data.csv'
# Reload the admissions table without the 'Serial No.' column
# (presumably just a row counter rather than a predictor -- confirm against the data).
admission_df = pd.read_csv(ADMISSION_FILE_PATH).drop(columns='Serial No.')

In [21]:
# Preview the predictors and the target ('Chance of Admit ') after reloading.
admission_df.head()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [24]:
# Feature matrix: every column except the target. The target's column name
# really does end with a space in this dataset ('Chance of Admit ').
X = admission_df.drop(['Chance of Admit '],axis=1)

# Expand the inputs into all polynomial terms up to degree 6, producing a very
# wide feature matrix for the regularized model to prune.
polynomial_transformer = PolynomialFeatures(6)
polynomial_features = polynomial_transformer.fit_transform(X.values)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); prefer the new method and fall back to the
# old one so the cell runs on either side of that change.
if hasattr(polynomial_transformer, 'get_feature_names_out'):
    features = polynomial_transformer.get_feature_names_out(X.columns)
else:
    features = polynomial_transformer.get_feature_names(X.columns)

# Rebind X to the expanded features, labelled with the generated term names.
X = pd.DataFrame(polynomial_features, columns=features)
X.head()

Unnamed: 0,1,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,GRE Score^2,GRE Score TOEFL Score,...,LOR CGPA^2 Research^3,LOR CGPA Research^4,LOR Research^5,CGPA^6,CGPA^5 Research,CGPA^4 Research^2,CGPA^3 Research^3,CGPA^2 Research^4,CGPA Research^5,Research^6
0,1.0,337.0,118.0,4.0,4.5,4.5,9.65,1.0,113569.0,39766.0,...,419.05125,43.425,4.5,807539.696082,83682.87006,8671.800006,898.632125,93.1225,9.65,1.0
1,1.0,324.0,107.0,4.0,4.0,4.5,8.87,1.0,104976.0,34668.0,...,354.04605,39.915,4.5,487014.306256,54905.784245,6190.054594,697.864103,78.6769,8.87,1.0
2,1.0,316.0,104.0,3.0,3.0,3.5,8.0,1.0,99856.0,32864.0,...,224.0,28.0,3.5,262144.0,32768.0,4096.0,512.0,64.0,8.0,1.0
3,1.0,322.0,110.0,3.0,3.5,2.5,8.67,1.0,103684.0,35420.0,...,187.92225,21.675,2.5,424731.61094,48988.651781,5650.363527,651.714363,75.1689,8.67,1.0
4,1.0,314.0,103.0,2.0,2.0,3.0,8.21,0.0,98596.0,32342.0,...,0.0,0.0,0.0,306237.903347,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Target kept as a one-column DataFrame (note the trailing space in the name).
target_columns = ['Chance of Admit ']
y = admission_df[target_columns]
y.head()

Unnamed: 0,Chance of Admit
0,0.92
1,0.76
2,0.72
3,0.8
4,0.65


In [26]:
# Hold out 30% of the rows as a test set; random_state pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

In [33]:
# max_iter caps how many iterations the solver may run (scikit-learn's Lasso
# is fitted by coordinate descent, not gradient descent).
# To use L2 regularization instead, swap Lasso for Ridge.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this call raises TypeError on newer versions. Confirm the
# installed version, or scale the features in a separate preprocessing step.

model = Lasso(alpha = 0.001, max_iter = 1000, normalize=True)
model.fit(X_train, y_train)

Lasso(alpha=0.001, normalize=True)

In [34]:
# Predict on both splits so the train and test errors can be compared.
y_train_predict, y_test_predict = (
    model.predict(split_features) for split_features in (X_train, X_test)
)

In [35]:
# Print the RMSE (square root of the mean squared error) for the training
# split and then the test split, separated by one blank line.
evaluation_splits = (
    ('training set 에서의 성능', y_train, y_train_predict),
    ('test set 에서의 성능', y_test, y_test_predict),
)

for split_index, (heading, y_true, y_predicted) in enumerate(evaluation_splits):
    if split_index > 0:
        print()
    mse = mean_squared_error(y_true, y_predicted)
    print(heading)
    print('==========================')
    print(sqrt(mse))

training set 에서의 성능
0.06336620966147143

test set 에서의 성능
0.060077190926892544


### 결과 정리!!

1. overfitting이 일어났을 때 : training set에 대한 성능은 월등히 뛰어나지만, test set에서의 성능은 현저히 낮아지는 것을 확인할 수 있다. (아래 수치는 RMSE이므로 값이 작을수록 성능이 좋다.)

training set 에서의 성능
==========================
*0.0015048194399234639*

test set 에서의 성능
==========================
*5.090721961126577*

2. Regularization을 적용했을 때 : training set에 대한 성능은 조금 낮아졌지만(RMSE 증가), test set에서의 성능은 월등히 높아진(RMSE 감소) 것을 확인할 수 있다.

training set 에서의 성능
==========================
*0.06336620966147143*

test set 에서의 성능
==========================
*0.060077190926892544*