## 1. Scaling
-  Abalone(전복) 데이터를 가져와 두 가지 Scaling을 실행
    - 1.1. Min-Max Scaling
    - 1.2. Standard Scaling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn

import os 
from os.path import join

### Abalone Data 준비 과정

In [2]:
# Build the input file paths with join(directory, filename).
# '.' refers to the current working directory.
column_path = join('.', 'abalone_attributes.txt')  # attribute (column) names
abalone_path = join('.', 'abalone.txt')  # raw data

In [3]:
# Read the column (attribute) names, one per line, from column_path.
# The `with` block closes the file handle deterministically instead of
# leaving it open until garbage collection.
with open(column_path) as column_file:
    # strip() removes surrounding whitespace such as the trailing '\n';
    # blank lines are skipped so they cannot become empty column names.
    abalone_columns = [line.strip() for line in column_file if line.strip()]

abalone_columns

['Sex',
 'Length',
 'Diameter',
 'Height',
 'Whole weight',
 'Shucked weight',
 'Viscera weight',
 'Shell weight',
 'Rings']

In [4]:
# Load the raw data file into a DataFrame.
#   header=None : the file has no header row
#   names       : column labels to assign
data = pd.read_csv(
    abalone_path,
    names=abalone_columns,
    header=None,
)
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
# Move the 'Sex' column out of the DataFrame: pop() returns the column as a
# Series (kept in `label`) and removes it from `data` in place.
label = data.pop('Sex')

In [6]:
# describe(): show summary statistics per column (count, mean, std, min, quartiles, max)
data.describe()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [7]:
# info(): show per-column information (non-null count, dtype)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Length          4177 non-null   float64
 1   Diameter        4177 non-null   float64
 2   Height          4177 non-null   float64
 3   Whole weight    4177 non-null   float64
 4   Shucked weight  4177 non-null   float64
 5   Viscera weight  4177 non-null   float64
 6   Shell weight    4177 non-null   float64
 7   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1)
memory usage: 261.2 KB


### 1.1. Min-Max Scaling 부분
    - sklearn의 MinMaxScaler() 내장함수 "fit_transform()" 사용
        [1] fit() : 각 column의 최솟값/최댓값을 데이터로부터 계산(학습)
        [2] transform() : fit()에서 구한 값으로 실제로 변환
        [3] pd.DataFrame()으로 변환

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling maps every column onto [0, 1] independently:
#     x_scaled = (x - column_min) / (column_max - column_min)
mMscaler = MinMaxScaler()

# Step 1 - fit(): learn each column's minimum and maximum from `data`.
mMscaler.fit(data)

# Step 2 - transform(): apply the learned scaling; the result is a NumPy
# array. (fit_transform(data) would perform both steps in a single call.)
mScaled_data = mMscaler.transform(data)
mScaled_data

array([[0.51351351, 0.5210084 , 0.0840708 , ..., 0.1323239 , 0.14798206,
        0.5       ],
       [0.37162162, 0.35294118, 0.07964602, ..., 0.06319947, 0.06826109,
        0.21428571],
       [0.61486486, 0.61344538, 0.11946903, ..., 0.18564845, 0.2077728 ,
        0.28571429],
       ...,
       [0.70945946, 0.70588235, 0.18141593, ..., 0.37788018, 0.30543099,
        0.28571429],
       [0.74324324, 0.72268908, 0.13274336, ..., 0.34298881, 0.29347285,
        0.32142857],
       [0.85810811, 0.84033613, 0.17256637, ..., 0.49506254, 0.49177877,
        0.39285714]])

In [9]:
# Wrap the scaled NumPy array in a DataFrame, reusing the original column
# labels, then preview the first five rows.
mScaled_data = pd.DataFrame(data=mScaled_data, columns=data.columns)
mScaled_data.head(n=5)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.147982,0.5
1,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.068261,0.214286
2,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.207773,0.285714
3,0.493243,0.521008,0.110619,0.182044,0.14425,0.14944,0.152965,0.321429
4,0.344595,0.336134,0.070796,0.071897,0.059516,0.05135,0.053313,0.214286


### 1.2. Standard Scaling 부분
    - 평균 0, 표준편차 1의 정규화
    - sklearn의 StandardScaler() 내장함수 "fit_transform()" 사용
        [1] fit() : 각 column의 평균/표준편차를 데이터로부터 계산(학습)
        [2] transform() : fit()에서 구한 값으로 실제로 변환
        [3] pd.DataFrame()으로 변환

In [10]:
from sklearn.preprocessing import StandardScaler

# Standardize every column to zero mean and unit variance.
# fit() learns each column's mean and standard deviation and returns the
# scaler itself; transform() then applies them and yields a NumPy array.
sdScaler = StandardScaler().fit(data)
sdScaled_data = sdScaler.transform(data)

In [11]:
# Wrap the standardized NumPy array in a DataFrame, reusing the original
# column labels, then preview the first five rows.
sdScaled_data = pd.DataFrame(data=sdScaled_data, columns=data.columns)
sdScaled_data.head(n=5)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,-0.574558,-0.432149,-1.064424,-0.641898,-0.607685,-0.726212,-0.638217,1.571544
1,-1.448986,-1.439929,-1.183978,-1.230277,-1.17091,-1.205221,-1.212987,-0.910013
2,0.050033,0.12213,-0.107991,-0.309469,-0.4635,-0.35669,-0.207139,-0.289624
3,-0.699476,-0.432149,-0.347099,-0.637819,-0.648238,-0.6076,-0.602294,0.020571
4,-1.615544,-1.540707,-1.423087,-1.272086,-1.215968,-1.287337,-1.320757,-0.910013
