In [1]:
import os
from os.path import join
import copy
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and FutureWarning
# messages raised by the libraries used below, so API behavior changes can go
# unnoticed.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import sklearn

import matplotlib.pyplot as plt

# Locations of the abalone data file and of the file that lists its column
# names, both relative to the current working directory.
abalone_path = join('data','abalone.txt')
column_path = join('data','abalone_attributes.txt')

# Read one column name per line, stripping surrounding whitespace/newlines.
# The context manager closes the file handle deterministically instead of
# leaving an open handle behind for the garbage collector.
with open(column_path) as column_file:
    abalone_columns = [line.strip() for line in column_file]

In [2]:
# Load the abalone table. header=None tells pandas the file has no header row,
# so the column names are taken from abalone_columns instead.
data = pd.read_csv(abalone_path, header=None, names=abalone_columns)
label = data['Sex']
# The 'Sex' column is pulled out here as a separate label.

In [3]:
# Preview the first rows of the loaded table (the 'Sex' column is still present).
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [4]:
# (number of rows, number of columns) of the loaded table.
data.shape

(4177, 9)

In [5]:
# Remove the categorical 'Sex' column (already saved in `label`) so that only
# the numeric columns remain for scaling.
# drop() returns a new frame instead of mutating in place, and
# errors='ignore' keeps this cell re-runnable: executing it a second time is a
# no-op rather than a KeyError.
data = data.drop(columns='Sex', errors='ignore')

In [6]:
data.describe()
# describe() summarizes each variable with basic statistics: count, mean,
# standard deviation, min, max and quartiles.

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [7]:
data.info()
# info() is used here to check each column's dtype and non-null count.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Length          4177 non-null   float64
 1   Diameter        4177 non-null   float64
 2   Height          4177 non-null   float64
 3   Whole weight    4177 non-null   float64
 4   Shucked weight  4177 non-null   float64
 5   Viscera weight  4177 non-null   float64
 6   Shell weight    4177 non-null   float64
 7   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1)
memory usage: 261.2 KB


In [8]:
# Min-max scaling maps every feature into the range 0..1; in scikit-learn the
# equivalent MinMaxScaler lives in the sklearn.preprocessing package.
#
# Manual version, kept for comparison with the scaler used below.
# DataFrame.min()/max() reduce column by column, which is what per-feature
# scaling needs. np.min(data)/np.max(data) forward axis=None to pandas, and
# pandas >= 2.0 then reduces over the whole frame to a single scalar, so every
# column would be scaled by the global min/max instead of its own.
col_min = data.min()
col_max = data.max()

# The result is stored under its own name so the raw `data` stays intact: the
# scaler cells below are fitted on the unscaled values, as their recorded
# outputs show.
manual_scaled_data = (data - col_min) / (col_max - col_min)

In [9]:
# 1) Import the model class and create an instance
from sklearn.preprocessing import MinMaxScaler # Mind the spelling: "Scaler" ends in "ler", not "lar".
mMscaler = MinMaxScaler() 

In [10]:
# 2) Learn the scaling statistics (per-feature min and max) from the data
mMscaler.fit(data)
# Preview `data` after fitting; the recorded output still shows the unscaled values.
data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [11]:
# 3) Transform the data
# 3-1 As a separate step after fit(): mMscaled_data=mMscaler.transform(data)
# 3-2 Or learn the statistics and transform in a single call:
mMscaled_data = mMscaler.fit_transform(data) # Make a habit of checking the array type involved (NumPy). NOTE(review): presumably the result is a NumPy array, since the next cell wraps it in a DataFrame - confirm.


In [12]:
# 4) Inspect the result
# Wrap the scaled values in a DataFrame that reuses both the column labels and
# the row index of `data`, so scaled rows stay aligned with the original rows
# even if `data` ever carries a non-default index.
mMscaled_data = pd.DataFrame(mMscaled_data, index=data.index, columns=data.columns)
mMscaled_data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.147982,0.5
1,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.068261,0.214286
2,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.207773,0.285714
3,0.493243,0.521008,0.110619,0.182044,0.14425,0.14944,0.152965,0.321429
4,0.344595,0.336134,0.070796,0.071897,0.059516,0.05135,0.053313,0.214286


In [13]:
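# 2. Standard scaling: standardize each feature to zero mean and unit variance (z-score). This rescales the values but does not change the shape of their distribution.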

In [14]:
# 1. Import the model class and create an instance
from sklearn.preprocessing import StandardScaler
sdscaler = StandardScaler()

In [15]:
# 2. Learn the scaling statistics (per-feature mean and standard deviation) from the data
sdscaler.fit(data)

StandardScaler()

In [16]:
# 3. Transform the data using the statistics learned by fit()
sdscaled_data = sdscaler.transform(data)

In [17]:
# 4. Inspect the result
# NOTE(review): this previews the original `data`, which the recorded output
# shows unchanged; the standardized values are displayed in a later cell.
data.head()

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [23]:
# Wrap the standardized values in a DataFrame that reuses both the column
# labels and the row index of `data`, so standardized rows stay aligned with
# the original rows even if `data` ever carries a non-default index.
sdscaled_data = pd.DataFrame(sdscaled_data, index=data.index, columns=data.columns)

In [19]:
sdscaled_data.head() # Unlike min-max scaling, standard scaling can produce negative values.

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,-0.574558,-0.432149,-1.064424,-0.641898,-0.607685,-0.726212,-0.638217,1.571544
1,-1.448986,-1.439929,-1.183978,-1.230277,-1.17091,-1.205221,-1.212987,-0.910013
2,0.050033,0.12213,-0.107991,-0.309469,-0.4635,-0.35669,-0.207139,-0.289624
3,-0.699476,-0.432149,-0.347099,-0.637819,-0.648238,-0.6076,-0.602294,0.020571
4,-1.615544,-1.540707,-1.423087,-1.272086,-1.215968,-1.287337,-1.320757,-0.910013


NameError: name 's' is not defined

### 