##### 정규화

In [1]:
from numpy import asarray
from sklearn.preprocessing import MinMaxScaler

# Toy dataset: five samples, two features on very different scales.
data = asarray([
    [100, 0.001],
    [8, 0.05],
    [50, 0.005],
    [88, 0.07],
    [4, 0.1],
])
print(data)

# MinMaxScaler with default settings rescales every feature (column) to the 0-1 range.
# Fitting learns the per-column minimum and maximum from the data.
scaler = MinMaxScaler().fit(data)

# Apply the learned rescaling to the same data and show the result.
scaled = scaler.transform(data)
print(scaled)

[[1.0e+02 1.0e-03]
 [8.0e+00 5.0e-02]
 [5.0e+01 5.0e-03]
 [8.8e+01 7.0e-02]
 [4.0e+00 1.0e-01]]
[[1.         0.        ]
 [0.04166667 0.49494949]
 [0.47916667 0.04040404]
 [0.875      0.6969697 ]
 [0.         1.        ]]


##### 표준화

In [2]:
from numpy import asarray
from sklearn.preprocessing import StandardScaler

# Toy dataset: five samples, two features on very different scales.
data = asarray([
    [100, 0.001],
    [8, 0.05],
    [50, 0.005],
    [88, 0.07],
    [4, 0.1],
])
print(data)

# StandardScaler shifts every feature (column) to mean 0 and scales it to standard deviation 1.
# Fitting learns the per-column mean and standard deviation from the data.
scaler = StandardScaler().fit(data)

# Apply the learned standardization to the same data and show the result.
scaled = scaler.transform(data)
print(scaled)

[[1.0e+02 1.0e-03]
 [8.0e+00 5.0e-02]
 [5.0e+01 5.0e-03]
 [8.8e+01 7.0e-02]
 [4.0e+00 1.0e-01]]
[[ 1.26398112 -1.16389967]
 [-1.06174414  0.12639634]
 [ 0.         -1.05856939]
 [ 0.96062565  0.65304778]
 [-1.16286263  1.44302493]]


##### 소나 데이터 — 기준 모델 (스케일링 없음)

```
KNN 정확도 (10-겹 층화 교차 검증, 3회 반복 평균)
 - 79.7%
```

In [3]:
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Sonar dataset: the CSV has no header row, so columns are numbered automatically.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"
dataset = read_csv(url, header=None)

# Every column except the last is an input feature; the last column is the class label.
data = dataset.values
X = data[:, :-1].astype('float32')
y = data[:, -1].astype('str')

# Encode the string class labels as integers for the classifier.
y = LabelEncoder().fit_transform(y)

# Baseline model: k-nearest neighbours on the raw, unscaled features.
model = KNeighborsClassifier()

# Repeated stratified 10-fold cross-validation (3 repeats, fixed seed for reproducibility).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    model,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,            # run the folds in parallel on all available cores
    error_score='raise',  # fail loudly if any fold cannot be fitted or scored
)

# Report the mean and standard deviation of accuracy across all folds.
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.797 (0.073)


##### 소나 데이터 — 정규화 (MinMaxScaler) 적용

```
정규화 후 KNN 정확도 (10-겹 층화 교차 검증, 3회 반복 평균)
 - 81.3%
```

In [4]:
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

# Sonar dataset: the CSV has no header row, so columns are numbered automatically.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"
dataset = read_csv(url, header=None)

# Every column except the last is an input feature; the last column is the class label.
data = dataset.values
X = data[:, :-1].astype('float32')
y = data[:, -1].astype('str')

# Encode the string class labels as integers for the classifier.
y = LabelEncoder().fit_transform(y)

# Normalization: rescale every feature to the default 0-1 range before KNN.
# Wrapping scaler and model in a Pipeline means the scaler is re-fitted on the
# training portion of each fold only, so no information leaks from the held-out fold.
trans = MinMaxScaler()
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])

# Repeated stratified 10-fold cross-validation (3 repeats, fixed seed for reproducibility).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,            # run the folds in parallel on all available cores
    error_score='raise',  # fail loudly if any fold cannot be fitted or scored
)

# Report the mean and standard deviation of accuracy across all folds.
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.813 (0.085)


##### 소나 데이터 — 표준화 (StandardScaler) 적용

```
표준화 후 KNN 정확도 (10-겹 층화 교차 검증, 3회 반복 평균)
 - 81.0%
```

In [5]:
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Sonar dataset: the CSV has no header row, so columns are numbered automatically.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"
dataset = read_csv(url, header=None)

# Every column except the last is an input feature; the last column is the class label.
data = dataset.values
X = data[:, :-1].astype('float32')
y = data[:, -1].astype('str')

# Encode the string class labels as integers for the classifier.
y = LabelEncoder().fit_transform(y)

# Standardization: shift every feature to mean 0 and scale to standard deviation 1 before KNN.
# Wrapping scaler and model in a Pipeline means the scaler is re-fitted on the
# training portion of each fold only, so no information leaks from the held-out fold.
trans = StandardScaler()
model = KNeighborsClassifier()
pipeline = Pipeline(steps=[('t', trans), ('m', model)])

# Repeated stratified 10-fold cross-validation (3 repeats, fixed seed for reproducibility).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    pipeline,
    X,
    y,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,            # run the folds in parallel on all available cores
    error_score='raise',  # fail loudly if any fold cannot be fitted or scored
)

# Report the mean and standard deviation of accuracy across all folds.
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.810 (0.080)
