##### 순서 인코딩

In [1]:
from numpy import asarray
from sklearn.preprocessing import OrdinalEncoder

# Toy dataset: a single categorical column holding three colour labels.
data = asarray([['red'], ['green'], ['blue']])
print(data)

# Learn the category -> number mapping from the data first ...
encoder = OrdinalEncoder().fit(data)

# ... then apply it; each label is replaced by a float code
# (the printed result shows blue=0, green=1, red=2).
result = encoder.transform(data)
print(result)

[['red']
 ['green']
 ['blue']]
[[2.]
 [1.]
 [0.]]


##### 순서 인코딩

```
유방암 데이터
 - 정확도 : 75.79%
```

In [4]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score

# Location of the breast cancer CSV.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"

# Read the CSV with no header row and take the underlying array.
dataset = read_csv(url, header=None)
data = dataset.values

# Every column except the last is an input feature; the last column is the
# target. Both are cast to str before encoding.
X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Raw (string-valued) training rows, before any encoding.
print(X_train[:5, :])

# Ordinal-encode the input features. The encoder is fitted on the training
# split only and then applied to both splits.
ordinal_encoder = OrdinalEncoder()
X_train = ordinal_encoder.fit(X_train).transform(X_train)
X_test = ordinal_encoder.transform(X_test)

# Training rows after ordinal encoding.
print(X_train[:5, :])

# Label-encode the target (this is LabelEncoder, not the ordinal encoder),
# again fitted on the training split only.
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Fit a logistic regression classifier on the encoded training data.
model = LogisticRegression().fit(X_train, y_train)

# Predict labels for the held-out rows.
yhat = model.predict(X_test)

# Report accuracy on the test split as a percentage.
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

[["'50-59'" "'ge40'" "'25-29'" "'0-2'" "'no'" "'1'" "'left'"
  "'right_low'" "'no'"]
 ["'30-39'" "'premeno'" "'5-9'" "'0-2'" "'no'" "'2'" "'left'"
  "'right_low'" "'no'"]
 ["'50-59'" "'premeno'" "'50-54'" "'0-2'" "'yes'" "'2'" "'right'"
  "'left_up'" "'yes'"]
 ["'30-39'" "'premeno'" "'25-29'" "'0-2'" "'no'" "'1'" "'left'"
  "'central'" "'no'"]
 ["'40-49'" "'premeno'" "'20-24'" "'0-2'" "'no'" "'2'" "'right'"
  "'right_up'" "'no'"]]
[[ 3.  0.  4.  0.  0.  0.  0.  3.  0.]
 [ 1.  2.  9.  0.  0.  1.  0.  3.  0.]
 [ 3.  2. 10.  0.  1.  1.  1.  2.  1.]
 [ 1.  2.  4.  0.  0.  0.  0.  0.  0.]
 [ 2.  2.  3.  0.  0.  1.  1.  4.  0.]]
Accuracy: 75.79


##### 원-핫 인코딩

```
유방암 데이터
 - 정확도 : 70.53%
```

In [8]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# Location of the breast cancer CSV.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv"

# Load the dataset; the file is read with no header row.
dataset = read_csv(url, header=None)

# Split into input features (all but the last column) and target (last column),
# both cast to str before encoding.
data = dataset.values

X = data[:, :-1].astype(str)
y = data[:, -1].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# Raw (string-valued) training rows, before any encoding.
print(X_train[:5, :])

# One-hot encode the input features and return a dense array.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current name first and fall back to the legacy
# keyword on older installations (an unknown keyword raises TypeError).
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(X_train)

X_train = onehot_encoder.transform(X_train)
X_test = onehot_encoder.transform(X_test)

# Training rows after one-hot encoding.
print(X_train[:5, :])

# Label-encode the target, fitted on the training split only.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)

y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Fit a logistic regression classifier on the encoded training data.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict labels for the held-out rows.
yhat = model.predict(X_test)

# Report accuracy on the test split as a percentage.
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

[["'50-59'" "'ge40'" "'25-29'" "'0-2'" "'no'" "'1'" "'left'"
  "'right_low'" "'no'"]
 ["'30-39'" "'premeno'" "'5-9'" "'0-2'" "'no'" "'2'" "'left'"
  "'right_low'" "'no'"]
 ["'50-59'" "'premeno'" "'50-54'" "'0-2'" "'yes'" "'2'" "'right'"
  "'left_up'" "'yes'"]
 ["'30-39'" "'premeno'" "'25-29'" "'0-2'" "'no'" "'1'" "'left'"
  "'central'" "'no'"]
 ["'40-49'" "'premeno'" "'20-24'" "'0-2'" "'no'" "'2'" "'right'"
  "'right_up'" "'no'"]]
[[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1. 