## 원핫인코딩

pandas 라이브러리 사용

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small example frame with person, fruitname and color columns; the last
# row has missing values (NaN) for both fruitname and color.
fruit_markets = pd.DataFrame(
    dict(
        person=['suji', 'suji', 'jenny', 'dindin', 'jo'],
        fruitname=['apple', 'banana', 'cherry', 'durian', np.nan],
        color=['red', 'yellow', 'red', 'green', np.nan],
    )
)
fruit_markets

Unnamed: 0,person,fruitname,color
0,suji,apple,red
1,suji,banana,yellow
2,jenny,cherry,red
3,dindin,durian,green
4,jo,,


In [3]:
## Apply one-hot encoding to a single pandas Series
## (the NaN entry in row 4 gets no column of its own and encodes as all zeros)

fruit_name_onehot = pd.get_dummies(fruit_markets['fruitname'])
fruit_name_onehot

Unnamed: 0,apple,banana,cherry,durian
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,0,0,0,0


In [4]:
# One-hot encode the 'person' column; one indicator column per distinct name.
person_onehot = pd.get_dummies(fruit_markets['person'])
person_onehot

Unnamed: 0,dindin,jenny,jo,suji
0,0,0,0,1
1,0,0,0,1
2,0,1,0,0
3,1,0,0,0
4,0,0,1,0


In [5]:
# Encode with n-1 columns: drop_first=True drops the first category level
# ('green' here), so only 'red' and 'yellow' indicator columns remain.
# NOTE: a 'green' row and a NaN row both come out as all zeros in this form.
pd.get_dummies(fruit_markets['color'], drop_first=True)

Unnamed: 0,red,yellow
0,1,0
1,0,1
2,1,0
3,0,0
4,0,0


In [6]:
# Apply one-hot encoding to the whole DataFrame: each of the three columns
# here is expanded, and the new columns are named <column>_<value>.
fruit_markets_onehot = pd.get_dummies(fruit_markets)
fruit_markets_onehot

Unnamed: 0,person_dindin,person_jenny,person_jo,person_suji,fruitname_apple,fruitname_banana,fruitname_cherry,fruitname_durian,color_green,color_red,color_yellow
0,0,0,0,1,1,0,0,0,0,1,0
1,0,0,0,1,0,1,0,0,0,0,1
2,0,1,0,0,0,0,1,0,0,1,0
3,1,0,0,0,0,0,0,1,1,0,0
4,0,0,1,0,0,0,0,0,0,0,0


In [7]:
# Include missing values in the encoding: dummy_na=True adds a separate NaN
# indicator column, so the missing row is no longer encoded as all zeros.
pd.get_dummies(fruit_markets['color'], dummy_na=True)

Unnamed: 0,green,red,yellow,NaN
0,0,1,0,0
1,0,0,1,0
2,0,1,0,0
3,1,0,0,0
4,0,0,0,1


In [8]:
# Restore the original categories from the dummy (one-hot) columns.
# NOTE: this raises AttributeError on the pandas 1.4.4 kernel used here;
# per the pandas documentation, pd.from_dummies was added in pandas 1.5.0.
pd.from_dummies(person_onehot)

AttributeError: module 'pandas' has no attribute 'from_dummies'

In [11]:
# Check which pandas version the running kernel has imported.
pd.__version__

'1.4.4'

In [12]:
!pip uninstall pandas
!pip install pandas

^C


In [13]:
# Install a pinned pandas release (1.5.x; pd.from_dummies was added in 1.5.0
# per the pandas documentation).
# The running kernel keeps the already-imported pandas until it is restarted.
pip install pandas==1.5.2

Collecting pandas==1.5.2
  Downloading pandas-1.5.2-cp39-cp39-win_amd64.whl (10.9 MB)
     ---------------------------------------- 10.9/10.9 MB 9.3 MB/s eta 0:00:00
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.4.4
    Uninstalling pandas-1.4.4:
      Successfully uninstalled pandas-1.4.4
Successfully installed pandas-1.5.2
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.3.1 requires numpy<1.24,>=1.16.0, but you have numpy 1.24.3 which is incompatible.


In [16]:
# Re-check the version after the install. This still reports the old version
# while the kernel has not been restarted; restart it and re-import pandas.
pd.__version__

'1.4.4'

사이킷런을 이용한 원핫인코딩(머신러닝 모델에 넣을 때)

In [17]:
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore': per the scikit-learn documentation, categories not
# seen during fit are encoded as all zeros at transform time instead of
# raising an error.
oh_enc = OneHotEncoder(handle_unknown='ignore')

In [18]:
# Learn the set of categories for every column of the DataFrame.
oh_enc.fit(fruit_markets)

OneHotEncoder(handle_unknown='ignore')

In [19]:
# One array of learned categories per input column; note that NaN is kept
# as a category of its own for 'fruitname' and 'color'.
oh_enc.categories_

[array(['dindin', 'jenny', 'jo', 'suji'], dtype=object),
 array(['apple', 'banana', 'cherry', 'durian', nan], dtype=object),
 array(['green', 'red', 'yellow', nan], dtype=object)]

In [20]:


# transform() returns a sparse matrix here; printing it lists the
# (row, column) positions of the non-zero entries and their values.
print(oh_enc.transform(fruit_markets))

  (0, 3)	1.0
  (0, 4)	1.0
  (0, 10)	1.0
  (1, 3)	1.0
  (1, 5)	1.0
  (1, 11)	1.0
  (2, 1)	1.0
  (2, 6)	1.0
  (2, 10)	1.0
  (3, 0)	1.0
  (3, 7)	1.0
  (3, 9)	1.0
  (4, 2)	1.0
  (4, 8)	1.0
  (4, 12)	1.0


In [22]:
# Convert the sparse encoding to a dense NumPy array for inspection.
x_data = oh_enc.transform(fruit_markets).toarray()
x_data

array([[0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.]])

In [23]:
# 5 rows x 13 columns: 4 + 5 + 4 learned categories (NaN counted as one).
x_data.shape

(5, 13)

텐서플로우를 이용한 원핫인코딩(딥러닝)

In [24]:
# NOTE: incorrect module path - this import raises ModuleNotFoundError.
# Tokenizer is imported from tensorflow.keras.preprocessing.text instead.
from tensorflow.preprocessing.text import Tokenizer

ModuleNotFoundError: No module named 'tensorflow.preprocessing'

In [25]:
# Install TensorFlow via pip from the notebook.
!pip install tensorflow



In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [27]:
# Sample Korean sentence used as the text to tokenize and one-hot encode.
text = "곧 추석이 다가옵니다. 행복하고 맛있는 것들 많이 먹고 푹 쉬는 한가위 보내고 오세요"

In [28]:
# Build the vocabulary from the sample sentence; word_index maps each word
# to an integer index starting at 1.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
print('단어 집합 : ', tokenizer.word_index)

단어 집합 :  {'곧': 1, '추석이': 2, '다가옵니다': 3, '행복하고': 4, '맛있는': 5, '것들': 6, '많이': 7, '먹고': 8, '푹': 9, '쉬는': 10, '한가위': 11, '보내고': 12, '오세요': 13}


In [29]:
# Convert the sentence into its sequence of word indices (one list per text).
encoded = tokenizer.texts_to_sequences([text])
encoded

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]]

In [30]:
# One-hot encode the index sequence. Word indices run from 1 to 13, so the
# result has 14 columns (0..13) and column 0 is never set.
one_hot = to_categorical(encoded[0])
one_hot

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],
      dtype=float32)

In [31]:
# 13 words x 14 columns (indices 0..13; index 0 is unused by the word index).
one_hot.shape

(13, 14)