# Data Encoding
- Norminal Encoding : 순서정보가 있는 데이터
- Ordinal Encoding : 순서정보가 없는 데이터

In [1]:
!pip install category_encoders



In [2]:
import numpy as np
import pandas as pd

import category_encoders as ce

## Norminal Encoding

### One Hot Encoding
> 피처의 항목이 많은 경우 차원의 저주에 빠질 수 있다.

In [3]:
data = {'color': ['Red','Blue','Green']}
df = pd.DataFrame(data)
df.head()

Unnamed: 0,color
0,Red
1,Blue
2,Green


category_encoders을 이용한 인코딩

In [4]:
encoder = ce.OneHotEncoder(use_cat_names=True)

df_encoded = encoder.fit_transform(df)

df_encoded.head()

Unnamed: 0,color_Red,color_Blue,color_Green
0,1,0,0
1,0,1,0
2,0,0,1


### Mean Encoding

In [5]:
data = {'Pincode': ['753001', '753002', '753003', '753001', '753004', '753002', '753002', '753001', '753003']
        , 'O/P': [1, 1, 0, 0, 1, 0, 1, 0, 1]}

df = pd.DataFrame(data)
df

Unnamed: 0,Pincode,O/P
0,753001,1
1,753002,1
2,753003,0
3,753001,0
4,753004,1
5,753002,0
6,753002,1
7,753001,0
8,753003,1


In [6]:
group_mean = df.groupby('Pincode')['O/P'].mean()
group_mean

Pincode
753001    0.333333
753002    0.666667
753003    0.500000
753004    1.000000
Name: O/P, dtype: float64

In [7]:
df['mean'] = df['Pincode'].map(group_mean)
df.head()

Unnamed: 0,Pincode,O/P,mean
0,753001,1,0.333333
1,753002,1,0.666667
2,753003,0,0.5
3,753001,0,0.333333
4,753004,1,1.0


## Ordinal Encoding

### Label Encoding

In [8]:
data = {'column': ['Btech', 'Masters', 'High School', 'PHD']}
df=pd.DataFrame(data)
df.head()

Unnamed: 0,column
0,Btech
1,Masters
2,High School
3,PHD


 sklearn을 이용한 인코딩

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
encoder = LabelEncoder()

df['column_encoded'] = encoder.fit_transform(df['column'])
df.head()

Unnamed: 0,column,column_encoded
0,Btech,0
1,Masters,2
2,High School,1
3,PHD,3


In [11]:
encoder.classes_   #전체 label 정보

array(['Btech', 'High School', 'Masters', 'PHD'], dtype=object)

In [12]:
encoder.inverse_transform([0,1])  

array(['Btech', 'High School'], dtype=object)

### Target Encoding

In [13]:
data = {'Column': ['Btech', 'PHD', 'Masters', 'High School', 'PHD', 'Btech', 'Masters', 'High School', 'High School']
        , 'O/P': [1, 1, 0, 0, 1, 0, 0, 0, 1]}

df = pd.DataFrame(data)
df

Unnamed: 0,Column,O/P
0,Btech,1
1,PHD,1
2,Masters,0
3,High School,0
4,PHD,1
5,Btech,0
6,Masters,0
7,High School,0
8,High School,1


In [14]:
encoder = ce.TargetEncoder()

df_encoded = encoder.fit_transform(df['Column'],df['O/P']) #target data도 입력
df['encoded'] = df_encoded['Column']
df['Rank'] = df['encoded'].rank(method='dense',ascending=False)

df


Unnamed: 0,Column,O/P,encoded,Rank
0,Btech,1,0.452325,2.0
1,PHD,1,0.523251,1.0
2,Masters,0,0.3814,4.0
3,High School,0,0.427282,3.0
4,PHD,1,0.523251,1.0
5,Btech,0,0.452325,2.0
6,Masters,0,0.3814,4.0
7,High School,0,0.427282,3.0
8,High School,1,0.427282,3.0


### Ordinal Encoding

In [15]:
df = pd.DataFrame(
    {'Fruit': ['시과', '딸기', '바나나', '수박', '포도',
               '메론','자두','체리','화이트베리', '무화과'],
     'color':['red1','red2','yellow','red','purple','green','light red','pink','white','brown'],
     'price': [2000,300,400, 30000, 150, 8000,1000,100,300,800]})

df.head()

Unnamed: 0,Fruit,color,price
0,시과,red1,2000
1,딸기,red2,300
2,바나나,yellow,400
3,수박,red,30000
4,포도,purple,150


In [16]:
encoder = ce.OrdinalEncoder(cols = 'color')

df_encoded = encoder.fit_transform(df)
df_encoded.head()

Unnamed: 0,Fruit,color,price
0,시과,1,2000
1,딸기,2,300
2,바나나,3,400
3,수박,4,30000
4,포도,5,150


## Example

In [17]:
# 결과 저장
result = []

In [18]:
#모델 정의
from sklearn.tree import DecisionTreeClassifier
SEED = 42

In [19]:
# 데이터 로드
import seaborn as sns

df = sns.load_dataset('titanic')
cols = ["age","sibsp","parch","fare","pclass","sex","embarked", "survived"]
df=df[cols]
df.shape

(891, 8)

In [20]:
# 데이터 분리
from sklearn.model_selection import train_test_split
SEED = 42

train , test = train_test_split(df, random_state=SEED,test_size=0.2)

train.shape , test.shape

((712, 8), (179, 8))

In [21]:
#결측치 제거
train.age = train.age.fillna(train.age.mean())
test.age = test.age.fillna(train.age.mean())

train['embarked'] = train.embarked.fillna(train.embarked.mode().values[0])
test.embarked = test.embarked.fillna(train.embarked.mode().values[0])

train.isnull().sum().sum()

0

In [22]:
cols = ['age','fare']
features_tr = train[cols]
target_tr = train['survived']
features_te = test[cols]
target_te = test['survived']

features_tr.shape, target_tr.shape

((712, 2), (712,))

In [23]:
cols_encoding = ['pclass','sex','embarked','sibsp','parch']
tmp_tr = train[cols_encoding]
tmp_te = test[cols_encoding]

tmp_tr.shape

(712, 5)

In [24]:
tmp_tr.head()

Unnamed: 0,pclass,sex,embarked,sibsp,parch
331,1,male,S,0,0
733,2,male,S,0,0
382,3,male,S,0,0
704,3,male,S,1,0
813,3,female,S,4,2


In [25]:
tmp_tr['sex'] = tmp_tr['sex'].map({'male':1,'female':0})
tmp_tr['embarked'] = tmp_tr['embarked'].map({'S':2,'C':1,'Q':0})


tmp_te['sex'] = tmp_te['sex'].map({'male':1, 'female':0})
tmp_te['embarked'] = tmp_te['embarked'].map({'S':2, 'C':1, 'Q':0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_tr['sex'] = tmp_tr['sex'].map({'male':1,'female':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_tr['embarked'] = tmp_tr['embarked'].map({'S':2,'C':1,'Q':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_te['sex'] = tmp_te['sex'].map({'male':1, 'female':0})
A value is trying to be s

In [26]:
tmp_tr.head()

Unnamed: 0,pclass,sex,embarked,sibsp,parch
331,1,1,2,0,0
733,2,1,2,0,0
382,3,1,2,0,0
704,3,1,2,1,0
813,3,0,2,4,2


In [27]:
#encoding 전에 결측치 확인 필수
tmp_tr.isnull().sum().sum() , tmp_te.isnull().sum().sum() 

(0, 0)

### OneHotEncoding

In [28]:
encoder = ce.OneHotEncoder(use_cat_names=True)

In [29]:
enco_tr = pd.DataFrame()
enco_te = pd.DataFrame()

for col in tmp_tr.columns:
    _enco = encoder.fit_transform(tmp_tr[col].astype('category'))
    enco_tr = pd.concat([enco_tr,_enco],axis=1)
    
    _enco = encoder.transform(tmp_te[col].astype('category'))
    enco_te = pd.concat([enco_te,_enco],axis=1)
    
print(f'{enco_tr.shape} / {enco_te.shape}')
enco_tr.head()

(712, 22) / (179, 22)


Unnamed: 0,pclass_1.0,pclass_2.0,pclass_3.0,sex_1.0,sex_0.0,embarked_2.0,embarked_1.0,embarked_0.0,sibsp_0.0,sibsp_1.0,...,sibsp_2.0,sibsp_8.0,sibsp_5.0,parch_0.0,parch_2.0,parch_1.0,parch_6.0,parch_4.0,parch_3.0,parch_5.0
331,1,0,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
733,0,1,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
382,0,0,1,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
704,0,0,1,1,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
813,0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [30]:
#concat 하기 전에 index 정리
features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)

enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

In [31]:
features_tr.shape , enco_tr.shape

((712, 2), (712, 22))

In [40]:
df_tr = pd.concat([features_tr,enco_tr],axis=1)
df_te = pd.concat([features_te,enco_te],axis=1)

print(f'{df_tr.shape} / {df_te.shape}')


(712, 24) / (179, 24)


In [41]:
df_tr.head()

Unnamed: 0,age,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_1.0,sex_0.0,embarked_2.0,embarked_1.0,embarked_0.0,...,sibsp_2.0,sibsp_8.0,sibsp_5.0,parch_0.0,parch_2.0,parch_1.0,parch_6.0,parch_4.0,parch_3.0,parch_5.0
0,45.5,28.5,1,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,23.0,13.0,0,1,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,32.0,7.925,0,0,1,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,26.0,7.8542,0,0,1,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,6.0,31.275,0,0,1,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [45]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr, target_tr)
te_score = model.score(df_te, target_te)

tr_score, te_score

(0.9803370786516854, 0.776536312849162)