In [20]:
!pip install category_encoders



In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import category_encoders as ce

# 데이터 인코딩

# One hot Encoding

In [3]:
data = {'color': ['Red', 'Blue', 'Green']}
df = pd.DataFrame(data) 
df.head()

Unnamed: 0,color
0,Red
1,Blue
2,Green


#### category_encoders를 사용한 인코딩

In [4]:
encoder = ce.OneHotEncoder(use_cat_names=True) #  0 아니면 1로 바꾸는 기법

df_encoded = encoder.fit_transform(df)

df_encoded.head()

Unnamed: 0,color_Red,color_Blue,color_Green
0,1,0,0
1,0,1,0
2,0,0,1


# Mean Encoding

In [5]:
data = {'Pincode': ['753001', '753002', '753003', '753001', '753004', '753002', '753002', '753001', '753003'], 
        'O/P': [1, 1, 0, 0, 1, 0, 1, 0, 1]}
df = pd.DataFrame(data) 
df

Unnamed: 0,Pincode,O/P
0,753001,1
1,753002,1
2,753003,0
3,753001,0
4,753004,1
5,753002,0
6,753002,1
7,753001,0
8,753003,1


In [6]:
group_mean = df.groupby('Pincode')['O/P'].mean() # 평균 값으로 인코딩
group_mean #groupby = ['값을 그룹으로 묶음']

Pincode
753001    0.333333
753002    0.666667
753003    0.500000
753004    1.000000
Name: O/P, dtype: float64

In [7]:
df['Mean'] = df['Pincode'].map(group_mean)
df.head()

Unnamed: 0,Pincode,O/P,Mean
0,753001,1,0.333333
1,753002,1,0.666667
2,753003,0,0.5
3,753001,0,0.333333
4,753004,1,1.0


# Ordinal Encoding

In [8]:
data = {'column': ['Btech', 'Masters', 'High School', 'PHD']}
df = pd.DataFrame(data) 
df.head()

Unnamed: 0,column
0,Btech
1,Masters
2,High School
3,PHD


In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
encoder = LabelEncoder()

df['column_encoded'] = encoder.fit_transform(df['column'])

df.head()

Unnamed: 0,column,column_encoded
0,Btech,0
1,Masters,2
2,High School,1
3,PHD,3


In [11]:
encoder.classes_

array(['Btech', 'High School', 'Masters', 'PHD'], dtype=object)

In [12]:
encoder.inverse_transform([0,1]) # inverse?

array(['Btech', 'High School'], dtype=object)

# Target Encoding

In [13]:
data = {'Column': ['Btech', 'PHD', 'Masters', 'High School', 'PHD', 'Btech', 'Masters', 'High School', 'High School'], 
        'O/P': [1, 1, 0, 0, 1, 0, 0, 0, 1]}
df = pd.DataFrame(data) 
df

Unnamed: 0,Column,O/P
0,Btech,1
1,PHD,1
2,Masters,0
3,High School,0
4,PHD,1
5,Btech,0
6,Masters,0
7,High School,0
8,High School,1


In [14]:
encoder = ce.TargetEncoder() # (각각의 순위를 매긴 뒤 높은 순위부터 낮은 순위대로 1~@값 부여)

df_encoded = encoder.fit_transform(df['Column'], df['O/P'])
df['encoded'] = df_encoded['Column']
df['Rank'] = df['encoded'].rank(method='dense', ascending=False)

df

Unnamed: 0,Column,O/P,encoded,Rank
0,Btech,1,0.452325,2.0
1,PHD,1,0.523251,1.0
2,Masters,0,0.3814,4.0
3,High School,0,0.427282,3.0
4,PHD,1,0.523251,1.0
5,Btech,0,0.452325,2.0
6,Masters,0,0.3814,4.0
7,High School,0,0.427282,3.0
8,High School,1,0.427282,3.0


# Ordinal Encoding

In [15]:
df = pd.DataFrame( # 대충 값 큰 것을 체크.
    {'Fruit': ['시과', '딸기', '바나나', '수박', '포도',
               '메론','자두','체리','화이트베리', '무화과'],
     'color':['red1','red2','yellow','red','purple','green','light red','pink','white','brown'],
     'price': [2000,300,400, 30000, 150, 8000,1000,100,300,800]})

df.head()

Unnamed: 0,Fruit,color,price
0,시과,red1,2000
1,딸기,red2,300
2,바나나,yellow,400
3,수박,red,30000
4,포도,purple,150


In [16]:
encoder = ce.OrdinalEncoder(cols = 'color')

df_encoded = encoder.fit_transform(df)
df_encoded.head()

Unnamed: 0,Fruit,color,price
0,시과,1,2000
1,딸기,2,300
2,바나나,3,400
3,수박,4,30000
4,포도,5,150


# 예제

#### 결과 저장

In [17]:
results = []

In [18]:
from sklearn.tree import DecisionTreeClassifier
SEED = 42

In [19]:
import seaborn as sns

df = sns.load_dataset('titanic')
cols = ["age","sibsp","parch","fare","pclass","sex","embarked", "survived"]
df = df[cols]
df.shape 

(891, 8)

In [22]:
from sklearn.model_selection import train_test_split
SEED = 42

train, test = train_test_split(df, random_state=SEED, test_size=0.2)

train.shape, test.shape

((712, 8), (179, 8))

In [24]:
train.age = train.age.fillna(train.age.mean())
test.age = test.age.fillna(test.age.mean())

train.embarked = train.embarked.fillna(train.embarked.mode().values[0]) # mode: 최빈값
test.embarked = test.embarked.fillna(test.embarked.mode().values[0])
# train['embarked'] 역시 말이 됨

train.isnull().sum().sum()

0

In [25]:
cols = ["age", "fare"]
features_tr = train[cols]
target_tr = train["survived"]
features_te = test[cols]
target_te = test["survived"]

features_tr.shape, target_tr.shape

((712, 2), (712,))

In [28]:
cols_encoding = ["pclass", "sex", "embarked", "sibsp", "parch"]
tmp_tr = train[cols_encoding]
tmp_te = test[cols_encoding]

tmp_tr.shape

(712, 5)

In [29]:
tmp_tr.head()

Unnamed: 0,pclass,sex,embarked,sibsp,parch
331,1,male,S,0,0
733,2,male,S,0,0
382,3,male,S,0,0
704,3,male,S,1,0
813,3,female,S,4,2


In [31]:
tmp_tr['sex'] = tmp_tr['sex'].map({'male':1, 'female':0})
tmp_tr['embarked'] = tmp_tr['embarked'].map({'S':2, 'C':1, 'Q':0})

tmp_te['sex'] = tmp_te['sex'].map({'male':1, 'female':0})
tmp_te['embarked'] = tmp_te['embarked'].map({'S':2, 'C':1, 'Q':0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_tr['sex'] = tmp_tr['sex'].map({'male':1, 'female':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_tr['embarked'] = tmp_tr['embarked'].map({'S':2, 'C':1, 'Q':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_te['sex'] = tmp_te['sex'].map({'male':1, 'female':0})
A value is trying to b

In [43]:
encoder = ce.OneHotEncoder(use_cat_names=True)

In [44]:
enco_tr = pd.DataFrame() ## 중요
enco_te = pd.DataFrame()

for col in tmp_tr.columns:
    _enco = encoder.fit_transform(tmp_tr[col].astype('category'))
    enco_tr = pd.concat([enco_tr, _enco], axis=1)
    
    _enco = encoder.transform(tmp_te[col].astype('category'))
    enco_te = pd.concat([enco_te, _enco], axis=1)

print(f'{enco_tr.shape} / {enco_te.shape}')
enco_tr.head()

(712, 22) / (179, 22)


Unnamed: 0,pclass_1.0,pclass_2.0,pclass_3.0,sex_1.0,sex_0.0,embarked_2.0,embarked_1.0,embarked_0.0,sibsp_0.0,sibsp_1.0,...,sibsp_2.0,sibsp_8.0,sibsp_5.0,parch_0.0,parch_2.0,parch_1.0,parch_6.0,parch_4.0,parch_3.0,parch_5.0
331,1,0,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
733,0,1,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
382,0,0,1,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
704,0,0,1,1,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
813,0,0,1,0,1,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [42]:
feature_tr = features_tr.reset_index(drop=True)
feature_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape

((712, 2), (712, 22))

In [45]:
df_tr = pd.concat([features_tr, enco_tr], axis=1).reset_index(drop=True)
df_te = pd.concat([features_te, enco_te], axis=1).reset_index(drop=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr.head()

(712, 24) / (179, 24)


Unnamed: 0,age,fare,pclass_1.0,pclass_2.0,pclass_3.0,sex_1.0,sex_0.0,embarked_2.0,embarked_1.0,embarked_0.0,...,sibsp_2.0,sibsp_8.0,sibsp_5.0,parch_0.0,parch_2.0,parch_1.0,parch_6.0,parch_4.0,parch_3.0,parch_5.0
0,45.5,28.5,1,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,23.0,13.0,0,1,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,32.0,7.925,0,0,1,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,26.0,7.8542,0,0,1,1,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,6.0,31.275,0,0,1,0,1,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [46]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr, target_tr)

tr_score = model.score(df_tr,target_tr) 
te_score = model.score(df_te,target_te)

tr_score, te_score

(0.9803370786516854, 0.7430167597765364)

In [55]:
results.append(
    {
        'encoding': 'one-hot',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

# Mean Encoding

In [56]:
enco_tr = pd.concat([tmp_tr, target_tr], axis = 1)

In [57]:
enco_tr.head()

Unnamed: 0,pclass,sex,embarked,sibsp,parch,survived
331,1,1,2,0,0,0
733,2,1,2,0,0,0
382,3,1,2,0,0,0
704,3,1,2,1,0,0
813,3,0,2,4,2,0


In [58]:
enco_tr = enco_tr.groupby('survived').mean()
enco_tr

Unnamed: 0_level_0,pclass,sex,embarked,sibsp,parch
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2.536036,0.855856,1.70045,0.596847,0.331081
1,1.988806,0.324627,1.58209,0.481343,0.458955


In [59]:
a = b # a와 b가 완전히 같은 것임, 즉 b가 변하면 a도 변함
a = b.copy() # a와 b가 같지만, b가 변할 때 a는 변하지 않음

NameError: name 'b' is not defined

In [94]:
df_tr = features_tr.copy() # 복사본 하나 만들기
df_tr = pd.concat([df_tr.reset_index(drop=True), target_tr.reset_index(drop=True)], axis=1)
df_te = features_te.copy()
df_te = pd.concat([df_te.reset_index(drop=True), target_te.reset_index(drop=True)], axis=1)

df_tr = pd.merge(df_tr, enco_tr, on='survived', how='left')
df_te = pd.merge(df_te, enco_tr, on='survived', how='left')

df_tr.drop(['survived'], axis=1, inplace=True) # survived 버리기
df_te.drop(['survived'], axis=1, inplace=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr.head()

KeyError: 'survived'

In [95]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr,target_tr) 
te_score = model.score(df_te,target_te)

tr_score, te_score

(1.0, 1.0)

In [96]:
results.append(
    {
        'encoding': 'mean',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

# label Encoding

In [62]:
encoder = LabelEncoder()
enco_tr = pd.DataFrame()
enco_te = pd.DataFrame()

for col in tmp_tr.columns:
  enco_tr[col] = encoder.fit_transform(tmp_tr[col])
  enco_te[col] = encoder.transform(tmp_te[col])

print(f'{enco_tr.shape} / {enco_te.shape}')
enco_tr.head()

(712, 5) / (179, 5)


Unnamed: 0,pclass,sex,embarked,sibsp,parch
0,0,1,2,0,0
1,1,1,2,0,0
2,2,1,2,0,0
3,2,1,2,1,0
4,2,0,2,4,2


In [71]:
features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape

((712, 2), (712, 5))

In [72]:
df_tr = pd.concat([features_tr,enco_tr],axis=1).reset_index(drop=True)
df_te = pd.concat([features_te,enco_te],axis=1).reset_index(drop=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr.head()

(712, 7) / (179, 7)


Unnamed: 0,age,fare,pclass,sex,embarked,sibsp,parch
0,45.5,28.5,0,1,2,0,0
1,23.0,13.0,1,1,2,0,0
2,32.0,7.925,2,1,2,0,0
3,26.0,7.8542,2,1,2,1,0
4,6.0,31.275,2,0,2,4,2


In [74]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr, target_tr)
te_score = model.score(df_te, target_te)

tr_score, te_score

(0.9803370786516854, 0.7541899441340782)

In [75]:
results.append(
    {
        'encoding': 'label',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

# ordinal Encoding

In [77]:
encoder = ce.OrdinalEncoder(cols = tmp_tr.columns)

In [79]:
enco_tr = encoder.fit_transform(tmp_tr)
enco_te = encoder.transform(tmp_te)

print(f'{enco_tr.shape} / {enco_te.shape}')
enco_tr.head()

(712, 5) / (179, 5)


Unnamed: 0,pclass,sex,embarked,sibsp,parch
331,1,1,1,1,1
733,2,1,1,1,1
382,3,1,1,1,1
704,3,1,1,2,1
813,3,2,1,3,2


In [80]:
features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape

((712, 2), (712, 5))

In [81]:
df_tr = pd.concat([features_tr,enco_tr],axis=1).reset_index(drop=True)
df_te = pd.concat([features_te,enco_te],axis=1).reset_index(drop=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr.head()

(712, 7) / (179, 7)


Unnamed: 0,age,fare,pclass,sex,embarked,sibsp,parch
0,45.5,28.5,1,1,1,1,1
1,23.0,13.0,2,1,1,1,1
2,32.0,7.925,3,1,1,1,1
3,26.0,7.8542,3,1,1,2,1
4,6.0,31.275,3,2,1,3,2


In [82]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr, target_tr)
te_score = model.score(df_te, target_te)

tr_score, te_score

(0.9803370786516854, 0.7486033519553073)

In [83]:
results.append(
    {
        'encoding': 'ordinal',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

# target Encoding

In [85]:
encoder = ce.TargetEncoder()

In [87]:
enco_tr = encoder.fit_transform(tmp_tr.reset_index(drop=True), target_tr.reset_index(drop=True))
enco_te = encoder.transform(tmp_te.reset_index(drop=True))

print(f'{enco_tr.shape} / {enco_te.shape}')
enco_tr.head()

(712, 5) / (179, 5)


Unnamed: 0,pclass,sex,embarked,sibsp,parch
0,1,1,2,0,0
1,2,1,2,0,0
2,3,1,2,0,0
3,3,1,2,1,0
4,3,0,2,4,2


In [88]:
features_tr = features_tr.reset_index(drop=True)
features_te = features_te.reset_index(drop=True)
enco_tr = enco_tr.reset_index(drop=True)
enco_te = enco_te.reset_index(drop=True)

features_tr.shape, enco_tr.shape

((712, 2), (712, 5))

In [89]:
df_tr = pd.concat([features_tr,enco_tr],axis=1).reset_index(drop=True)
df_te = pd.concat([features_te,enco_te],axis=1).reset_index(drop=True)

print(f'{df_tr.shape} / {df_te.shape}')
df_tr.head()

(712, 7) / (179, 7)


Unnamed: 0,age,fare,pclass,sex,embarked,sibsp,parch
0,45.5,28.5,1,1,2,0,0
1,23.0,13.0,2,1,2,0,0
2,32.0,7.925,3,1,2,0,0
3,26.0,7.8542,3,1,2,1,0
4,6.0,31.275,3,0,2,4,2


In [90]:
model = DecisionTreeClassifier(random_state=SEED)
model.fit(df_tr,target_tr)

tr_score = model.score(df_tr, target_tr)
te_score = model.score(df_te, target_te)

tr_score, te_score

(0.9803370786516854, 0.7541899441340782)

In [91]:
results.append(
    {
        'encoding': 'target',
        'tr_score': tr_score,
        'te_score': te_score
    }
)

# 결과 확인

In [97]:
pd.DataFrame(results).sort_values(by=['te_score', 'tr_score'], ascending=[False, False])

Unnamed: 0,encoding,tr_score,te_score
5,mean,1.0,1.0
1,label,0.980337,0.75419
2,mean,0.980337,0.75419
4,target,0.980337,0.75419
3,ordinal,0.980337,0.748603
0,one-hot,0.980337,0.743017
