In [1]:
# To get multi output in jupyternotebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# sklearn ciktisini nnumpy array olarak donmektedir. Bu adimi pandas icin ayarlayabiliriz.
from sklearn import set_config
set_config(transform_output="pandas")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
data=pd.read_csv('../data/feature-engine/creditApprovalUCI-CategoricalAnalysis.csv')
x=data.drop('A16', axis=1)
y=data[['A16']]
sayisal_degiskenler = data.select_dtypes(include=[np.float64, np.int64]).columns
kategorik_degiskenler = data.select_dtypes(exclude=[np.float64, np.int64]).columns
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=0)

### Kategorik Degiskenler -> Sayisal Degiskenler

Kategorik degiskleri nominal ve ordinal olmak uzere iki sinifta toplayabiliriz. KAtegorik degisken icerisinde bir siralama anlami ver ise ordinal degilse nominal kategorik degisken olur. Ornegin: Ogrencilerin ders notlarini siniflandirirken kullanilan A, B, C notlarinin kendi icinde bir sira onemi mevcuttur. Dolayisiyla, ordinal kategorik degisiken teknikleri kullanilmalidir. Diger taraftan, uc farkli ulke isminin siniflari olarak degerlendirildiginde birbiri uzerinde ustunluk olmadigi icin sadece temsili olan kategorik degiskendir. Yani nominal degiskendir.


<img src='../images/ohe.png' width='350'> <img src='../images/oe.png' width='350'> <img src='../images/le.jpeg' width='350'>

[Kaynakca](https://datagy.io/wp-content/uploads/2022/01/One-Hot-Encoding-for-Scikit-Learn-in-Python-Explained-1024x576.png)
[Kaynakca](https://codefinity-content-media.s3.eu-west-1.amazonaws.com/a65bbc96-309e-4df9-a790-a1eb8c815a1c/Ordinal.png)
[Kaynakca](https://statsidea.com/the-right-way-to-carry-out-label-encoding-in-r-with-examples/)


In [4]:
# for one hot encoding with sklearn
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder

# for one hot encoding with feature-engine
#from feature_engine.encoding import OneHotEncoder

In [5]:
encoder = OneHotEncoder(categories='auto',
                        drop='first', 
                        sparse_output=False)
encoder.fit(X_train[kategorik_degiskenler])
X_train_enc=encoder.transform(X_train[kategorik_degiskenler])
X_test_enc=encoder.transform(X_test[kategorik_degiskenler])
X_train_enc.head()

Unnamed: 0,A13_p,A13_s,A12_t,A10_t,A9_t,A7_bb,A7_dd,A7_ff,A7_h,A7_j,...,A6_w,A6_x,A5_g,A5_gg,A5_p,A4_l,A4_u,A4_y,A1_a,A1_b
596,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
204,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
118,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [7]:
# Dataset metedata'sini uci'dan inceledigimizde kategorik veriler nominal oldugu belitirtiliyor. 
# OneHotEncoding teknigini kullanmak dogru olacaktir. Ancak, uygulama acisindan asagida ordinal encoding de gosterelim.
encoder=OrdinalEncoder()

encoder.fit(X_train[kategorik_degiskenler])
X_train_enc=encoder.transform(X_train[kategorik_degiskenler])
X_test_enc=encoder.transform(X_test[kategorik_degiskenler])
X_train_enc.head()

Unnamed: 0,A13,A12,A10,A9,A7,A6,A5,A4,A1
596,0.0,1.0,1.0,1.0,8.0,2.0,1.0,2.0,1.0
303,0.0,0.0,0.0,0.0,8.0,11.0,1.0,2.0,1.0
204,0.0,0.0,1.0,1.0,8.0,13.0,3.0,3.0,2.0
351,0.0,0.0,0.0,0.0,3.0,6.0,3.0,3.0,2.0
118,0.0,1.0,1.0,1.0,8.0,10.0,1.0,2.0,2.0


In [9]:
y_ordinal = np.array(['medium', 'large', 'small', 'medium'])
y_ordinal= y_ordinal[:,np.newaxis]
oe_1 = OrdinalEncoder(categories='auto')
oe_2 = OrdinalEncoder(categories=[['small','medium','large']])
oe_1.fit_transform(y_ordinal)
oe_2.fit_transform(y_ordinal)

Unnamed: 0,x0
0,1.0
1,0.0
2,2.0
3,1.0


Unnamed: 0,x0
0,1.0
1,2.0
2,0.0
3,1.0


In [10]:
y = np.array(['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot'])
le = LabelEncoder()
le.fit_transform(y)
le.inverse_transform([0,2,1,2])

array([0, 0, 2, 0, 1, 1, 2, 0, 2, 1])

array(['cold', 'warm', 'hot', 'warm'], dtype='<U4')

#### Hedef degiskene gore kodlama  (MeanEncoding, TargetEncoding)

Hedefin ortalamasıyla kodlama, makine öğreniminde kullanılan bir tekniktir, özellikle özellik kodlama veya özellik mühendisliği bağlamında. Bu teknik, modelin tahmin etmeye çalıştığı hedef değişkenin olduğu gözetimli öğrenme görevlerinde sıklıkla kullanılır.

1. **Hedef Değişkenin Anlaşılması**: Gözetimli öğrenmede, genellikle her gözlem için modelin tahmin etmeye çalıştığı bir hedef değişken vardır. Örneğin, ikili sınıflandırma görevinde, hedef değişken her gözlemin sınıf üyeliğini gösteren ikili bir etiket olabilir.

2. **Kategorik Değişkenlere Göre Gruplama**: Veri kümesi kategorik değişkenler içeriyorsa, hedefin ortalamasıyla kodlamada veriyi bu kategorik değişkenlere göre gruplamak gerekir.

3. **Her Kategori için Ortalama Hedefin Hesaplanması**: Her kategorinin içinde, kategorik değişkenlere göre, hedef değişkenin ortalaması hesaplanır. Yani, her benzersiz kategori için, o kategoriyle ilişkilendirilmiş tüm veri noktalarının hedef değişkeninin ortalama değeri hesaplanır.

4. **Kategorilerin Ortalama Hedef ile Değiştirilmesi**: Her kategori için hesaplanan ortalama hedef değerleri, orijinal kategorik değerlerin yerine kullanılır. Yani, modelde doğrudan orijinal kategorik değerleri kullanmak yerine, bu kategorilere karşılık gelen ortalama hedef değerlerini kullanırsınız.

5. **Makine Öğrenme Modellerinde Kullanımı**: Ortalama hedefle kodlanmış kategorik değişkenler, ardından makine öğrenme modellerinde özellikler olarak kullanılabilir. Tahminler yaparken, model bu kodlanmış özellikleri kullanarak yeni veri noktaları üzerinde tahminler yapar.

6. **Düzenleme ve Düzeltme**: Uygulamada, aşırı uyumlanmayı önlemek için, her kategori için ortalama hedef hesaplanırken bir düzenleme terimi eklemek veya düzeltme teknikleri uygulamak yaygındır. Bu, ortalama hedef değerlerinin küçük örneklemler veya her kategori içindeki aykırı değerler tarafından aşırı derecede etkilenmesini önler.

Hedefin ortalamasıyla kodlama, yüksek kardinaliteye sahip (yani, çok sayıda benzersiz kategoriye sahip) kategorik değişkenlerle uğraşırken ve kategorik değişken ile hedef değişken arasında güçlü bir ilişki olduğunda özellikle yararlı olabilir. Ancak, her kodlama tekniğinde olduğu gibi, etkinliğini çapraz doğrulama ve görünmeyen veriler üzerinde test ederek doğrulamak önemlidir.

Kategorik degisken ile hedef degisken arasinda kuvvetli bir iliski oldugu dusunuldugunde, kategorik degerleri sayisal degerlere donustururken hedef degisken dikkate alinmasi model performansina arttirabilmektedir. Scikit-learn`in sayfasinda bulanan [ornek](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_target_encoder.html#sphx-glr-auto-examples-preprocessing-plot-target-encoder-py) incelenebilir.

<img src='../images/targetencoder.png' width='700' height='300'>
https://dl.acm.org/doi/10.1145/507533.507538

In [11]:
from sklearn.preprocessing import TargetEncoder
encoder= TargetEncoder(target_type='binary')
encoder.fit(X_train[kategorik_degiskenler], y_train.to_numpy().flatten())

X_train_enc=encoder.transform(X_train[kategorik_degiskenler])
X_test_enc=encoder.transform(X_test[kategorik_degiskenler])

X_train_enc.head()

X_train.head()

Unnamed: 0,A13,A12,A10,A9,A7,A6,A5,A4,A1
596,0.464817,0.445472,0.701825,0.784264,0.418881,0.451588,0.512222,0.512222,0.472063
303,0.464817,0.452459,0.251477,0.070926,0.418881,0.622077,0.512222,0.512222,0.472063
204,0.464817,0.452459,0.701825,0.784264,0.418881,0.498954,0.227451,0.227451,0.438837
351,0.464817,0.452459,0.251477,0.070926,0.150027,0.108675,0.227451,0.227451,0.438837
118,0.464817,0.445472,0.701825,0.784264,0.418881,0.424035,0.512222,0.512222,0.438837


Unnamed: 0,A15,A14,A13,A12,A11,A10,A9,A8,A7,A6,A5,A4,A3,A2,A1
596,4159,396.0,g,t,8,t,t,2.375,v,c,g,u,3.0,46.08,a
303,0,120.0,g,f,0,f,f,0.085,v,q,g,u,2.875,15.92,a
204,1187,50.0,g,f,1,t,t,0.085,v,w,p,y,2.125,36.33,b
351,0,100.0,g,f,0,f,f,0.0,ff,ff,p,y,0.585,22.17,b
118,1332,360.0,g,t,6,t,t,14.0,v,m,g,u,7.04,57.83,b


In [12]:
y_train[X_train['A7']=='v'].mean()

A16    0.418773
dtype: float64

In [151]:
#data[data['A7']=='v']['A16'].mean()
#data.groupby(by=['A7'])['A16'].mean()


#### Farkli encoding teknikleri
Kaggle gibi platformlar ilan edilen yarismalarda genelde ozel teknikler kullanilarak iyi sonuclar elde edilmisitr.
- [RareLabelEncoder](https://feature-engine.trainindata.com/en/latest/api_doc/encoding/RareLabelEncoder.html): Degisken icindeki kategorilerin bulinma oranlarinda bakildiginda bazilari nadir gozukenler olabilmektedir. Bu gibi durumlarda belirlenen bir esik degeri altinda kalanlari tek bir kategoriye donusturulebilir.
- **WoEEncoder** [baglanti1](https://feature-engine.trainindata.com/en/latest/api_doc/encoding/WoEEncoder.html)  [baglanti2](https://www.listendata.com/2015/03/weight-of-evidence-woe-and-information.html): Kategorik degisken icindeki kategorilere karsilik gelen hedef degiskenlerin ortalamasi alinarak hesaplanan agirlikli bir encoding islemleri de yapilabilir.

[CatBoost](https://arxiv.org/abs/1706.09516) encoding yontemleri etkili yontemlerden bir tanesidir. TargetEncoding turunde bir kodlamam yontemidir.

In [14]:
from category_encoders import WOEEncoder, CatBoostEncoder

In [15]:
encoder= WOEEncoder(cols=kategorik_degiskenler)
encoder.fit(X_train, y_train.to_numpy().flatten())

X_train_enc=encoder.transform(X_train)
X_test_enc=encoder.transform(X_test)

X_train_enc.head()
X_train.head()

Unnamed: 0,A15,A14,A13,A12,A11,A10,A9,A8,A7,A6,A5,A4,A3,A2,A1
596,4159,396.0,0.061731,-0.015149,8,1.053667,1.484831,2.375,-0.123507,0.011872,0.25124,0.25124,3.0,46.08,0.092216
303,0,120.0,0.061731,0.012673,0,-0.882098,-2.321458,0.085,-0.123507,0.694392,0.25124,0.25124,2.875,15.92,0.092216
204,1187,50.0,0.061731,0.012673,1,1.053667,1.484831,0.085,-0.123507,0.201915,-1.002058,-1.002058,2.125,36.33,-0.042619
351,0,100.0,0.061731,0.012673,0,-0.882098,-2.321458,0.0,-1.435694,-1.743995,-1.002058,-1.002058,0.585,22.17,-0.042619
118,1332,360.0,0.061731,-0.015149,6,1.053667,1.484831,14.0,-0.123507,-0.085767,0.25124,0.25124,7.04,57.83,-0.042619


Unnamed: 0,A15,A14,A13,A12,A11,A10,A9,A8,A7,A6,A5,A4,A3,A2,A1
596,4159,396.0,g,t,8,t,t,2.375,v,c,g,u,3.0,46.08,a
303,0,120.0,g,f,0,f,f,0.085,v,q,g,u,2.875,15.92,a
204,1187,50.0,g,f,1,t,t,0.085,v,w,p,y,2.125,36.33,b
351,0,100.0,g,f,0,f,f,0.0,ff,ff,p,y,0.585,22.17,b
118,1332,360.0,g,t,6,t,t,14.0,v,m,g,u,7.04,57.83,b


In [16]:
encoder= CatBoostEncoder(cols=kategorik_degiskenler)
encoder.fit(X_train, y_train.to_numpy().flatten())

X_train_enc=encoder.transform(X_train)
X_test_enc=encoder.transform(X_test)

X_train_enc.head()
X_train.head()

Unnamed: 0,A15,A14,A13,A12,A11,A10,A9,A8,A7,A6,A5,A4,A3,A2,A1
596,4159,396.0,0.464817,0.445472,8,0.70164,0.783849,2.375,0.418882,0.451588,0.512223,0.512223,3.0,46.08,0.472064
303,0,120.0,0.464817,0.452459,0,0.251652,0.072146,0.085,0.418882,0.621917,0.512223,0.512223,2.875,15.92,0.472064
204,1187,50.0,0.464817,0.452459,1,0.70164,0.783849,0.085,0.418882,0.498965,0.228011,0.228011,2.125,36.33,0.438837
351,0,100.0,0.464817,0.452459,0,0.251652,0.072146,0.0,0.153554,0.114084,0.228011,0.228011,0.585,22.17,0.438837
118,1332,360.0,0.464817,0.445472,6,0.70164,0.783849,14.0,0.418882,0.424047,0.512223,0.512223,7.04,57.83,0.438837


Unnamed: 0,A15,A14,A13,A12,A11,A10,A9,A8,A7,A6,A5,A4,A3,A2,A1
596,4159,396.0,g,t,8,t,t,2.375,v,c,g,u,3.0,46.08,a
303,0,120.0,g,f,0,f,f,0.085,v,q,g,u,2.875,15.92,a
204,1187,50.0,g,f,1,t,t,0.085,v,w,p,y,2.125,36.33,b
351,0,100.0,g,f,0,f,f,0.0,ff,ff,p,y,0.585,22.17,b
118,1332,360.0,g,t,6,t,t,14.0,v,m,g,u,7.04,57.83,b
