## OneHotEncoder

In [1]:
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Toy categorical dataset: two string-valued columns with four rows each.
data = dict(
    Feature1=['A', 'B', 'A', 'C'],
    Feature2=['High', 'Low', 'Medium', 'Low'],
)

In [3]:
import pandas as pd

In [4]:
# Turn the column-name -> values mapping into a DataFrame and display it.
df_data = pd.DataFrame(data=data)
df_data

Unnamed: 0,Feature1,Feature2
0,A,High
1,B,Low
2,A,Medium
3,C,Low


In [5]:
# Show the three parts of the frame side by side: row index, column labels,
# and the underlying values.
df_data.index, df_data.columns, df_data.values

(RangeIndex(start=0, stop=4, step=1),
 Index(['Feature1', 'Feature2'], dtype='object'),
 array([['A', 'High'],
        ['B', 'Low'],
        ['A', 'Medium'],
        ['C', 'Low']], dtype=object))

In [6]:
# Check which Python type `.values` returns.
type(df_data.values)

numpy.ndarray

In [7]:
# Create the encoder with all-default settings.
oneHotEncoder = OneHotEncoder()

In [8]:
# Fit on Feature1 only. The double brackets select a one-column DataFrame
# (2-D input) rather than a Series.
oneHotEncoder.fit(df_data[['Feature1']])

OneHotEncoder()

In [9]:
# Inspect the categories the encoder found while fitting.
oneHotEncoder.categories_

[array(['A', 'B', 'C'], dtype=object)]

In [10]:
# Encode Feature1 with the fitted encoder; .toarray() turns the transform
# result into a plain array that can be handed to pd.DataFrame.
encoder_array = oneHotEncoder.transform(df_data[['Feature1']]).toarray()

In [11]:
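# Legacy API, kept for reference only: get_feature_names() was replaced by
# get_feature_names_out() in newer scikit-learn releases.
# oneHotEncoder.get_feature_names(['Feature1'])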

In [12]:
# Wrap the encoded array in a DataFrame.
# - Column names come from the fitted encoder (Feature1_A, Feature1_B, ...)
#   instead of being hard-coded, so they always match the learned categories.
# - The index is copied from df_data so that a later column-wise pd.concat
#   lines the rows up one-to-one even if df_data does not use the default
#   0..n-1 index (concat with axis=1 aligns on index labels, not position).
df_encoder = pd.DataFrame(
    encoder_array,
    columns=oneHotEncoder.get_feature_names_out(['Feature1']),
    index=df_data.index,
)
df_encoder

Unnamed: 0,Feature1_A,Feature1_B,Feature1_C
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0


In [27]:
# Place the original columns and the one-hot columns side by side
# (column-wise concatenation), then display the combined frame.
df_concat = pd.concat(objs=[df_data, df_encoder], axis='columns')
df_concat

Unnamed: 0,Feature1,Feature2,Feature1_A,Feature1_B,Feature1_C
0,A,High,1.0,0.0,0.0
1,B,Low,0.0,1.0,0.0
2,A,Medium,1.0,0.0,0.0
3,C,Low,0.0,0.0,1.0


### 모델학습 (Model training)

In [28]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# Multi-output toy fit: one encoded column is the input and the other two
# encoded columns are the targets.
model = DecisionTreeClassifier()
model.fit(
    X=df_concat[['Feature1_A']],
    y=df_concat[['Feature1_B', 'Feature1_C']],
)

DecisionTreeClassifier()

In [29]:
# Predict on the same single-column input that was used for fitting.
model.predict(df_concat[['Feature1_A']])

array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]])

## Imbalanced Data Sampling

### Under Sampling: Tomek Links

In [14]:
# One-time setup: uncomment to install imbalanced-learn (the package that provides `imblearn`).
#%conda install -c conda-forge imbalanced-learn

In [15]:
from imblearn.under_sampling import TomekLinks

In [16]:
from sklearn.datasets import make_classification

In [17]:
# Synthetic two-class dataset with a 30/70 class weighting.
# random_state is fixed so the generated arrays are the same on every run.
features, target = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.3, 0.7],
    class_sep=2,
    flip_y=0,
    random_state=10,
)

In [18]:
# Confirm the shapes of the generated feature matrix and label vector.
features.shape, target.shape

((1000, 20), (1000,))

In [19]:
from collections import Counter

In [20]:
# Count samples per class label to see how imbalanced the data is.
Counter(target)

Counter({0: 300, 1: 700})

In [21]:
# Under-sample with Tomek links, using the sampler's default settings.
tomekLinks = TomekLinks()
features_resample, target_resample = tomekLinks.fit_resample(X=features, y=target)

In [22]:
# Compare with the original shapes to see how many rows were removed.
features_resample.shape, target_resample.shape

((996, 20), (996,))

In [23]:
# Class counts after under-sampling.
Counter(target_resample)

Counter({0: 300, 1: 696})

### Over Sampling: SMOTE

In [24]:
from imblearn.over_sampling import SMOTE

In [25]:
# Over-sample the minority class with SMOTE so both classes end up the same size.
# SMOTE creates synthetic samples using randomness, so the seed is fixed
# (same value as the random_state passed to make_classification) to make the
# resampled arrays reproducible on Restart & Run All.
smote = SMOTE(random_state=10)
features_over_sampling, target_over_sampling = smote.fit_resample(features, target)
features_over_sampling.shape, target_over_sampling.shape

((1400, 20), (1400,))

In [26]:
# Class counts after over-sampling.
Counter(target_over_sampling)

Counter({0: 700, 1: 700})