### 결측 처리

In [1]:
import os
import pandas as pd
os.chdir("../../data")
df = pd.read_csv("classification/bands.csv")

#### 결측 판단

In [2]:
display(df.isnull().head())

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16,x17,x18,x19,y
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [3]:
print(df.loc[2, "x6"])

nan


In [4]:
display(df.isnull().sum(axis = 0))

x1     54
x2      5
x3     27
x4      2
x5      1
x6     30
x7     63
x8     55
x9     10
x10    55
x11    55
x12    56
x13    54
x14     6
x15     7
x16    54
x17     7
x18     7
x19     3
y       0
dtype: int64

#### 결측 제거

In [5]:
df.dropna(inplace = True)
print(df.isnull().sum().sum())

0


#### 대푯값으로 결측 대체

In [6]:
X = df.drop('y', axis = 1)
y = df['y']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2022)

In [7]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "mean")

In [8]:
imputer.fit(X_train)
Z_train = pd.DataFrame(imputer.transform(X_train), columns = X_train.columns)
Z_test = pd.DataFrame(imputer.transform(X_test), columns = X_test.columns)

#### 이웃을 활용한 결측 대체

In [9]:
from sklearn.impute import KNNImputer
imputer = KNNImputer()
imputer.fit(X_train)
Z_train = pd.DataFrame(imputer.transform(X_train), columns = X_train.columns)
Z_test = pd.DataFrame(imputer.transform(X_test), columns = X_test.columns)

### 범주 및 서열형 변수 처리

In [10]:
df = pd.read_csv("classification/german.csv")
X = df.drop('y', axis = 1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2022)

#### 더미화

In [11]:
from feature_engine.encoding import OneHotEncoder as OHE
dummy_model = OHE(drop_last = True).fit(X_train)
Z_train = dummy_model.transform(X_train)
Z_test = dummy_model.transform(X_test)

#### 라벨을 활용한 치환

In [12]:
S = df.groupby('x1')['y'].mean()
display(S)

x1
A11    0.492701
A12    0.390335
A13    0.222222
A14    0.116751
Name: y, dtype: float64

In [13]:
display(df['x1'].replace(S.to_dict()))

0      0.492701
1      0.390335
2      0.116751
3      0.492701
4      0.492701
         ...   
995    0.492701
996    0.390335
997    0.390335
998    0.116751
999    0.390335
Name: x1, Length: 1000, dtype: float64

In [14]:
train = pd.concat([X_train, y_train], axis = 1)

In [15]:
for col, dtype in zip(X_train.columns, X_train.dtypes):
    if dtype == object:
        S = train.groupby(col)['y'].mean().to_dict()
        X_train.loc[:, col] = X_train[col].replace(S)
        X_test.loc[:, col] = X_test[col].replace(S)

display(X_train['x1'].head())
display(X_test['x1'].head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


357    0.497512
964    0.405797
337    0.113402
980    0.235294
455    0.113402
Name: x1, dtype: float64

652    0.497512
579    0.113402
836    0.113402
586    0.113402
226    0.113402
Name: x1, dtype: float64

### 스케일링

In [16]:
df = pd.read_csv("classification/glass.csv")
X = df.drop('y', axis = 1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2022)

In [17]:
display(X_train.describe())

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9
count,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0
mean,1.518363,13.440179,2.707666,1.451238,72.623665,0.504136,8.940115,0.180554,0.039174
std,0.002738,0.77026,1.417633,0.514204,0.776014,0.725813,1.275246,0.500161,0.090792
min,1.512995,10.73,0.0,0.29,69.81,0.0,5.87116,0.0,0.0
25%,1.51656,12.987675,2.197855,1.186393,72.2754,0.11178,8.23836,0.0,0.0
50%,1.517688,13.33015,3.48424,1.363745,72.7556,0.555795,8.60958,0.0,0.0
75%,1.519174,13.873788,3.60996,1.634188,73.044,0.603922,9.23635,0.0,0.052275
max,1.531242,15.79065,4.49,3.5,75.1804,6.21,14.96336,3.15,0.51


#### 최소-최대 정규화

In [18]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X_train)
Z_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
Z_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
display(Z_train.describe())

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9
count,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0
mean,0.294187,0.53554,0.603044,0.361756,0.523921,0.081181,0.337537,0.057319,0.076813
std,0.150068,0.152206,0.315731,0.160188,0.144498,0.116878,0.140257,0.158781,0.178023
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.195381,0.446124,0.4895,0.27925,0.459072,0.018,0.260355,0.0,0.0
50%,0.257179,0.513798,0.776,0.3345,0.548488,0.0895,0.301183,0.0,0.0
75%,0.338639,0.621222,0.804,0.41875,0.60219,0.09725,0.370118,0.0,0.1025
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
Z_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
Z_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)
display(Z_train.describe())

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9
count,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0
mean,-3.455847e-14,-1.509903e-15,7.285839000000001e-17,-2.0816680000000002e-17,1.426012e-14,1.221245e-16,-4.149459e-16,-8.985868000000001e-17,-1.0408340000000001e-17
std,1.00314,1.00314,1.00314,1.00314,1.00314,1.00314,1.00314,1.00314,1.00314
min,-1.966508,-3.529571,-1.915988,-2.26541,-3.637174,-0.6967617,-2.414116,-0.362125,-0.4328292
25%,-0.660475,-0.5893132,-0.3607508,-0.5166751,-0.4501959,-0.5422714,-0.5520178,-0.362125,-0.4328292
50%,-0.2473847,-0.1432947,0.549516,-0.1706856,0.17055,0.07139809,-0.260007,-0.362125,-0.4328292
75%,0.2971433,0.5647055,0.6384775,0.3569093,0.5433594,0.1379147,0.2330258,-0.362125,0.144746
max,4.718043,3.06111,1.261208,3.996845,3.305045,7.886029,4.738033,5.955617,5.20205


### 재샘플링

In [20]:
df = pd.read_csv("classification/yeast-1_vs_7.csv")
X = df.drop('y', axis = 1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2022)

In [21]:
display(y_train.value_counts())

0    320
1     24
Name: y, dtype: int64

#### SMOTE

In [22]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
s_X_train, s_y_train = smote.fit_resample(X_train, y_train)
s_X_train = pd.DataFrame(s_X_train, columns = X_train.columns)
s_y_train = pd.Series(s_y_train)

In [23]:
display(s_y_train.value_counts())

0    320
1    320
Name: y, dtype: int64

#### NearMiss

In [24]:
from imblearn.under_sampling import NearMiss
nm = NearMiss()
s_X_train, s_y_train = nm.fit_resample(X_train, y_train)
s_X_train = pd.DataFrame(s_X_train, columns = X_train.columns)
s_y_train = pd.Series(s_y_train)

In [25]:
display(s_y_train.value_counts())

0    24
1    24
Name: y, dtype: int64

### 특징 선택

In [26]:
df = pd.read_csv("classification/wdbc.csv")
X = df.drop('y', axis = 1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 2022)

#### SelectKBest 클래스

In [27]:
from sklearn.feature_selection import *
selector = SelectKBest(f_classif, k = 10)
selector.fit(X_train, y_train)
selected_features = X_train.columns[selector.get_support()]
Z_train = X_train.loc[:, selected_features]
Z_test = X_test.loc[:, selected_features]

In [28]:
print(X_train.shape)
print(Z_train.shape)

(426, 30)
(426, 10)
