# Data Preprocessing And Data Cleaning
資料清理 ( Data Cleaning ) 以及 資料前處理 ( Data Preprocessing )

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris

# Notebook-wide display option: print NumPy floats in fixed-point notation
# instead of scientific notation.
np.set_printoptions(suppress=True)

In [7]:
# Load the iris dataset and assemble a single table: the four feature columns
# followed by the class label in a trailing 'target' column.
iris = load_iris()
df_data = pd.DataFrame(
    data=np.column_stack((iris.data, iris.target)),
    columns=[*iris.feature_names, 'target'],
)
df_data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [8]:
# Print the index range, per-column non-null counts and dtypes, and the memory usage of df_data.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    float64
dtypes: float64(5)
memory usage: 6.0 KB


In [18]:
# Feature matrix: every column except the 'target' label, as a NumPy array.
x = df_data.drop(columns='target').to_numpy()
# Label vector (kept as a pandas Series).
y = df_data['target']
# Missing-data check: count the NaN entries in the feature matrix.
print("checked missing data(NAN mount):", np.count_nonzero(np.isnan(x)))

checked missing data(NAN mount): 0


In [32]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples as a test set with a fixed seed for reproducibility.
# Stratify on the same label array that is being split, so this cell does not
# depend on the separate `y` Series defined in an earlier cell.
x_train, x_test, y_train, y_test = train_test_split(
    iris['data'],
    iris['target'],
    test_size=0.3,
    random_state=42,
    stratify=iris['target'],
)

# Report the shape of each split.
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

x_train shape: (105, 4)
y_train shape: (105,)
x_test shape: (45, 4)
y_test shape: (45,)


In [31]:
# Show the first sample (as a length-1 slice) and its label from each split.
print(
    "x_train features:", x_train[0:1],
    ", label:", y_train[0:1],
)
print(
    "x_test features:", x_test[0:1],
    ", label:", y_test[0:1],
)

x_train features: [[5.1 2.5 3.  1.1]] , label: [1]
x_test features: [[7.3 2.9 6.3 1.8]] , label: [2]


In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of df_data.
df_data.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [36]:
# Preview the first five rows of the unscaled training features.
x_train[:5]

array([[5.1, 2.5, 3. , 1.1],
       [6.2, 2.2, 4.5, 1.5],
       [5.1, 3.8, 1.5, 0.3],
       [6.8, 3.2, 5.9, 2.3],
       [5.7, 2.8, 4.1, 1.3]])

## StandardScaler

In [41]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split, then apply it to that same split.
standard_scaler = StandardScaler().fit(x_train)
x_train_scaled = standard_scaler.transform(x_train)

# Preview the first five standardized rows.
x_train_scaled[:5]

array([[-0.90045861, -1.22024754, -0.4419858 , -0.13661044],
       [ 0.38036614, -1.87955796,  0.40282929,  0.38029394],
       [-0.90045861,  1.63676428, -1.2868009 , -1.17041921],
       [ 1.07899781,  0.31814344,  1.19132338,  1.41410271],
       [-0.20182693, -0.56093712,  0.17754527,  0.12184175]])

In [40]:
# Per-feature mean and standard deviation before and after standardization.
print(
    "x_train",
    " mean: ", np.mean(x_train, axis=0),
    ", std: ", np.std(x_train, axis=0),
)
print(
    "x_train_scaled",
    " mean: ", np.mean(x_train_scaled, axis=0),
    ", std: ", np.std(x_train_scaled, axis=0),
)

x_train  mean:  [5.87333333 3.0552381  3.7847619  1.20571429] , std:  [0.85882164 0.45502087 1.77553646 0.77383751]
x_train_scaled  mean:  [ 0. -0. -0. -0.] , std:  [1. 1. 1. 1.]


## MinMaxScaler

In [42]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training split, then apply it to that same split.
minmax_scaler = MinMaxScaler().fit(x_train)
x_train_minmax_scaled = minmax_scaler.transform(x_train)

# Preview the first five rescaled rows.
x_train_minmax_scaled[:5]

array([[0.22222222, 0.20833333, 0.32758621, 0.41666667],
       [0.52777778, 0.08333333, 0.5862069 , 0.58333333],
       [0.22222222, 0.75      , 0.06896552, 0.08333333],
       [0.69444444, 0.5       , 0.82758621, 0.91666667],
       [0.38888889, 0.33333333, 0.51724138, 0.5       ]])

In [54]:
# Per-feature minimum and maximum before and after min-max scaling.
print(
    "x_train",
    " min: ", np.min(x_train, axis=0),
    ", max: ", np.max(x_train, axis=0),
)
print(
    "x_train_minmax_scaled",
    " min: ", np.min(x_train_minmax_scaled, axis=0),
    ", max: ", np.max(x_train_minmax_scaled, axis=0),
)

x_train  min:  [4.3 2.  1.1 0.1] , max:  [7.9 4.4 6.9 2.5]
x_train_minmax_scaled  min:  [0. 0. 0. 0.] , max:  [1. 1. 1. 1.]


## MaxAbsScaler

In [51]:
from sklearn.preprocessing import MaxAbsScaler

# Fit the max-abs scaler on the training split and transform it in one call.
maxabs_scaler = MaxAbsScaler()
x_train_maxabs_scaled = maxabs_scaler.fit_transform(x_train)

# Preview the first five rescaled rows.
x_train_maxabs_scaled[:5]

array([[0.64556962, 0.56818182, 0.43478261, 0.44      ],
       [0.78481013, 0.5       , 0.65217391, 0.6       ],
       [0.64556962, 0.86363636, 0.2173913 , 0.12      ],
       [0.86075949, 0.72727273, 0.85507246, 0.92      ],
       [0.72151899, 0.63636364, 0.5942029 , 0.52      ]])

In [57]:
# MaxAbsScaler rescales each feature by its largest absolute value, so compare
# the per-feature max |x| before and after scaling (using abs so the check
# also holds for data containing negative values).
print("x_train", " maxabs: ", np.abs(x_train).max(axis=0))
print("x_train_maxabs_scaled", " maxabs: ", np.abs(x_train_maxabs_scaled).max(axis=0))

# Per-feature minimum before and after scaling; labelled as a minimum because
# these values are not the max-abs statistics.
print("x_train", " min: ", x_train.min(axis=0))
print("x_train_maxabs_scaled", " min: ", x_train_maxabs_scaled.min(axis=0))

x_train  maxabs:  [7.9 4.4 6.9 2.5]
x_train_maxabs_scaled  maxabs:  [1. 1. 1. 1.]
x_train  maxabs:  [4.3 2.  1.1 0.1]
x_train_maxabs_scaled  maxabs:  [0.5443038  0.45454545 0.15942029 0.04      ]


MaxAbsScaler 的工作原理是將每個特徵的絕對最大值縮放為 1，而保持數據的正負號。具體來說，對於每個特徵 x_i，其經過縮放後的值會按照以下公式計算：


$$x_{\text{scaled}} = \frac{x}{\max(|x|)}$$


因此，如果某個特徵的所有樣本值都是正數，那麼縮放後的結果也都會是正數，因為沒有負值需要處理。此時所有值都會落在 (0, 1] 的範圍內：最大值為 1，其餘的值大於 0 且不超過 1（最小值不會被平移到 0，這點與 MinMaxScaler 不同）。

這樣的行為符合 MaxAbsScaler 的預期設計。如果原始數據中沒有負數，那麼縮放後的數據也不會有負數。

## RobustScaler