In [5]:
# import library
import pandas as pd
import numpy as np
from sklearn import datasets  # 載入iris資料集
from io import StringIO  # 模擬csv檔，csv檔都用","逗號隔開，實際上可以把csv轉成pandas


In [6]:
csv_data = """
A, B, C, D, E
5.0, 2.0, 3.0,, 6
4.0, 5.5,, 3.2, 2.1
1.0, 6.0, 2.0, 8.0, 5
3.0,,4.0, 8.5,9.0
5.0, 4.1, 3.5, 3.0, 4.0
"""
# 如果兩個,之間有空格的話，那就不會是空值NaN，會形成一個空白的空格

In [7]:
# 先用StringIO讀取csv_data，就變成一個csv檔，再用pd.read_csv讀取
df = pd.read_csv(StringIO(csv_data))

In [8]:
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,4.0,5.5,,3.2,2.1
2,1.0,6.0,2.0,8.0,5.0
3,3.0,,4.0,8.5,9.0
4,5.0,4.1,3.5,3.0,4.0


可以看到NaN在pandas表示空值

### Missing Data (空值資料處理)

In [9]:
# 如果資料夠多就可以直接丟掉空值:.dropna()
df.dropna()

Unnamed: 0,A,B,C,D,E
2,1.0,6.0,2.0,8.0,5.0
4,5.0,4.1,3.5,3.0,4.0


In [10]:
# dropna()可以指定，比如所有的欄位都是空值才丟掉的話:how="all"
df.dropna(how="all")

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,4.0,5.5,,3.2,2.1
2,1.0,6.0,2.0,8.0,5.0
3,3.0,,4.0,8.5,9.0
4,5.0,4.1,3.5,3.0,4.0


In [11]:
# 指定某一個欄位資料是空值的話就丟掉:subset=[]
df.dropna(subset=[' C']) #這邊注意，我在創csv檔的時候C前面有一個空格，所以要記得打空格

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
2,1.0,6.0,2.0,8.0,5.0
3,3.0,,4.0,8.5,9.0
4,5.0,4.1,3.5,3.0,4.0


In [12]:
# 補值:.fillna(數字)
df.fillna(0)  # 用0來補這個空值

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,0.0,6.0
1,4.0,5.5,0.0,3.2,2.1
2,1.0,6.0,2.0,8.0,5.0
3,3.0,0.0,4.0,8.5,9.0
4,5.0,4.1,3.5,3.0,4.0


In [13]:
# 用"平均值"補空值(注意是該欄位的平均值)
df[" B"] = df[" B"].fillna(df[" B"].mean())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,4.0,5.5,,3.2,2.1
2,1.0,6.0,2.0,8.0,5.0
3,3.0,4.4,4.0,8.5,9.0
4,5.0,4.1,3.5,3.0,4.0


In [14]:
# 用"眾數"補空值(注意是該欄位的眾數)
df[" C"] = df[" C"].fillna(df[" C"].mode())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,,6.0
1,4.0,5.5,3.0,3.2,2.1
2,1.0,6.0,2.0,8.0,5.0
3,3.0,4.4,4.0,8.5,9.0
4,5.0,4.1,3.5,3.0,4.0


In [15]:
# 用"中位數"補空值(注意是該欄位的中位數)
df[" D"] = df[" D"].fillna(df[" D"].median())
df

Unnamed: 0,A,B,C,D,E
0,5.0,2.0,3.0,5.6,6.0
1,4.0,5.5,3.0,3.2,2.1
2,1.0,6.0,2.0,8.0,5.0
3,3.0,4.4,4.0,8.5,9.0
4,5.0,4.1,3.5,3.0,4.0


最小值min等等也是用同樣方式

### Categorical Data (類別資料處理)

In [16]:
# 先創立資料
df2 = pd.DataFrame([
    ["green", "S", 10.0, 1],
    ["red", "L", 15.5, 2],
    ["blue", "XL", 17.7, 1]
])
df2.columns = ["color", "size", "price", "classlabel"]
df2

Unnamed: 0,color,size,price,classlabel
0,green,S,10.0,1
1,red,L,15.5,2
2,blue,XL,17.7,1


In [17]:
# 把size轉成數值
# 先創一個dict，做出對應關係
size_mapping = {
    "XL":3,
    "L":2,
    "S":1
}
# 只需要針對size欄位用map()
df2["size"] = df2["size"].map(size_mapping)  # map()，裡面放dict就可以直接做轉換了
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.0,1
1,red,2,15.5,2
2,blue,3,17.7,1


In [18]:
# one-hot encoding的方式可以用pd.get_dummies(欄位)
pd.get_dummies(df2["color"])

Unnamed: 0,blue,green,red
0,0,1,0
1,0,0,1
2,1,0,0


In [19]:
# onehot-encoding方式，有一個prefix，因為新創的欄位名稱就變成原本的類別名稱了，用prefix可以知道他原本是從哪一個欄位出來的
onehot_encoding = pd.get_dummies(df2["color"], prefix="color")

In [20]:
# color已經做完處理了可以把欄位丟掉
df2 = df2.drop("color", 1)
df2

  df2 = df2.drop("color", 1)


Unnamed: 0,size,price,classlabel
0,1,10.0,1
1,2,15.5,2
2,3,17.7,1


In [21]:
# drop完了，合併onehot_encoding和原本的df2
pd.concat([onehot_encoding, df2], axis=1)  # 合併方向:水平 → axis=1

Unnamed: 0,color_blue,color_green,color_red,size,price,classlabel
0,0,1,0,1,10.0,1
1,0,0,1,2,15.5,2
2,1,0,0,3,17.7,1


### Normalization (資料常態化(正規化)、歸一化、區間縮放)

In [22]:
# 可以輸入數學算式LaTex
from IPython.display import Math

In [23]:
Math("x_{norm}^{(i)}=\cfrac{x^{(i)} - x_{min}}{x_{max} - x_{min}}")

<IPython.core.display.Math object>

In [24]:
# 先載入iris資料集
iris = datasets.load_iris()
x = pd.DataFrame(iris["data"], columns = iris["feature_names"])
print("target_names:"+str(iris["target_names"]))
y = pd.DataFrame(iris["target"], columns = ["target_names"])
data = pd.concat([x, y], axis=1)
data.head(3)

target_names:['setosa' 'versicolor' 'virginica']


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [25]:
# 針對花萼長度做Normalization
data['sepal length (cm)'] = (data['sepal length (cm)'] - data['sepal length (cm)'].min())/\
                            (data['sepal length (cm)'].max() - data['sepal length (cm)'].min())
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,0.222222,3.5,1.4,0.2,0
1,0.166667,3.0,1.4,0.2,0
2,0.111111,3.2,1.3,0.2,0
3,0.083333,3.1,1.5,0.2,0
4,0.194444,3.6,1.4,0.2,0


### Standardization (資料標準化)

In [27]:
Math('x_{std}^{(i)}=\cfrac{x^{(i)}-\mu_x}{\sigma_x}')

<IPython.core.display.Math object>

In [28]:
# 用花萼的寬度做標準化
data['sepal width (cm)'] = (data['sepal width (cm)'] - data['sepal width (cm)'].mean())/\
                           (data['sepal width (cm)'].std())
data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target_names
0,0.222222,1.015602,1.4,0.2,0
1,0.166667,-0.131539,1.4,0.2,0
2,0.111111,0.327318,1.3,0.2,0
3,0.083333,0.097889,1.5,0.2,0
4,0.194444,1.24503,1.4,0.2,0
