該工具各種矩陣運算，缺失值填補方法的套件，如：KNN、IterativeImputer等。

In [None]:
pip -q install fancyimpute

安裝相關套件

In [None]:
import pandas as pd                   # 數據分析套件
import numpy as np                   # 矩陣數學運算套件
from sklearn.linear_model import LinearRegression    # 線性迴歸套件(擅長處理數值型資料)
from sklearn.linear_model import LogisticRegression   # 邏輯迴歸套件(擅長處理變數型資料)
from sklearn.ensemble import RandomForestClassifier   # 隨機森林分類器套件
from sklearn.ensemble import RandomForestRegressor   # 隨機森林迴歸分析套件
from sklearn.model_selection import train_test_split  # 分割資料集的套件
from sklearn.preprocessing import LabelEncoder     # 類別編碼套件
from sklearn.impute import SimpleImputer        # 使用簡單策略完成缺失值的套件
from fancyimpute import IterativeImputer        # 以循環方式，將具有缺失值的每個特徵建模，為其他特徵的函數來估算缺失值的策略。
from sklearn.metrics import accuracy_score       # 可以衡量分類模型整體性能的套件

# 讀取 Titanic 數據集
df = pd.read_csv("titanic.csv")

In [None]:
# 觀察那些特徵具有缺失值
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


看一下數據資料(看前15行)

In [None]:
print(df.head(15))

    PassengerId  Survived  Pclass  \
0             1         0       3   
1             2         1       1   
2             3         1       3   
3             4         1       1   
4             5         0       3   
5             6         0       3   
6             7         0       1   
7             8         0       3   
8             9         1       3   
9            10         1       2   
10           11         1       3   
11           12         1       1   
12           13         0       3   
13           14         0       3   
14           15         0       3   

                                                 Name     Sex   Age  SibSp  \
0                             Braund, Mr. Owen Harris    male  22.0      1   
1   Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                              Heikkinen, Miss. Laina  female  26.0      0   
3        Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                

先進行Label Encoding

In [None]:
labelencoder = LabelEncoder()

# 對'Sex'欄位進行編碼
df['Sex'] = labelencoder.fit_transform(df['Sex'].values)

# 對'Embarked'欄位進行編碼
df['Embarked'] = labelencoder.fit_transform(df['Embarked'].values)

確認數據資料(看前15行)

In [None]:
print(df.head(15))

    PassengerId  Survived  Pclass  \
0             1         0       3   
1             2         1       1   
2             3         1       3   
3             4         1       1   
4             5         0       3   
5             6         0       3   
6             7         0       1   
7             8         0       3   
8             9         1       3   
9            10         1       2   
10           11         1       3   
11           12         1       1   
12           13         0       3   
13           14         0       3   
14           15         0       3   

                                                 Name  Sex   Age  SibSp  \
0                             Braund, Mr. Owen Harris    1  22.0      1   
1   Cumings, Mrs. John Bradley (Florence Briggs Th...    0  38.0      1   
2                              Heikkinen, Miss. Laina    0  26.0      0   
3        Futrelle, Mrs. Jacques Heath (Lily May Peel)    0  35.0      1   
4                            All

In [None]:
# 選擇與生存率（Survived）相關的數值型變數
# 由於主要是去分析跟存活相關的機率，而PassengerId不具特殊意義，Name、Ticket(船票號碼)、Cabin(船艙號碼)相對跟存活關聯性較少，因此此處先捨去這四個特徵。
df = df[['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked']]

In [None]:
# 了解現在的處理後的檔案大小
print(df.shape)

(891, 8)


## Deletion（刪除法）

In [None]:
# 刪除缺失值
df_deleted = df.dropna()

# 了解現在的處理後的檔案大小
print('由此可以看出dropna將sex的缺失值皆刪掉了，891-177=714')
print(df_deleted.shape)

# X, y特徵選擇
X = df_deleted[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked']]
y = df_deleted['Survived']

# 分割訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# 訓練邏輯回歸模型
model = LogisticRegression(max_iter=2000)
model.fit(X_train, y_train)

# 預測與評估
y_pred1 = model.predict(X_test)
accuracy_deletion = accuracy_score(y_test, y_pred1)
print("Accuracy (Deletion Method):", accuracy_deletion)

由此可以看出dropna將sex的缺失值皆刪掉了，891-177=714
(714, 8)
Accuracy (Deletion Method): 0.7972027972027972


## Mean Imputation（平均值補值）

In [None]:
# 使用平均值"mean"策略補值
imputer = SimpleImputer(strategy="mean")
df_imputed = df.copy()

# 對'Age'欄位進行補值
df_imputed[['Age']] = imputer.fit_transform(df[['Age']])

# 對'Embarked'欄位進行補值
imputer_mode = SimpleImputer(strategy="mean")
df_imputed[['Embarked']] = imputer_mode.fit_transform(df[['Embarked']])

# 了解現在的處理後的檔案大小
print('由此可以看出通過平均值補值的方式，資料數量都沒變。')
print(df_imputed.shape)

# X, y特徵選擇
X = df_imputed[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked']]
y = df_imputed['Survived']

# 分割訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# 預測與評估
model.fit(X_train, y_train)
y_pred2 = model.predict(X_test)
accuracy_imputation = accuracy_score(y_test, y_pred2)
print("Accuracy (Imputation Method):", accuracy_imputation)

由此可以看出通過平均值補值的方式，資料數量都沒變。
(891, 8)
Accuracy (Imputation Method): 0.7877094972067039


## Regression Imputation（線性迴歸補值）

In [None]:
# 找出並分類'Age'缺失的數據
df_missing = df[df['Age'].isnull()]
df_non_missing = df.dropna(subset=['Age'])

# X, y特徵選擇來進一步預測 Age
X_train_age = df_non_missing[['Sex', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked']]
y_train_age = df_non_missing['Age']

#　定義要進行補值的線性迴歸模型
regressor = LinearRegression()
regressor.fit(X_train_age, y_train_age)

# 預測缺失值、並且讓預測結果維持是小數第一位且範圍在0.1~90歲的資料型態
df_missing.loc[:, 'Age'] = np.clip(np.round(regressor.predict(df_missing[['Sex', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked']]), 1), 0.1, 90)

# 將填補的預測值合併拼接回原數據
df_filled = pd.concat([df_missing, df_non_missing])

# 了解現在的處理後的檔案大小
print('由此可以看出通過線性迴歸補值的方式，資料數量也都沒變。')
print(df_filled.shape)

# 了解現在的補植後的資料長怎樣
print(df_filled['Age'].head(10))

# X, y特徵選擇
X = df_filled[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked']]
y = df_filled['Survived']

# 分割訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# 預測與評估
model.fit(X_train, y_train)
y_pred3 = model.predict(X_test)
accuracy_regression = accuracy_score(y_test, y_pred3)
print("Accuracy (Regression Imputation):", accuracy_regression)


由此可以看出通過線性迴歸補值的方式，資料數量也都沒變。
(891, 8)
5     27.5
17    35.6
19    23.4
26    26.6
28    24.4
29    28.5
31    31.8
32    24.4
36    26.6
42    26.6
Name: Age, dtype: float64
Accuracy (Regression Imputation): 0.7821229050279329


## MICE（多重插補法）單獨對每個變數建模

In [None]:
# 使用 MICE 進行補值
imputer = IterativeImputer()
df_mice = df.copy()
df_mice[['Age']] = imputer.fit_transform(df[['Age']])
df_mice[['Embarked']] = imputer.fit_transform(df[['Embarked']])

print(df_mice.shape)

# X, y特徵選擇
X = df_mice[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked']]
y = df_mice['Survived']

# 分割訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# 預測與評估
model.fit(X_train, y_train)
y_pred4 = model.predict(X_test)
accuracy_MICE = accuracy_score(y_test, y_pred4)
print("Accuracy (MICE Method):", accuracy_MICE)

(891, 8)
Accuracy (MICE Method): 0.7877094972067039


## MICE（多重插補法）對所有資料變數建模

In [None]:
# 使用 MICE 進行多重插補
imputer = IterativeImputer()
df_imputed = imputer.fit_transform(df)

df = pd.DataFrame(df_imputed, columns=['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked'])
print(df.shape)

# X, y特徵選擇
X = df[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked']]
y = df['Survived']

# 分割訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# 預測與評估
model.fit(X_train, y_train)
y_pred4 = model.predict(X_test)
accuracy_MICE2 = accuracy_score(y_test, y_pred4)
print("Accuracy (MICE2 Method):", accuracy_MICE2)

(891, 8)
Accuracy (MICE2 Method): 0.8156424581005587


## 綜合版本

In [None]:
# 讀取數據集
df = pd.read_csv("titanic.csv")

# 刪除 Embarked 缺失值（因為只有 2 筆，相對影響不大。）
df = df.dropna(subset=["Embarked"])

# 因為 Age 和 Name 的相關性建立 Title 特徵，並根據 Title 來進行後續年齡補值
# 先將姓和名分開成兩部分(根據", ")並拿第二部分([1])，然後進一步分割(根據".")得到，並取第一部分得到職稱。
df["Title"] = df["Name"].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

# 計算每個 Title 的平均(mean)年齡並將結果轉換成字典
age_means = df.groupby("Title")["Age"].mean().to_dict()

# 將缺失的 Age 補上字典中對應 Title 的平均年齡
df["Age"] = df.apply(lambda row: age_means[row["Title"]] if pd.isnull(row["Age"]) else row["Age"], axis=1)

# 轉換 Sex 和 Embarked 為數值類別
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
df["Embarked"] = df["Embarked"].map({"C": 0, "Q": 1, "S": 2})

print(df.shape)

# 特徵選擇
X = df[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked']]
y = df['Survived']

# 分割訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# 訓練邏輯回歸模型
model = LogisticRegression(penalty='l1', solver='liblinear')
model.fit(X_train, y_train)

# 預測
y_pred = model.predict(X_test)
accuracy_combined1 = accuracy_score(y_test, y_pred)

print("Accuracy (Comprehensive Approach):", accuracy_combined1)


(889, 13)
Accuracy (Comprehensive Approach): 0.8202247191011236


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Title"] = df["Name"].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Age"] = df.apply(lambda row: age_means[row["Title"]] if pd.isnull(row["Age"]) else row["Age"], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Sex"] 

In [None]:
# 讀取數據集
df = pd.read_csv("titanic.csv")

# 刪除 Embarked 缺失值（因為只有 2 筆，相對影響不大。）
df = df.dropna(subset=["Embarked"])

# 因為 Age 和 Name 的相關性建立 Title 特徵，並根據 Title 來進行後續年齡補值
# 先將姓和名分開成兩部分(根據", ")並拿第二部分([1])，然後進一步分割(根據".")得到，並取第一部分得到職稱。
df["Title"] = df["Name"].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

# 計算每個 Title 的平均(mean)年齡並將結果轉換成字典
age_means = df.groupby("Title")["Age"].mean().to_dict()

# 將缺失的 Age 補上字典中對應 Title 的平均年齡
df["Age"] = df.apply(lambda row: age_means[row["Title"]] if pd.isnull(row["Age"]) else row["Age"], axis=1)

# 轉換 Sex 和 Embarked 為數值類別
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})
df["Embarked"] = df["Embarked"].map({"C": 0, "Q": 1, "S": 2})

print(df.shape)

# 特徵選擇
X = df[['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass', 'Embarked']]
y = df['Survived']

# 分割訓練與測試集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# 訓練模型
model = RandomForestClassifier(n_estimators=1500, random_state=12)
model.fit(X_train, y_train)

# 預測
y_pred = model.predict(X_test)
accuracy_combined = accuracy_score(y_test, y_pred)

print("Accuracy (Comprehensive Approach):", accuracy_combined)

(889, 13)
Accuracy (Comprehensive Approach): 0.848314606741573


In [None]:
print("Accuracy (Deletion Method):", accuracy_deletion)
print("Accuracy (Imputation Method):", accuracy_imputation)
print("Accuracy (Regression Imputation):", accuracy_regression)
print("Accuracy (MICE Method):", accuracy_MICE)
print("Accuracy (MICE2 Method):", accuracy_MICE2)
print("Accuracy (combined Approach):", accuracy_combined)

Accuracy (Deletion Method): 0.7972027972027972
Accuracy (Imputation Method): 0.7877094972067039
Accuracy (Regression Imputation): 0.7821229050279329
Accuracy (MICE Method): 0.7877094972067039
Accuracy (MICE2 Method): 0.8156424581005587
Accuracy (combined Approach): 0.848314606741573
