#### 特徵篩選

In [71]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import RandomUnderSampler

In [101]:
# `sex` selects which pair of result files is loaded and is reused later in
# the notebook to name the feature-importance output files.
sex = 'Male'
control = pd.read_csv(f"../result/Control_{sex}.csv")
training = pd.read_csv(f"../result/Training_{sex}.csv")

# Show (rows, columns) for the control frame first, then the training frame.
for frame in (control, training):
    print(frame.shape)

(2774, 158)
(1316, 158)


In [102]:
# Stack the control and training frames (control rows first), then drop the
# TWB2_ID, I_32 and Label columns so they are not used as predictors.
features = pd.concat([control, training], ignore_index=True)
features = features.drop(columns=['TWB2_ID', 'I_32', 'Label'])

# Skip the remaining leading column (the original note describes it as an ID).
X = features.iloc[:, 1:]
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
X.info()

feature_names = X.columns
# Convert to a plain list of per-sample rows in a single vectorised call
# instead of indexing the frame once per row with .iloc.
X = X.values.tolist()
print('樣本數: ',len(X))

# Labels are read from the last column of each source frame
# (presumably the `Label` column dropped above -- verify against the CSV schema).
# Concatenation order matches `features`, so y[i] belongs to X[i].
control_label = control.iloc[:, -1]
training_label = training.iloc[:, -1]
y = list(pd.concat([control_label, training_label], ignore_index=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4090 entries, 0 to 4089
Columns: 154 entries, AGE to MICROALB
dtypes: float64(154)
memory usage: 4.8 MB
None
樣本數:  4090


In [103]:
from collections import Counter

# Hold out 10% of the samples for validation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified, so the class ratio of the two
# parts can drift from the full data set -- consider `stratify=y`.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Per-class sample counts of each part.
train_class_distribution = Counter(y_train)
val_class_distribution = Counter(y_val)

print(f"訓練集樣本數量： {len(X_train)}")
print(f"測試集樣本數量： {len(X_val)}")
print("訓練集中各類別樣本數量：", train_class_distribution, sep="\n")
print("測試集中各類別樣本數量：", val_class_distribution, sep="\n")

訓練集樣本數量： 3681
測試集樣本數量： 409
訓練集中各類別樣本數量：
Counter({0: 2486, 1: 1195})
測試集中各類別樣本數量：
Counter({0: 288, 1: 121})


In [104]:
# --- Training split -------------------------------------------------------
# Oversample the minority class with plain SMOTE.
smote_train_X, smote_train_Y = SMOTE(random_state=42).fit_resample(X_train, y_train)
print('SMOTE train set sampling:', Counter(smote_train_Y), sep='\n')

# Oversample the minority class with Borderline-SMOTE (borderline-2 variant).
bsmote_train_X, bsmote_train_Y = BorderlineSMOTE(
    random_state=42, kind='borderline-2'
).fit_resample(X_train, y_train)
print('BSMOTE train set sampling:', Counter(bsmote_train_Y), sep='\n')

# --- Validation split -----------------------------------------------------
# NOTE(review): the SMOTE / Borderline-SMOTE variants below add synthetic
# samples to the evaluation data, so scores measured on them are not scores
# on real observations only. The under-sampled set contains real rows only.

# Balance by randomly discarding majority-class rows.
bal_x_test, bal_y_test = RandomUnderSampler(random_state=42).fit_resample(X_val, y_val)
print('Random sampling:', Counter(bal_y_test), sep='\n')

# Balance with plain SMOTE.
smote_test_X, smote_test_Y = SMOTE(random_state=42).fit_resample(X_val, y_val)
print('SMOTE test set sampling:', Counter(smote_test_Y), sep='\n')

# Balance with Borderline-SMOTE.
# NOTE(review): `kind` is left at its default here, whereas the training split
# uses 'borderline-2' -- confirm the difference is intended.
bsmote_test_X, bsmote_test_Y = BorderlineSMOTE(random_state=42).fit_resample(X_val, y_val)
print('BSMOTE test set sampling:', Counter(bsmote_test_Y), sep='\n')

SMOTE train set sampling:
Counter({0: 2486, 1: 2486})
BSMOTE train set sampling:
Counter({0: 2486, 1: 2486})
Random sampling:
Counter({0: 121, 1: 121})
SMOTE test set sampling:
Counter({0: 288, 1: 288})
BSMOTE test set sampling:
Counter({0: 288, 1: 288})


In [78]:
# Model selection cell: exactly one estimator / param_grid pair is active at a
# time; the alternatives are kept commented out and are swapped in by hand.
# Each grid lists the values used for the Male data and, commented out, the
# values used for the Female data.

# --- XGBoost (inactive) ---
# model = xgb.XGBClassifier(random_state=42)
# param_grid = {
#     # Male
#     'learning_rate': [0.01,0.05, 0.1],
#     'n_estimators': [10,50,100],
#     'max_depth': [2,3, 4],
#     'subsample': [0.5,0.6,0.7],
#     'colsample_bytree': [0.2,0.3,0.4],
#     'min_child_weight': [20,30,40],
#     # Female
#     # 'learning_rate': [0.001, 0.01, 0.1],
#     # 'n_estimators': [100,200,300],
#     # 'max_depth': [2, 3, 4],
#     # 'subsample': [0.6, 0.7, 0.8],
#     # 'colsample_bytree': [0.4, 0.5, 0.6],
# }

# --- Decision tree (inactive) ---
# from sklearn.tree import DecisionTreeClassifier
# model=DecisionTreeClassifier(random_state=42)
# param_grid = {
#     # Male
#     'max_depth': [3,4,5],
#     'min_samples_split': [5, 10, 20,30],
#     'min_samples_leaf': [5, 10, 20],
#     'max_features': [10,20,30]
#     # Female
#     # 'max_depth': [6,7,8,9,10],
#     # 'max_features': [30,35,40]
# }

# --- Random forest (active) ---
from sklearn import ensemble
model = ensemble.RandomForestClassifier(random_state=42)
param_grid = {
    # Male
    'n_estimators': [100,150,200],
    'min_samples_split': [50,60,70],
    'min_samples_leaf': [50,60,70],

    # Female
    # 'n_estimators': [100,150,200],
    # 'max_depth': [None, 5, 10, 20],
    # 'min_samples_split': [30,40,50],
    # 'min_samples_leaf': [30,40,50],
    # 'max_features': [5,10,20]
}
# --- SVM (inactive) ---
# from sklearn import svm
# model = svm.SVC(random_state=42)
# param_grid = {
#     # Male
#     'kernel': ['rbf', 'poly', 'sigmoid']
#     # Female
# }

# Exhaustive 5-fold grid search over `param_grid`, using all CPU cores.
# No `scoring` is given, so the estimator's own score method is used
# (mean accuracy for scikit-learn classifiers).
# NOTE(review): the search is fitted on the SMOTE-resampled training set, so
# the CV folds are cut from data that already contains synthetic samples and
# the reported score may be optimistic. Resampling inside each fold (e.g. an
# imblearn Pipeline wrapping SMOTE + model) would avoid that.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(smote_train_X, smote_train_Y)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
# Estimator with the best parameter combination; later cells read it as `best_model`.
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'min_samples_leaf': 50, 'min_samples_split': 50, 'n_estimators': 150}
Best Score: 0.7524610982477781


#### SVM查看特徵重要性 (permutation importance) 

In [59]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error

# Root-mean-squared error between predicted and true class labels.
# The root is taken with np.sqrt because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on old and new versions alike.

# train (SMOTE-resampled training set)
y_pred = best_model.predict(smote_train_X)
err = float(np.sqrt(mean_squared_error(smote_train_Y, y_pred)))
print("Model Training Error:", err)

# test (Borderline-SMOTE-resampled validation set)
y_pred = best_model.predict(bsmote_test_X)
err = float(np.sqrt(mean_squared_error(bsmote_test_Y, y_pred)))
print("Model Testing Error:", err)

# Permutation importance: mean drop in the model's score over 10 shuffles of
# each feature column, measured on the resampled training data.
result = permutation_importance(best_model, smote_train_X,smote_train_Y, n_repeats=10, random_state=42, n_jobs=-1)
importance = result.importances_mean

# Keep only features whose mean importance is strictly positive.
positive_scores = [v for v in importance if v > 0]
positive_feature_names = [feature_names[i] for i, v in enumerate(importance) if v > 0]

# Rank from most to least important (ties fall back to the feature name).
top_features = sorted(zip(positive_scores, positive_feature_names), reverse=True)
top_scores = [score for score, _ in top_features]
top_feature_names = [name for _, name in top_features]

# Build the result table once, with named columns, and reuse it for both the
# printed report and the CSV export.
importance_table = pd.DataFrame({'feature': top_feature_names, 'importance': top_scores})

print("feature importances:")
print(importance_table)
# NOTE(review): the output name is hard-coded to `_svm.csv` regardless of which
# estimator `best_model` currently holds -- make sure the SVM configuration
# was the one selected before running this cell.
importance_table.to_csv(f'../result/feature_importance/{sex}_svm.csv', index=False)

Model Training Error: 0.5685155898168872
Model Testing Error: 0.5936586186989586
feature importances:
                              0         1
0                          I_18  0.029445
1                           AGE  0.020696
2                          I_11  0.013576
3                            TG  0.003942
4                          I_30  0.003721
5                          I_23  0.003681
6                           I_9  0.003500
7               FASTING_GLUCOSE  0.003278
8                           I_3  0.002816
9                           I_4  0.002615
10                         I_34  0.001931
11                  BODY_WEIGHT  0.001911
12                     MICROALB  0.001891
13                     GAMMA_GT  0.001529
14          JOB_LGST_OCCUPATION  0.001508
15                          I_7  0.001488
16                         SGPT  0.001307
17                     PLATELET  0.001287
18                        LDL_C  0.001187
19                  BODY_HEIGHT  0.001146
20              

#### XGBoost、Decision Tree、Random Forest查看特徵重要性 (feature_importances_) 

通常用於快速查看模型對各個特徵的相對重要性，但不一定會反映特徵在模型預測中的實際效果。

In [79]:
from sklearn.metrics import accuracy_score

# Fit the selected estimator on the SMOTE-resampled training set.
best_model.fit(smote_train_X, smote_train_Y)

# Predict on the SMOTE-balanced validation set and on the training set itself.
y_pred = best_model.predict(smote_test_X)
y_pred_train = best_model.predict(smote_train_X)

# Accuracy: validation first, then training. The predictions are rounded
# before scoring, exactly as in the original cell.
print(accuracy_score(smote_test_Y, y_pred.round()))
print(accuracy_score(smote_train_Y, y_pred_train.round()))

# Pair each predictor name with the importance reported by the fitted model
# and order the table from most to least important.
feature_importances = best_model.feature_importances_
feature_names = features.columns[1:]
df_feature_importances = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importances}
).sort_values('importance', ascending=False)

print("feature importances:")
print(df_feature_importances)
# NOTE(review): the file name is fixed to `_rf.csv`; it has to be edited by
# hand when a different estimator is selected in the grid-search cell.
df_feature_importances.to_csv(f'../result/feature_importance/{sex}_rf.csv', index=False)

0.7760416666666666
0.8173773129525342
feature importances:
                feature  importance
86                 I_18    0.091910
79                 I_11    0.081825
0                   AGE    0.043578
91                 I_23    0.043116
84                 I_16    0.031879
..                  ...         ...
68                  I_0    0.000000
66          COLOR_BLIND    0.000000
65                BLIND    0.000000
63  RENTINAL_DETACHMENT    0.000000
17       CARDIOMYOPATHY    0.000000

[154 rows x 2 columns]


#### 正規化重要度

In [90]:
# Load the per-model feature-importance tables for the current `sex`.
# Each file is expected to carry an `importance` column, which is rescaled below.
# NOTE(review): only the `_svm` and `_rf` files are written by cells visible in
# this notebook; the `_dt` and `_xgb` files presumably come from earlier runs
# with the corresponding model selected -- confirm they exist before running.
dt_imp=pd.read_csv(f'../result/feature_importance/{sex}_dt.csv')
svm_imp=pd.read_csv(f'../result/feature_importance/{sex}_svm.csv')
xgb_imp=pd.read_csv(f'../result/feature_importance/{sex}_xgb.csv')
rf_imp=pd.read_csv(f'../result/feature_importance/{sex}_rf.csv')

# 正規化重要度
from sklearn.preprocessing import MinMaxScaler

def MinMax_normalize(data):
    """Rescale the ``importance`` column of ``data`` with min-max scaling, in place.

    Args:
        data: DataFrame with a numeric ``importance`` column. The column is
            overwritten with the values produced by ``MinMaxScaler`` (default
            feature range, i.e. 0 to 1).

    Returns:
        None. ``data`` is modified in place.
    """
    # MinMaxScaler cannot be fitted on zero rows; an empty table needs no scaling.
    if data.empty:
        return

    scaler = MinMaxScaler()
    data_norm = scaler.fit_transform(data[['importance']])
    # Assign the raw array so values are written back by position. Wrapping it
    # in a pd.Series gave it a fresh 0..n-1 index, and pandas then aligned on
    # labels -- producing NaN for any frame whose index is not the default
    # RangeIndex (e.g. after filtering or sorting).
    data['importance'] = data_norm.ravel()

# Rescale every importance table in place, in the same order as before.
for importance_frame in (dt_imp, svm_imp, xgb_imp, rf_imp):
    MinMax_normalize(importance_frame)

# Write the rescaled tables to the `normalized` sub-folder, one file per model.
dt_imp.to_csv(f'../result/feature_importance/normalized/{sex}_dt.csv', index=False)
svm_imp.to_csv(f'../result/feature_importance/normalized/{sex}_svm.csv', index=False)
xgb_imp.to_csv(f'../result/feature_importance/normalized/{sex}_xgb.csv', index=False)
rf_imp.to_csv(f'../result/feature_importance/normalized/{sex}_rf.csv', index=False)