Logging 初始化

In [28]:
import logging

# 設置 logging 配置
logging.basicConfig(
    filename='training_log.log',  # Log 文件名稱
    filemode='a',                 # 'a' 表示追加模式，'w' 表示覆蓋模式
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO             # 設置日誌級別
)

# 開始記錄
logging.info("Logging initialized. Starting the training process.")

## config yaml

In [29]:
# 安裝所需庫 (僅在初次運行時執行)
!pip install pandas matplotlib seaborn scikit-learn imbalanced-learn pyyaml shap

# 導入主要模組
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTE
import shap

# 加載配置文件
with open("config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)




### Support輔助函數-
1.資料處理 # Data Utilities from data_utils.py
2.評估指標 # Metrics Utilities from metrics.py

In [30]:
# Data Utilities from data_utils.py

def load_data(file_path):
    """從 CSV 文件加載數據"""
    return pd.read_csv(file_path)

def save_data(df, file_path):
    """將數據保存為 CSV 文件"""
    df.to_csv(file_path, index=False)
    print(f"Data saved to {file_path}")


In [31]:
# Metrics Utilities from metrics.py
from sklearn.metrics import f1_score, roc_auc_score

def calculate_f1(y_true, y_pred):
    """計算 Macro F1-Score"""
    return f1_score(y_true, y_pred, average='macro')

def calculate_auc(y_true, y_proba):
    """計算 AUROC 分數"""
    return roc_auc_score(y_true, y_proba[:, 1])

## Reading Data!

In [32]:
# 加載數據
train_X_path = config['data_paths']['train_X']
train_y_path = config['data_paths']['train_y']
test_X_path = config['data_paths']['test_X']

# 使用 data_utils 中的 load_data 函數加載數據
train_data = load_data(config['data_paths']['train_X'])
train_target = load_data(config['data_paths']['train_y'])
test_data = load_data(config['data_paths']['test_X'])

# 檢查數據形狀
print("Training data shape:", train_data.shape)
print("Test data shape:", test_data.shape)
print("Target shape:", train_target.shape)

# logging
try:
    train_data = load_data(config['data_paths']['train_X'])
    train_target = load_data(config['data_paths']['train_y'])
    test_data = load_data(config['data_paths']['test_X'])
    logging.info(f"Data loaded successfully. Train shape: {train_data.shape}, Test shape: {test_data.shape}")
except Exception as e:
    logging.error(f"Error loading data: {e}")



Training data shape: (44939, 83)
Test data shape: (19260, 83)
Target shape: (44939, 1)


## Data check!

In [33]:
# 基本數據檢查
print("Data types:\n", train_data.dtypes)
print("First few rows:\n", train_data.head())
print("Missing values:\n", train_data.isnull().sum())

Data types:
 encounter_id                     int64
patient_id                       int64
hospital_id                      int64
age                            float64
bmi                            float64
                                ...   
leukemia                       float64
lymphoma                       float64
solid_tumor_with_metastasis    float64
apache_3j_bodysystem            object
apache_2_bodysystem             object
Length: 83, dtype: object
First few rows:
    encounter_id  patient_id  hospital_id   age        bmi  elective_surgery  \
0        126956      125763           26  75.0  23.147277                 0   
1         18184       25399           54  42.0  35.071807                 1   
2         51597        7974           81  39.0        NaN                 0   
3         40078       79625          161  62.0  42.070672                 0   
4        130673       88261           29  82.0        NaN                 0   

          ethnicity gender  height      

## data analysis（EDA）

In [34]:
# 基本統計
def describe_data(data):
    print("Statistical Summary:\n")
    return data.describe()

# 缺失值檢查
def check_missing_values(data):
    missing_data = data.isnull().sum()
    missing_percentage = (missing_data / len(data)) * 100
    return missing_data[missing_data > 0], missing_percentage[missing_percentage > 0]

# 執行數據分析
print("Description of training data:")
print(describe_data(train_data))  # 統一打印描述性統計
print("\nMissing Values Analysis:")
missing_data, missing_percentage = check_missing_values(train_data)
print("Missing Values:\n", missing_data)
print("\nMissing Percentage:\n", missing_percentage)


Description of training data:
Statistical Summary:

        encounter_id     patient_id   hospital_id           age           bmi  \
count   44939.000000   44939.000000  44939.000000  42874.000000  43288.000000   
mean    65642.668751   65491.091346    105.732460     62.318701     29.195878   
std     37736.732171   37696.448956     62.901289     16.804263      8.263748   
min         1.000000       1.000000      2.000000     16.000000     14.844926   
25%     33059.000000   32732.000000     47.000000     52.000000     23.638493   
50%     65732.000000   65467.000000    109.000000     65.000000     27.680158   
75%     98239.500000   98089.500000    161.000000     75.000000     32.962064   
max    131049.000000  131049.000000    204.000000     89.000000     67.814990   

       elective_surgery        height        icu_id  pre_icu_los_days  \
count      44939.000000  44308.000000  44939.000000      44939.000000   
mean           0.182314    169.565517    509.502659          0.836469   

### 數值特徵的成對散佈圖（Pair Plot）

In [35]:
def plot_pairplot(data, numerical_columns):
    sns.pairplot(data[numerical_columns].dropna())
    plt.show()

plot_pairplot(train_data, config['numerical_columns'])


KeyError: 'numerical_columns'

### 數值特徵的箱線圖

In [None]:
def plot_boxplots(data, numerical_columns):
    for column in numerical_columns:
        plt.figure(figsize=(8, 6))
        sns.boxplot(x=data[column].dropna())
        plt.title(f'Boxplot of {column}')
        plt.show()

plot_boxplots(train_data, config['numerical_columns'])


### 數值特徵的直方圖

In [None]:
def plot_numeric_distributions(data, numerical_columns):
    for column in numerical_columns:
        plt.figure(figsize=(8, 6))
        sns.histplot(data[column].dropna(), kde=True)
        plt.title(f'Distribution of {column}')
        plt.show()

# 假設數值特徵名稱為 'numerical_columns'
plot_numeric_distributions(train_data, config['numerical_columns'])


### 目標變數的分佈圖

In [None]:
def plot_target_distribution(data, target_column):
    plt.figure(figsize=(8, 6))
    sns.countplot(x=data[target_column])
    plt.title(f'Distribution of target variable: {target_column}')
    plt.show()

# 假設目標變數名稱為 'target_column'
plot_target_distribution(train_data, 'target_column')


### 類別特徵的頻率圖

In [None]:
def plot_categorical_counts(data, categorical_columns):
    for column in categorical_columns:
        plt.figure(figsize=(8, 6))
        sns.countplot(x=data[column])
        plt.title(f'Count of each category in {column}')
        plt.show()

# 假設類別特徵名稱為 'categorical_columns'
categorical_columns = ['column1', 'column2']  # 替換為你的類別特徵名稱
plot_categorical_counts(train_data, categorical_columns)


## Data cleaning

## missing value?

In [41]:
# 缺失值檢查
missing_data = train_data.isnull().sum()
missing_percentage = (missing_data / len(train_data)) * 100
print(f"總共有:{len(train_data)}位病人")
print("Missing Values:\n", missing_data[missing_data > 0])
print("Missing Percentage:\n", missing_percentage[missing_percentage > 0])

'''# 開啟一個 .txt 檔案並寫入缺失值數量和缺失百分比
with open('missing_data.txt', 'w') as f:
    f.write("Missing Values:\n")
    f.write(missing_data.to_string())  # 寫入缺失值數量
    f.write("\n\nMissing Percentage:\n")
    f.write(missing_percentage.to_string())  # 寫入缺失百分比

missing_summary = pd.DataFrame({
    'Missing Values': missing_data,
    'Missing Percentage': missing_percentage
})
missing_summary.to_csv('missing_data.csv', index=True)'''

總共有:44939位病人
Missing Values:
 age                            2065
bmi                            1651
ethnicity                       700
gender                           13
height                          631
                               ... 
leukemia                        346
lymphoma                        346
solid_tumor_with_metastasis     346
apache_3j_bodysystem            807
apache_2_bodysystem             807
Length: 74, dtype: int64
Missing Percentage:
 age                            4.595118
bmi                            3.673869
ethnicity                      1.557667
gender                         0.028928
height                         1.404126
                                 ...   
leukemia                       0.769933
lymphoma                       0.769933
solid_tumor_with_metastasis    0.769933
apache_3j_bodysystem           1.795768
apache_2_bodysystem            1.795768
Length: 74, dtype: float64


'# 開啟一個 .txt 檔案並寫入缺失值數量和缺失百分比\nwith open(\'missing_data.txt\', \'w\') as f:\n    f.write("Missing Values:\n")\n    f.write(missing_data.to_string())  # 寫入缺失值數量\n    f.write("\n\nMissing Percentage:\n")\n    f.write(missing_percentage.to_string())  # 寫入缺失百分比\n\nmissing_summary = pd.DataFrame({\n    \'Missing Values\': missing_data,\n    \'Missing Percentage\': missing_percentage\n})\nmissing_summary.to_csv(\'missing_data.csv\', index=True)'

### 刪除 threshold under 0.5

In [None]:
# 刪除缺失值
def drop_missing_values(data, threshold=0.5):
    return data.loc[:, data.isnull().mean() < threshold]

# 執行清理
train_data = drop_missing_values(train_data, threshold=config.get('missing_value_threshold', 0.5))


### 填補(KNN插補)

In [None]:
from sklearn.impute import KNNImputer

# 檢查是否有缺失值的列
missing_cols = train_data.columns[train_data.isnull().any()]
if len(missing_cols) > 0:
    # 初始化 KNNImputer 並使用配置中的 n_neighbors
    knn_imputer = KNNImputer(n_neighbors=config['model_params']['n_neighbors'])
    
    # 進行 KNN 插補
    train_data[missing_cols] = knn_imputer.fit_transform(train_data[missing_cols])
    print(f"KNN imputation completed for columns: {list(missing_cols)} with n_neighbors={config['model_params']['n_neighbors']}")
else:
    print("No missing values found for KNN imputation.")



## data transformation

### 類別特徵編碼

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import pandas as pd

# 根據 config.yaml 中的 categorical_encoding 配置選擇編碼方法
if config['model_params']['categorical_encoding'] == 'onehot':
    encoder = OneHotEncoder(sparse=False)
    encoded_df = pd.DataFrame(encoder.fit_transform(train_data[['categorical_column']]),
                              columns=encoder.get_feature_names_out(['categorical_column']))
    train_data = train_data.drop('categorical_column', axis=1).join(encoded_df)
elif config['model_params']['categorical_encoding'] == 'label':
    encoder = LabelEncoder()
    train_data['categorical_column'] = encoder.fit_transform(train_data['categorical_column'])
else:
    raise ValueError("Unsupported categorical encoding type in config.yaml")

print(f"Categorical encoding applied using {config['model_params']['categorical_encoding']} encoding.")


### 數值特徵標準化

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# 根據 config.yaml 中的 scaler 配置選擇縮放方法
if config['model_params']['scaler'] == 'standard':
    scaler = StandardScaler()
elif config['model_params']['scaler'] == 'minmax':
    scaler = MinMaxScaler()
else:
    raise ValueError("Unsupported scaler type in config.yaml")

# 應用選擇的縮放方法
train_data[config['numerical_columns']] = scaler.fit_transform(train_data[config['numerical_columns']])
print(f"Data scaled using {config['model_params']['scaler']} scaler.")



## Data Imbalance 處理

In [None]:
# 平衡數據
from imblearn.over_sampling import SMOTE

# 使用配置中的 smote_strategy 和 random_state 參數
smote = SMOTE(sampling_strategy=config['model_params']['smote_strategy'], random_state=config['model_params']['random_state'])
train_data_res, train_target_res = smote.fit_resample(train_data, train_target)
print(f"Data imbalance handled with SMOTE strategy: {config['model_params']['smote_strategy']}")


##  data visualization

In [None]:
def plot_distribution(data, column):
    plt.figure(figsize=(8, 6))
    sns.histplot(data[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.show()

def plot_boxplot(data, column):
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=data[column])
    plt.title(f'Boxplot of {column}')
    plt.show()

for col in config['numerical_columns']:
    plot_distribution(train_data, col)
    plot_boxplot(train_data, col)

### 相關性矩陣分析(Correlation Matrix Analysis)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# 繪製相關矩陣的函數
def plot_correlation_matrix(data):
    # 選擇數值類型的欄位
    numeric_data = data.select_dtypes(include=['float64', 'int64'])
    
    # 繪製相關矩陣
    plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()

# 在數據轉換後執行相關性矩陣分析
plot_correlation_matrix(train_data)

## feature engineering 特徵工程

In [None]:
from sklearn.decomposition import PCA
import pandas as pd

# PCA 降維的綜合處理函數
def apply_pca(data, n_components):
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(data)
    pca_df = pd.DataFrame(data=principal_components, columns=[f'PC{i+1}' for i in range(n_components)])
    return pd.concat([data, pca_df], axis=1)

# 使用配置中的 pca_components 參數執行 PCA
train_data = apply_pca(train_data, n_components=config['model_params']['pca_components'])
print(f"PCA applied with {config['model_params']['pca_components']} components.")


## Model trainning

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 模型初始化(隨機森林模型)
model = RandomForestClassifier(random_state=config['model_params']['random_state'])

# 訓練模型
model.fit(train_data_res, train_target_res)
print("Model training completed.")

#訓練模型的logging
try:
    model.fit(train_data_res, train_target_res)
    logging.info("Model training completed successfully.")
except Exception as e:
    logging.error(f"Error during model training: {e}")

## cross-valid

In [None]:
# 交叉驗證
from sklearn.model_selection import cross_val_score
f1_scores = cross_val_score(model, train_data_res, train_target_res, cv=5, scoring='f1_macro')
print("Cross-validated Macro F1 Score:", f1_scores.mean())

## feature importance

In [None]:
# 特徵重要性分析
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(train_data_res)

# 顯示前20個重要特徵
shap.summary_plot(shap_values[1], train_data_res, plot_type="bar", max_display=20)

## model evaluation

In [None]:
from sklearn.metrics import f1_score, roc_auc_score
test_predictions = model.predict(test_data)

# 假設有測試集標籤，進行 AUROC 和 F1 計算
# test_true_labels = pd.read_csv("path_to_true_labels.csv")
# f1 = f1_score(test_true_labels, test_predictions, average='macro')
# auroc = roc_auc_score(test_true_labels, test_predictions)
# print("Test F1 Score:", f1)
# print("Test AUROC:", auroc)
# 假設 test_predictions 是模型在測試集上的預測
# 假設 test_proba 是模型預測的概率 (e.g., test_proba = model.predict_proba(test_data))

# 示例：計算 F1-Score 和 AUROC
f1 = calculate_f1(train_target, test_predictions)
auroc = calculate_auc(train_target, test_proba)
print("Test F1 Score:", f1)
print("Test AUROC:", auroc)


# evaluation的logging
try:
    f1 = calculate_f1(train_target, test_predictions)
    auroc = calculate_auc(train_target, test_proba)
    logging.info(f"Evaluation results - F1 Score: {f1}, AUROC: {auroc}")
except Exception as e:
    logging.error(f"Error during model evaluation: {e}")

#### Kaggle gogo!

In [None]:
import pandas as pd

# 提交文件生成函數
def create_submission_file(model, test_data, config):
    # 使用模型進行預測
    predictions = model.predict(test_data)
    
    # 創建提交 DataFrame，將索引作為 ID 和預測結果 'has_died' 放入
    submission = pd.DataFrame({'Id': test_data.index, 'has_died': predictions})
    
    # 使用配置中的 submission_file 路徑保存文件
    submission.to_csv(config['data_paths']['submission_file'], index=False)
    print(f"Submission file saved to {config['data_paths']['submission_file']}")

# 調用函數生成提交文件
create_submission_file(model, test_data, config)



### 模型監控

In [None]:
import time
from sklearn.metrics import accuracy_score

def monitor_model(new_data, true_labels):
    start_time = time.time()
    predictions = model.predict(new_data)
    end_time = time.time()
    
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='macro')
    latency = end_time - start_time

    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"Prediction Latency: {latency} seconds")
    
    return accuracy, f1, latency

# 假設 new_test_data 和 new_test_labels 為新數據
# monitor_model(new_test_data, new_test_labels)


初始化和配置加載的 Logging


try:
    with open("config/config.yaml", "r") as f:
        config = yaml.safe_load(f)
    logging.info("Configuration loaded successfully.")
    logging.info(f"Configurations: {config}")
except Exception as e:
    logging.error(f"Error loading configuration: {e}")




缺失值處理的 Logging
try:
    train_data = drop_missing_values(train_data, threshold=0.5)
    logging.info("Missing values handled successfully. Remaining columns: {train_data.columns}")
except Exception as e:
    logging.error(f"Error handling missing values: {e}")




特徵轉換和標準化的 Logging
try:
    train_data = encode_categorical(train_data, 'categorical_column')
    train_data = standardize_data(train_data, config['numerical_columns'])
    logging.info("Data transformation and standardization completed.")
except Exception as e:
    logging.error(f"Error during data transformation: {e}")




處理數據不平衡的 Logging
try:
    smote = SMOTE(random_state=config['model_params']['random_state'])
    train_data_res, train_target_res = smote.fit_resample(train_data, train_target)
    logging.info("Data imbalance handled successfully. Resampled data shape: {}".format(train_data_res.shape))
except Exception as e:
    logging.error(f"Error during data imbalance handling: {e}")




特徵工程的 Logging
try:
    train_data = apply_pca(train_data, n_components=2)
    logging.info("PCA applied successfully. Data shape after PCA: {}".format(train_data.shape))
except Exception as e:
    logging.error(f"Error during PCA transformation: {e}")




生成 Kaggle 提交文件的 Logging
try:
    save_data(submission, "testing_result.csv")
    logging.info("Kaggle submission file created successfully.")
except Exception as e:
    logging.error(f"Error creating Kaggle submission file: {e}")




模型監控的 Logging
try:
    accuracy, f1, latency = monitor_model(new_test_data, new_test_labels)
    logging.info(f"Monitoring results - Accuracy: {accuracy}, F1 Score: {f1}, Latency: {latency} seconds")
except Exception as e:
    logging.error(f"Error during model monitoring: {e}")
