<a href="https://colab.research.google.com/github/wenyenyeh/HW2/blob/main/HW2%E9%90%B5%E9%81%94%E5%B0%BC%E8%99%9F%E7%94%9F%E5%AD%98%E9%A0%90%E6%B8%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install optuna
!pip install xgboost




In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, chi2
from sklearn.metrics import confusion_matrix, accuracy_score
import optuna
import xgboost as xgb

# 載入 Titanic 資料
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
gender_submission = pd.read_csv('gender_submission.csv')
# 步驟 1: 數據探索
print(train.info())
print(train.describe())
print(train.isnull().sum())

# 步驟 2: 特徵工程
# 填補遺漏值
train['Age'].fillna(train['Age'].median(), inplace=True)
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)

# 編碼類別變數
train = pd.get_dummies(train, columns=['Sex', 'Embarked'], drop_first=True)

# 移除不必要的欄位
train.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

# 分割特徵和目標變數
X = train.drop(['Survived', 'PassengerId'], axis=1)
y = train['Survived']

# 步驟 3: 使用 RFE, SelectKBest 和 Optuna 進行特徵選擇
# RFE
rf = RandomForestClassifier()
rfe = RFE(rf, n_features_to_select=5)
X_rfe = rfe.fit_transform(X, y)

# SelectKBest
skb = SelectKBest(chi2, k=5)
X_skb = skb.fit_transform(X, y)

# 使用 Optuna 進行超參數調整
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 500)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
    model.fit(X, y)
    return model.score(X, y)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
print(f"最佳參數: {best_params}")

# 步驟 4: 模型訓練與評估
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(**best_params)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
print("混淆矩陣:")
print(confusion_matrix(y_val, y_pred))
print(f"準確率: {accuracy_score(y_val, y_pred)}")

# 步驟 5: 測試集預測
test['Age'].fillna(test['Age'].median(), inplace=True)
test['Embarked'].fillna(test['Embarked'].mode()[0], inplace=True)
test = pd.get_dummies(test, columns=['Sex', 'Embarked'], drop_first=True)
test.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

X_test = test.drop(['PassengerId'], axis=1)

test['Survived'] = model.predict(X_test)
test[['PassengerId', 'Survived']].to_csv('submission_rf.csv', index=False)

# 步驟 6: 使用 XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_val)

print("XGBoost 混淆矩陣:")
print(confusion_matrix(y_val, y_pred_xgb))
print(f"XGBoost 準確率: {accuracy_score(y_val, y_pred_xgb)}")

test['Survived'] = xgb_model.predict(X_test)
test[['PassengerId', 'Survived']].to_csv('submission_xgb.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.48659

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Age'].fillna(train['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)
[I 2025-01-05 08:47:55,466] A new study created in memory with name: no-name-99c3f291-8ba5-49f1-a515-edf1b7

最佳參數: {'n_estimators': 413, 'max_depth': 27}
混淆矩陣:
[[90 15]
 [17 57]]
準確率: 0.8212290502793296
XGBoost 混淆矩陣:
[[91 14]
 [18 56]]
XGBoost 準確率: 0.8212290502793296


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Age'].fillna(test['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Embarked'].fillna(test['Embarked'].mode()[0], inplace=True)
Parameters: { "use_label_encoder" } are not used.

