In [533]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [534]:
train_raw_df = pd.read_csv('../raw_data/train.csv')
train_raw_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [535]:
train_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [536]:
train_raw_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## 資料前處理

In [537]:
train_processed_df = train_raw_df.copy()

In [538]:
def fill_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    return df

In [539]:
def cast_data_type(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    df['Pclass'] = df['Pclass'].astype('category')
    df['Sex'] = df['Sex'].astype('category')
    df['SibSp'] = df['SibSp'].astype('float64')
    df['Parch'] = df['Parch'].astype('float64')
    df['Embarked'] = df['Embarked'].astype('category')

    return df

In [540]:
train_processed_df = fill_missing_values(train_processed_df)
train_processed_df = cast_data_type(train_processed_df)

In [541]:
train_processed_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S


In [542]:
train_processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    category
 3   Name         891 non-null    object  
 4   Sex          891 non-null    category
 5   Age          891 non-null    float64 
 6   SibSp        891 non-null    float64 
 7   Parch        891 non-null    float64 
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    object  
 11  Embarked     891 non-null    category
dtypes: category(3), float64(4), int64(2), object(3)
memory usage: 65.8+ KB


In [543]:
train_processed_df.describe()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,35.0,1.0,0.0,31.0
max,891.0,1.0,80.0,8.0,6.0,512.3292


## 資料探索

## 特徵工程

In [544]:
def create_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    df['FamilySize'] = df['SibSp'] + df['Parch']

    return df

In [545]:
def drop_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    drop_columns = ['Name', 'Ticket', 'Cabin']
    df = df.drop(columns=drop_columns)

    return df

In [546]:
train_processed_df = create_features(train_processed_df)
train_processed_df = drop_features(train_processed_df)

In [547]:
train_processed_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,1,0,3,male,22.0,1.0,0.0,7.25,S,1.0
1,2,1,1,female,38.0,1.0,0.0,71.2833,C,1.0
2,3,1,3,female,26.0,0.0,0.0,7.925,S,0.0
3,4,1,1,female,35.0,1.0,0.0,53.1,S,1.0
4,5,0,3,male,35.0,0.0,0.0,8.05,S,0.0


In [548]:
train_processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    category
 3   Sex          891 non-null    category
 4   Age          891 non-null    float64 
 5   SibSp        891 non-null    float64 
 6   Parch        891 non-null    float64 
 7   Fare         891 non-null    float64 
 8   Embarked     891 non-null    category
 9   FamilySize   891 non-null    float64 
dtypes: category(3), float64(5), int64(2)
memory usage: 51.8 KB


In [549]:
train_processed_df.describe()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,FamilySize
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,29.361582,0.523008,0.381594,32.204208,0.904602
std,257.353842,0.486592,13.019697,1.102743,0.806057,49.693429,1.613459
min,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,0.0,22.0,0.0,0.0,7.9104,0.0
50%,446.0,0.0,28.0,0.0,0.0,14.4542,0.0
75%,668.5,1.0,35.0,1.0,0.0,31.0,1.0
max,891.0,1.0,80.0,8.0,6.0,512.3292,10.0


## 模型訓練

In [550]:
X = train_processed_df.drop(columns=['Survived', 'PassengerId'])
y = train_processed_df['Survived']

In [551]:
categorical_features = ['Pclass', 'Sex', 'Embarked']
numerical_features = ['Age', 'Fare', 'FamilySize']

categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline([
    ('poly', PolynomialFeatures(degree=2)),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features)
])

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

In [552]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc',
}

results = cross_validate(pipeline, X, y, cv=kf, scoring=scoring, return_train_score=True)

print("🔍 訓練集結果 (平均)")
print(f"✅ Accuracy: {results['train_accuracy'].mean():.4f}")
print(f"🎯 Precision: {results['train_precision'].mean():.4f}")
print(f"📢 Recall: {results['train_recall'].mean():.4f}")
print(f"📊 F1 Score: {results['train_f1'].mean():.4f}")
print(f"📈 ROC AUC: {results['train_roc_auc'].mean():.4f}")

print("🔍 測試集結果 (平均)")
print(f"✅ Accuracy: {results['test_accuracy'].mean():.4f}")
print(f"🎯 Precision: {results['test_precision'].mean():.4f}")
print(f"📢 Recall: {results['test_recall'].mean():.4f}")
print(f"📊 F1 Score: {results['test_f1'].mean():.4f}")
print(f"📈 ROC AUC: {results['test_roc_auc'].mean():.4f}")

🔍 訓練集結果 (平均)
✅ Accuracy: 0.8148
🎯 Precision: 0.7823
📢 Recall: 0.7170
📊 F1 Score: 0.7482
📈 ROC AUC: 0.8634
🔍 測試集結果 (平均)
✅ Accuracy: 0.8025
🎯 Precision: 0.7665
📢 Recall: 0.6916
📊 F1 Score: 0.7259
📈 ROC AUC: 0.8445


In [553]:
pipeline.fit(X, y)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,degree,2
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [554]:
y_pred = pipeline.predict(X)
print("Accuracy:", accuracy_score(y, y_pred))
print("F1 Score:", f1_score(y, y_pred))
print("Precision:", precision_score(y, y_pred))
print("Recall:", recall_score(y, y_pred))

Accuracy: 0.8170594837261503
F1 Score: 0.7503828483920367
Precision: 0.7877813504823151
Recall: 0.716374269005848


## 預測

In [555]:
test_raw_df = pd.read_csv('../raw_data/test.csv')
test_raw_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [556]:
test_processed_df = test_raw_df.copy()
test_processed_df = fill_missing_values(test_processed_df)
test_processed_df = cast_data_type(test_processed_df)
test_processed_df = create_features(test_processed_df)
test_processed_df = drop_features(test_processed_df)

In [557]:
test_processed_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize
0,892,3,male,34.5,0.0,0.0,7.8292,Q,0.0
1,893,3,female,47.0,1.0,0.0,7.0,S,1.0
2,894,2,male,62.0,0.0,0.0,9.6875,Q,0.0
3,895,3,male,27.0,0.0,0.0,8.6625,S,0.0
4,896,3,female,22.0,1.0,1.0,12.2875,S,2.0


In [558]:
X = test_processed_df.drop(columns=['PassengerId'])

In [559]:
y_pred = pipeline.predict(X)
submission_df = pd.DataFrame({'PassengerId': test_raw_df['PassengerId'], 'Survived': y_pred})
submission_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [560]:
submission_df.to_csv('submission.csv', index=False)