# W7 Titanic Data Preprocessing (Demo)

## Step 1：載入資料與初步觀察

In [None]:
import pandas as pd

# Read the raw Titanic dataset from the local CSV file.
df = pd.read_csv('data/titanic.csv')
print('原始欄位名稱:', df.columns.tolist())

# Normalize header casing so later cells can rely on names like 'Age'.
# str.capitalize() upper-cases the first character AND lower-cases the rest,
# so a mixed-case header such as 'PassengerId' becomes 'Passengerid'.
capitalized_names = list(map(str.capitalize, df.columns))
df.columns = capitalized_names
print('修正後欄位名稱:', df.columns.tolist())

# Quick overview: row count, dtypes / non-null counts, and the first rows.
row_count = len(df)
print('\n資料筆數:', row_count)
df.info()
df.head()

## Step 2：處理缺失值

In [None]:
# Fill missing ages with the median of the Age column.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)

# Fill missing embarkation values with the most frequent one
# (mode() returns a Series; its first entry is used).
embarked_mode = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(embarked_mode)

# Show the remaining missing-value count per column.
df.isnull().sum()

## Step 3：移除異常值

In [None]:
# Upper cut-off for Fare: mean plus three standard deviations.
# Only the high side is filtered; there is no lower bound.
mean = df['Fare'].mean()
std = df['Fare'].std()
threshold = mean + 3 * std

print('移除前筆數:', len(df))
# Keep only the rows whose Fare does not exceed the cut-off.
within_threshold = df['Fare'] <= threshold
df = df.loc[within_threshold]
print('移除後筆數:', len(df))

In [None]:
import matplotlib.pyplot as plt
# Draw a box plot of the Fare column as it stands after the outlier filter.
fare_values = df['Fare']
plt.figure(figsize=(6, 3))
plt.boxplot(fare_values)
plt.title('Fare Boxplot (After Outlier Removal)')
plt.show()

## Step 4：類別變數編碼

In [None]:
# One-hot encode the categorical columns. drop_first=False keeps an
# indicator column for every category level.
categorical_columns = ['Sex', 'Embarked']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=False)
df.head()

## Step 5：數值標準化

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the numeric columns in place with StandardScaler.
# NOTE(review): the scaler is fit on the whole DataFrame, before the
# train/test split in Step 6, so test rows influence the scaling statistics.
# Consider fitting on the training portion only if this feeds a model.
scaler = StandardScaler()
numeric_columns = ['Age', 'Fare']

scaled_values = scaler.fit_transform(df[numeric_columns])
df[numeric_columns] = scaled_values
df[numeric_columns].describe()

## Step 6：資料切割 (Train/Test Split)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
target_column = 'Survived'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the size of each partition.
len(X_train), len(X_test)

## Step 7：輸出處理後資料

In [None]:
# Write the processed DataFrame to disk without the index column.
# 'utf-8-sig' is the UTF-8 codec variant that prepends a byte-order mark.
output_path = 'data/titanic_processed.csv'
df.to_csv(output_path, index=False, encoding='utf-8-sig')
print('資料處理完成並已輸出至 data/titanic_processed.csv')