In [23]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

# Load the training and test splits from the local data directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)


def data_preprocess(data):
    """Build the model feature matrix from a raw passenger DataFrame.

    The input frame is left untouched: all work happens on a copy, so the
    function gives the same result when called again on the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'SibSp', 'Parch', 'Age', 'Fare', 'Sex' and
        'Embarked'. The columns 'PassengerId', 'Survived', 'Name', 'Title',
        'Ticket' and 'Cabin' are dropped when present.

    Returns
    -------
    pandas.DataFrame
        The remaining input columns plus 'FamilySize', 'IsAlone' and the
        one-hot columns generated for 'Sex', 'Embarked', 'AgeGroup' and
        'FareBand' (the first level of each is dropped).

    Notes
    -----
    The medians, the 'Embarked' mode and the fare quartile edges are all
    computed from the frame passed in, so calls on different frames use
    different statistics.
    """
    # Work on a copy so the caller's DataFrame is not modified.
    data = data.copy()

    # Feature engineering
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = (data['FamilySize'] == 1).astype(int)
    # AgeGroup is derived before Age is imputed, so rows with a missing Age
    # end up with none of the AgeGroup dummy columns set.
    data['AgeGroup'] = pd.cut(data['Age'], bins=[0, 12, 20, 40, 60, 80], labels=[
        'Child', 'Teen', 'Adult', 'Middle-aged', 'Senior'])

    # Impute Fare before banding it: otherwise a missing fare stays NaN in
    # the returned features and gets no FareBand.
    data['Fare'] = data['Fare'].fillna(data['Fare'].median())
    data['FareBand'] = pd.qcut(data['Fare'], 4, labels=[1, 2, 3, 4])

    # Fill the remaining missing values
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['Embarked'] = data['Embarked'].fillna(
        data['Embarked'].mode()[0])

    # One-hot encode the categorical features
    data = pd.get_dummies(data, columns=[
        'Sex', 'Embarked', 'AgeGroup', 'FareBand'], drop_first=True)

    # Drop identifiers, the target and free-text columns; errors='ignore'
    # tolerates columns that are absent (e.g. 'Survived' in a test frame,
    # or 'Title', which this function does not create).
    X = data.drop(['PassengerId', 'Survived',
                  'Name', 'Title', 'Ticket', 'Cabin'], axis=1, errors='ignore')
    return X


# Feature matrix and target taken from the training data
X = data_preprocess(train)
y = train['Survived']

# Hold out 20% of the rows for validation
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a depth-limited random forest on the training portion
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Accuracy on the held-out rows
y_pred = rf_model.predict(X_valid)
print('Validation Accuracy:', accuracy_score(y_true=y_valid, y_pred=y_pred))

# Mean accuracy over 5-fold cross-validation on the full training set
cv_scores = cross_val_score(estimator=rf_model, X=X, y=y, cv=5)
print('Cross-Validation Accuracy:', cv_scores.mean())

Validation Accuracy: 0.8156424581005587
Cross-Validation Accuracy: 0.8159939740129307


In [26]:
# Build features for the test set, predict, and write the result file
data = data_preprocess(test)
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': rf_model.predict(data),
})
submission.to_csv('./data/result.csv', index=False)