In [1]:
import pandas as pd

In [None]:
def prepare_data(path, role):
    """Load a Titanic CSV file and return a one-hot encoded feature table.

    Args:
        path: Path of the CSV file to read.
        role: Purpose of the file. ``'train'`` marks a training set; any
            other value (conventionally ``'test'``) is handled as a test set.

    Returns:
        A DataFrame holding ``Survived`` (training set) or ``PassengerId``
        (test set), the engineered flags ``AgeIsMissing``, ``IsAlone`` and
        ``IsMother``, and one-hot columns for ``Pclass``, ``Sex``,
        ``Embarked``, ``AgeBin``, ``FareBin``, ``FamilySize`` and
        ``SexAgeCombo``.

    Note:
        Fare quantile bins and the FamilySize / SexAgeCombo dummy columns are
        derived from the file being processed, so two different files may
        yield different column sets.
    """
    is_train = role == 'train'
    titanic_df = pd.read_csv(path)

    # Name, Ticket and Cabin are never used as features. PassengerId is only
    # kept for the test set, where it is returned as the row identifier.
    unused_cols = ['Name', 'Ticket', 'Cabin']
    if is_train:
        unused_cols = ['PassengerId'] + unused_cols
    titanic_df = titanic_df.drop(unused_cols, axis=1)

    # Record which ages were missing *before* they are imputed.
    titanic_df['AgeIsMissing'] = titanic_df['Age'].isnull().astype('int64')

    # Impute missing ages with the rounded mean age. The result is assigned
    # back to the column: `df[col].fillna(..., inplace=True)` operates on a
    # temporary object and leaves the frame unchanged under pandas
    # Copy-on-Write.
    age_mean = round(titanic_df['Age'].mean())
    titanic_df['Age'] = titanic_df['Age'].fillna(age_mean)

    # Missing embarkation ports are replaced with 'S'.
    titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S')

    # Custom age bins (right-closed intervals).
    cut_points = [0, 18, 25, 40, 60, 100]
    titanic_df['AgeBin'] = pd.cut(titanic_df['Age'], bins=cut_points)

    # Equal-frequency (quantile) bins for the ticket fare.
    titanic_df['FareBin'] = pd.qcut(titanic_df['Fare'], 5)

    # Family size counts siblings/spouses, parents/children and the passenger.
    titanic_df['FamilySize'] = titanic_df['SibSp'] + titanic_df['Parch'] + 1

    # IsAlone: the passenger travels without any family member.
    titanic_df['IsAlone'] = (titanic_df['FamilySize'] == 1).astype('int64')

    # IsMother: female, older than 20, with at least one parent/child aboard.
    is_mother = (
        (titanic_df['Sex'] == 'female')
        & (titanic_df['Parch'] > 0)
        & (titanic_df['Age'] > 20)
    )
    titanic_df['IsMother'] = is_mother.astype('int64')

    # Interaction feature combining sex and age bin.
    titanic_df['SexAgeCombo'] = (
        titanic_df['Sex'] + "_" + titanic_df['AgeBin'].astype(str)
    )

    # One-hot encode every categorical feature, using the column name as the
    # dummy-column prefix.
    one_hot_cols = ['Pclass', 'Sex', 'Embarked', 'AgeBin', 'FareBin',
                    'FamilySize', 'SexAgeCombo']
    dummies = [pd.get_dummies(titanic_df[col], prefix=col)
               for col in one_hot_cols]

    # The training set keeps the label; the test set keeps the passenger id.
    lead_col = 'Survived' if is_train else 'PassengerId'
    base = titanic_df[[lead_col, 'AgeIsMissing', 'IsAlone', 'IsMother']]
    data = pd.concat([base] + dummies, axis=1)
    return data