# 特征工程与模型建立

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
warnings.filterwarnings('ignore')

In [3]:
# Location of the raw training CSV, relative to the notebook's working directory.
data_path = '../data/train.csv'
# Raw passenger table exactly as read from disk.
data = pd.read_csv(filepath_or_buffer=data_path)
# Feature table; it starts empty and the cells below add one engineered column at a time.
train_data = pd.DataFrame()
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


在训练集中加入最重要的性别维度

In [4]:
# 1 for rows whose Sex is exactly 'female', 0 otherwise (missing Sex compares False -> 0).
train_data['is_female'] = data['Sex'].eq('female').astype(int)

加入票价维度：按船票号（Ticket）分组后取 Fare 的平均值

In [6]:
# Mean Fare over all rows that share the same Ticket value, broadcast back to every row.
# NOTE(review): this is the per-ticket mean, not Fare divided by the number of
# passengers on the ticket -- confirm which of the two is intended.
fare_by_ticket = data['Fare'].groupby(data['Ticket'])
train_data['mean_Fare'] = fare_by_ticket.transform('mean')

加入团体相关维度

In [7]:
# Copy the two raw relative counts unchanged ...
for relatives_col in ('SibSp', 'Parch'):
    train_data[relatives_col] = data[relatives_col]
# ... and add their sum as a single "relatives aboard" feature.
train_data['total_relatives'] = data['SibSp'].add(data['Parch'])

我们对Embarked维度做one-hot编码

In [8]:
# One indicator column per embarkation port, created in the order C, S, Q.
# Rows with a missing Embarked value get 0 in all three columns.
for port in ('C', 'S', 'Q'):
    train_data['Embarked_in_' + port] = data['Embarked'].eq(port).astype(int)
train_data.head()

Unnamed: 0,is_female,Pclass,mean_Fare,SibSp,Parch,total_relatives,Embarked_in_C,Embarked_in_S,Embarked_in_Q
0,0,3,7.25,1,0,1,0,1,0
1,1,1,71.2833,1,0,1,1,0,0
2,1,3,7.925,0,0,0,0,1,0
3,1,1,53.1,1,0,1,0,1,0
4,0,3,8.05,0,0,0,0,1,0


我们对Cabin是否缺失编码,并对未缺失者提取位置信息

In [9]:
# 1 where the Cabin field is missing, 0 where a cabin is recorded.
train_data['Cabin_isna'] = data['Cabin'].isna().astype(int)

提取船舱位置维度

In [10]:
# First character of the cabin code (NaN where Cabin is missing).
# The former mid-cell `data[['Cabin','Cabin_type']].head(15)` preview was dropped:
# it was not the last expression of the cell, so its result was silently discarded.
data['Cabin_type'] = data['Cabin'].str[0]
# Ordinal code per cabin letter; per the original author's note the letters are
# ranked by survival rate from lowest to highest (a target-style encoding).
value = {
    'A': 1, 'G': 2, 'C': 3, 'F': 4, 'B': 5, 'E': 6, 'D': 7,
}


def TypeToValue(row):
    """Return the ordinal code for ``row['Cabin_type']``.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) with a
            ``'Cabin_type'`` entry.

    Returns:
        The integer stored in ``value`` for that cabin letter, or 0 when the
        letter is not a key of ``value`` (this includes NaN).
    """
    return value.get(row['Cabin_type'], 0)
# Vectorised lookup of the cabin letter in `value`. Letters absent from the mapping
# (including NaN for passengers without a cabin) become 0, which is the same
# encoding that applying TypeToValue row by row produces, without a Python-level
# call per row.
train_data['Cabin_value'] = data['Cabin_type'].map(value).fillna(0).astype(int)

名字中带有 Master 称谓的小男孩是男性中生存率最高的部分；我们从姓名中提取称谓并归一化，再做 one-hot 编码后加入训练集

In [11]:
# Extract the title from the name: the text between the first ", " and the next ".".
title_pattern = r',\s*([^\.]+)\.'
data['name_title'] = data['Name'].str.extract(title_pattern, expand=False)
# Titles that are kept as-is; everything else is aliased or bucketed as 'Rare'.
standard = {'Mrs', 'Miss', 'Mr', 'Master'}


def standard_title(row):
    """Normalise ``row['name_title']`` to one of five title groups.

    Args:
        row: Any mapping-like object (e.g. a DataFrame row) with a
            ``'name_title'`` entry.

    Returns:
        'Miss' for 'Ms' / 'Mlle' (French Mademoiselle), 'Mrs' for 'Mme'
        (French Madame), the title itself when it is in ``standard``, and
        'Rare' for anything else (including NaN).
    """
    aliases = {'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Mrs'}
    raw_title = row['name_title']
    if raw_title in aliases:
        return aliases[raw_title]
    # Uncommon titles are pooled into a single bucket.
    return raw_title if raw_title in standard else 'Rare'
# Collapse the raw titles into the five groups returned by standard_title,
# then add one indicator column per group (order: Mrs, Mr, Miss, Master, Rare).
data['name_title'] = data.apply(standard_title, axis='columns')
for title_group in ('Mrs', 'Mr', 'Miss', 'Master', 'Rare'):
    train_data['is_' + title_group] = data['name_title'].eq(title_group).astype(int)


我们将小于两岁的婴儿单独编码，他们是生存率最高的群体

In [18]:
# 1 for passengers younger than 2; a missing Age compares False and therefore maps to 0.
train_data['is_baby'] = data['Age'].lt(2).astype(int)

加入舱位等级维度

In [19]:
# Ticket class copied unchanged from the raw table.
train_data['Pclass'] = data.loc[:, 'Pclass']

进而创建“Cabin 未缺失且为一等舱”的交互维度

In [29]:
# 1 only for first-class passengers whose cabin is recorded.
train_data['Cabin_Pclass'] = (
    train_data['Cabin_isna'].eq(0) & train_data['Pclass'].eq(1)
).astype(int)

加入高级票维度

In [None]:
# Ticket-number prefixes the author treats as "senior" (premium) tickets.
better_feature = r'^(?:PC|C(?:\.?A\.?)?|P\.?P\.?(?!/)|P/PP|SC(?:/PARIS|/AH)?)\b'
# na=False makes a missing Ticket count as "no match" instead of propagating NaN.
ticket_is_senior = data['Ticket'].str.contains(better_feature, regex=True, na=False)
train_data['is_senior_ticket'] = ticket_is_senior.astype(int)

加入几乎必死维度

In [None]:
# 1 when the fare is exactly 0 or the ticket text contains 'LINE'.
zero_fare = data['Fare'].eq(0)
line_ticket = data['Ticket'].str.contains(r'LINE', regex=True, na=False)
train_data['almost_die'] = (zero_fare | line_ticket).astype(int)

年龄缺失的生存率相对较低，加入训练集

In [25]:
# 1 where Age is missing in the raw table, 0 where it is recorded.
age_missing = data['Age'].isna()
train_data['Age_isna'] = age_missing.astype(int)

将年龄缺失与三等舱的强相关性编入训练集

In [27]:
# 1 only for third-class passengers whose age is missing.
train_data['naAge_Pclass'] = (
    train_data['Pclass'].eq(3) & train_data['Age_isna'].eq(1)
).astype(int)

将年龄填补缺失值后加入训练集

In [28]:
# Fill missing ages with the median of the recorded ages.
age_median = data['Age'].median()
train_data['Age'] = data['Age'].fillna(age_median)

创建女性-年龄-舱位交互特征

In [32]:
# Age x Sex x Pclass interaction, based on the earlier findings: the female
# indicator (0/1) multiplies Age * Pclass, so the product is zeroed for men.
female_flag = train_data['is_female'].eq(1).astype(int)
train_data['Age_Sex_Pclass'] = train_data['Age'].mul(female_flag).mul(train_data['Pclass'])


创建男性-一等舱交互特征

In [33]:
# 1 only for men travelling in first class.
train_data['male-1pclass'] = (
    train_data['is_female'].eq(0) & train_data['Pclass'].eq(1)
).astype(int)

创建女性-一二等舱交互特征

In [None]:
# Flag women travelling in first OR second class.
# A passenger has exactly one Pclass, so requiring Pclass == 1 and Pclass == 2
# at the same time can never hold and would yield an all-zero column; the class
# test therefore has to be a membership check.
train_data['female-12pclass'] = (
    train_data['is_female'].eq(1) & train_data['Pclass'].isin([1, 2])
).astype(int)

In [34]:
# List every engineered feature column currently in the training table.
print(list(train_data.columns))

['is_female', 'Pclass', 'mean_Fare', 'SibSp', 'Parch', 'total_relatives', 'Embarked_in_C', 'Embarked_in_S', 'Embarked_in_Q', 'Cabin_isna', 'Cabin_value', 'is_Mrs', 'is_Mr', 'is_Miss', 'is_Master', 'is_Rare', 'is_baby', 'Cabin_Pclass', 'Age', 'Age_isna', 'naAge_Pclass', 'Age_Sex_Pclass', 'male-1pclass']
