In [1]:
import numpy as np
import pandas as pd
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas/sklearn deprecation or
# chained-assignment warnings raised by later cells -- consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw Kaggle Titanic train/test splits.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Show row counts and null counts for both splits; the row counts are needed
# later to separate the combined frame back into train and test.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after each report).
train_data.info()
print('*'*30)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
******************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null obje

In [3]:
# Stack the train and test features into one frame so every preprocessing step
# is applied to both splits in the same way.
train = train_data.drop(columns=['Survived'])  # features only, label removed
test = test_data
data = pd.concat([train, test], ignore_index=True)  # renumber rows 0..N-1
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Drop PassengerId and Ticket from the feature frame.
data = data.drop(columns=['PassengerId', 'Ticket'])
data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S
4,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S


In [5]:
# One-hot encode the Pclass column.
# Column names are derived from the actual class values (1 -> 'Pclass1', ...)
# instead of being assigned positionally from a hard-coded list, so a missing
# or extra class can no longer mislabel columns or raise a length mismatch.
pclass_dummies = pd.get_dummies(data['Pclass'], prefix='Pclass', prefix_sep='')
data = data.join(pclass_dummies)
data = data.drop(columns=['Pclass'])
data.head()

Unnamed: 0,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Pclass1,Pclass2,Pclass3
0,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S,0,0,1
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,1,0,0
2,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S,0,0,1
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S,1,0,0
4,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S,0,0,1


In [6]:
# One-hot encode the Embarked column (port of embarkation).
# The port codes are mapped to the output column names explicitly rather than
# by position, so the meaning of each column is visible and does not depend on
# the sort order or the number of categories present in the data.
# Rows with a missing Embarked value get 0 in all three columns, because
# get_dummies does not create a NaN indicator by default.
embarked_dummies = pd.get_dummies(data['Embarked']).rename(
    columns={'C': 'Embarked1', 'Q': 'Embarked2', 'S': 'Embarked3'}
)
data = data.join(embarked_dummies)
data = data.drop(columns=['Embarked'])
data.head()

Unnamed: 0,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Pclass1,Pclass2,Pclass3,Embarked1,Embarked2,Embarked3
0,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,0,0,1,0,0,1
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,1,0,0,1,0,0
2,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,0,0,1,0,0,1
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,1,0,0,0,0,1
4,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,0,0,1,0,0,1


In [7]:
# Extract the honorific (title) from each Name and collapse it into a small
# set of groups.
import re

# Raw string: "\." in a normal string literal is an invalid escape sequence.
# The pattern is compiled once instead of once per row.
# It captures the text between ", " and the first following "." in the name.
title_pattern = re.compile(r", (.*?)\.")
title = data['Name'].map(lambda name: title_pattern.findall(name)[0])

# Map each raw title to its group.
title_Dict = {}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(dict.fromkeys(['Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
title_Dict.update(dict.fromkeys(['Master','Jonkheer'], 'Master'))

# Titles that are not keys of title_Dict become NaN in the Title column.
data['Title'] = title.map(title_Dict)
data = data.drop(columns=['Name'])
data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin,Pclass1,Pclass2,Pclass3,Embarked1,Embarked2,Embarked3,Title
0,male,22.0,1,0,7.25,,0,0,1,0,0,1,Mr
1,female,38.0,1,0,71.2833,C85,1,0,0,1,0,0,Mrs
2,female,26.0,0,0,7.925,,0,0,1,0,0,1,Miss
3,female,35.0,1,0,53.1,C123,1,0,0,0,0,1,Mrs
4,male,35.0,0,0,8.05,,0,0,1,0,0,1,Mr


In [8]:
# One-hot encode the grouped Title column and attach the indicator columns.
title = pd.get_dummies(data.loc[:, 'Title'])
data = data.join(title, how='left')

In [9]:
# Remove the Title column from the frame.
data = data.drop(columns=['Title'])
data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin,Pclass1,Pclass2,Pclass3,Embarked1,Embarked2,Embarked3,Master,Miss,Mr,Mrs,Officer,Royalty
0,male,22.0,1,0,7.25,,0,0,1,0,0,1,0,0,1,0,0,0
1,female,38.0,1,0,71.2833,C85,1,0,0,1,0,0,0,0,0,1,0,0
2,female,26.0,0,0,7.925,,0,0,1,0,0,1,0,1,0,0,0,0
3,female,35.0,1,0,53.1,C123,1,0,0,0,0,1,0,0,0,1,0,0
4,male,35.0,0,0,8.05,,0,0,1,0,0,1,0,0,1,0,0,0


In [10]:
# Encode the Sex column as integer labels.
from sklearn.preprocessing import LabelEncoder

sex_encoder = LabelEncoder()
data['Sex'] = sex_encoder.fit_transform(data['Sex'])
data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin,Pclass1,Pclass2,Pclass3,Embarked1,Embarked2,Embarked3,Master,Miss,Mr,Mrs,Officer,Royalty
0,1,22.0,1,0,7.25,,0,0,1,0,0,1,0,0,1,0,0,0
1,0,38.0,1,0,71.2833,C85,1,0,0,1,0,0,0,0,0,1,0,0
2,0,26.0,0,0,7.925,,0,0,1,0,0,1,0,1,0,0,0,0
3,0,35.0,1,0,53.1,C123,1,0,0,0,0,1,0,0,0,1,0,0
4,1,35.0,0,0,8.05,,0,0,1,0,0,1,0,0,1,0,0,0


In [11]:
# Age: impute missing values with the median, then standardize.
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: an in-place call on data['Age'] is a chained assignment and
# is not guaranteed to write back into `data` (it does not under pandas
# copy-on-write), which would leave the NaNs in place.
# NOTE(review): the median and the scaler statistics are computed over the
# combined train+test frame.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Age'] = scaler.fit_transform(data['Age'].values.reshape(-1, 1))  # standardize
data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin,Pclass1,Pclass2,Pclass3,Embarked1,Embarked2,Embarked3,Master,Miss,Mr,Mrs,Officer,Royalty
0,1,-0.581628,1,0,7.25,,0,0,1,0,0,1,0,0,1,0,0,0
1,0,0.658652,1,0,71.2833,C85,1,0,0,1,0,0,0,0,0,1,0,0
2,0,-0.271558,0,0,7.925,,0,0,1,0,0,1,0,1,0,0,0,0
3,0,0.426099,1,0,53.1,C123,1,0,0,0,0,1,0,0,0,1,0,0
4,1,0.426099,0,0,8.05,,0,0,1,0,0,1,0,0,1,0,0,0


In [12]:
# Process the SibSp and Parch columns

def family_size_category(family_size):
    """Bucket a family size into a categorical label.

    Returns 'Single' for sizes up to 1, 'Small_Family' for sizes up to 4,
    and 'Large_Family' for anything else.
    """
    if family_size <= 1:
        return 'Single'
    return 'Small_Family' if family_size <= 4 else 'Large_Family'
    
# Family size = parents/children + siblings/spouses + 1, bucketed into a
# categorical label.
data['Family_Size'] = (data['Parch'] + data['SibSp'] + 1).map(family_size_category)
data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin,Pclass1,Pclass2,Pclass3,Embarked1,Embarked2,Embarked3,Master,Miss,Mr,Mrs,Officer,Royalty,Family_Size
0,1,-0.581628,1,0,7.25,,0,0,1,0,0,1,0,0,1,0,0,0,Small_Family
1,0,0.658652,1,0,71.2833,C85,1,0,0,1,0,0,0,0,0,1,0,0,Small_Family
2,0,-0.271558,0,0,7.925,,0,0,1,0,0,1,0,1,0,0,0,0,Single
3,0,0.426099,1,0,53.1,C123,1,0,0,0,0,1,0,0,0,1,0,0,Small_Family
4,1,0.426099,0,0,8.05,,0,0,1,0,0,1,0,0,1,0,0,0,Single


In [13]:
# One-hot encode the Family_Size label and replace the column with its indicators.
family_size = pd.get_dummies(data['Family_Size'])
data = data.join(family_size).drop(columns=['Family_Size'])
data.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Cabin,Pclass1,Pclass2,Pclass3,Embarked1,...,Embarked3,Master,Miss,Mr,Mrs,Officer,Royalty,Large_Family,Single,Small_Family
0,1,-0.581628,1,0,7.25,,0,0,1,0,...,1,0,0,1,0,0,0,0,0,1
1,0,0.658652,1,0,71.2833,C85,1,0,0,1,...,0,0,0,0,1,0,0,0,0,1
2,0,-0.271558,0,0,7.925,,0,0,1,0,...,1,0,1,0,0,0,0,0,1,0
3,0,0.426099,1,0,53.1,C123,1,0,0,0,...,1,0,0,0,1,0,0,0,0,1
4,1,0.426099,0,0,8.05,,0,0,1,0,...,1,0,0,1,0,0,0,0,1,0


In [14]:
# Drop the raw SibSp and Parch counts from the frame.
data = data.drop(columns=['SibSp', 'Parch'])

In [15]:
# Fare: impute missing values with the mean, then standardize.

from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: an in-place call on data['Fare'] is a chained assignment and
# is not guaranteed to write back into `data` (it does not under pandas
# copy-on-write), which would leave the NaNs in place.
# NOTE(review): the mean and the scaler statistics are computed over the
# combined train+test frame.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
data['Fare'] = scaler.fit_transform(data['Fare'].values.reshape(-1, 1))  # standardize
data.head()

Unnamed: 0,Sex,Age,Fare,Cabin,Pclass1,Pclass2,Pclass3,Embarked1,Embarked2,Embarked3,Master,Miss,Mr,Mrs,Officer,Royalty,Large_Family,Single,Small_Family
0,1,-0.581628,-0.503595,,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1
1,0,0.658652,0.734503,C85,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1
2,0,-0.271558,-0.490544,,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0
3,0,0.426099,0.382925,C123,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1
4,1,0.426099,-0.488127,,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0


In [16]:
# Cabin has too many missing values, so drop it.
data = data.drop(columns=['Cabin'])

In [17]:
# Print a summary of the processed frame (entries, columns, dtypes, non-null counts).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 18 columns):
Sex             1309 non-null int64
Age             1309 non-null float64
Fare            1309 non-null float64
Pclass1         1309 non-null uint8
Pclass2         1309 non-null uint8
Pclass3         1309 non-null uint8
Embarked1       1309 non-null uint8
Embarked2       1309 non-null uint8
Embarked3       1309 non-null uint8
Master          1309 non-null uint8
Miss            1309 non-null uint8
Mr              1309 non-null uint8
Mrs             1309 non-null uint8
Officer         1309 non-null uint8
Royalty         1309 non-null uint8
Large_Family    1309 non-null uint8
Single          1309 non-null uint8
Small_Family    1309 non-null uint8
dtypes: float64(2), int64(1), uint8(15)
memory usage: 49.9 KB


In [18]:
# Fill any remaining gaps: Age with its median, Fare with its mean.
# Both columns were already standardized by earlier cells, so these statistics
# are taken over the scaled values.
data = data.fillna({'Age': data['Age'].median(), 'Fare': data['Fare'].mean()})

In [19]:
# Split the combined frame back into the training and test sets.

# Re-read the label and id columns from the raw files (they were left out of
# `data`), and derive the split boundary from the number of training labels
# instead of hard-coding row positions.
survived = pd.read_csv('data/train.csv')['Survived']
passenger_ids = pd.read_csv('data/test.csv')['PassengerId']
n_train = len(survived)
assert n_train + len(passenger_ids) == len(data), "combined frame does not match raw row counts"

# .copy() gives train_data its own storage, so adding a column is not an
# assignment onto a slice of `data` (SettingWithCopy).
train_data = data.iloc[:n_train].copy()
train_data['Survived'] = survived
# reset_index renumbers the test rows from 0 so they align with passenger_ids.
test_data = data.iloc[n_train:].reset_index(drop=True)
test_data['PassengerId'] = passenger_ids

In [20]:
# Summarize both splits. DataFrame.info() prints its report itself and returns
# None, so it is called directly rather than wrapped in print() (which would
# emit a stray "None" line after each report).
train_data.info()
print('*'*30)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 19 columns):
Sex             891 non-null int64
Age             891 non-null float64
Fare            891 non-null float64
Pclass1         891 non-null uint8
Pclass2         891 non-null uint8
Pclass3         891 non-null uint8
Embarked1       891 non-null uint8
Embarked2       891 non-null uint8
Embarked3       891 non-null uint8
Master          891 non-null uint8
Miss            891 non-null uint8
Mr              891 non-null uint8
Mrs             891 non-null uint8
Officer         891 non-null uint8
Royalty         891 non-null uint8
Large_Family    891 non-null uint8
Single          891 non-null uint8
Small_Family    891 non-null uint8
Survived        891 non-null int64
dtypes: float64(2), int64(2), uint8(15)
memory usage: 41.0 KB
None
******************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 19 columns):
Sex             418 non

In [21]:
# Save the processed train_data and test_data to CSV without the index column.
for frame, path in (
    (train_data, 'data/train_data.csv'),
    (test_data, 'data/test_data.csv'),
):
    frame.to_csv(path, index=False)