# 泰坦尼克获救预测

本案例为将要得到用户是否被获救。所以是一个二分类问题。

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## 数据简介

* PassengerId：乘客编号
* Survived：是否被获救。标签列
* Pclass：乘客舱位等级。特征列
* Name：乘客姓名。特征列
* Sex：乘客性别。特征列
* Age：年龄
* SibSp：兄弟姐妹有多少个
* Parch：带老人孩子有几个
* Ticket：船票编号
* Fare：船票价格
* Cabin：确实值比较多。直接舍弃
* Embarked：乘客登船地点。


## 数据分析

In [2]:
titanic = pd.read_csv('../../dataSet/Titanic-train.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
pd.DataFrame(titanic.describe()) # 统计个列数据
#有些列存在缺失情况，比如Age

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean()) #使用age的平均值填充缺失数据
pd.DataFrame(titanic.describe())

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
print(titanic['Sex'].unique()) # 查看sex列有几种可能性

#将male与female使用数字替换
titanic.loc[titanic['Sex']=='male','Sex']=0
titanic.loc[titanic['Sex']=='female','Sex']=1

['male' 'female']


In [6]:
print(titanic['Embarked'].unique()) #查看Embarked列有几种可能
print(titanic['Embarked'].value_counts()) # 查看没中类型出现了多少次
titanic['Embarked']=titanic['Embarked'].fillna('S') # 使用出现次数最多的类型替换nan
titanic.loc[titanic['Embarked']=='S','Embarked']=0
titanic.loc[titanic['Embarked']=='C','Embarked']=1
titanic.loc[titanic['Embarked']=='Q','Embarked']=2

['S' 'C' 'Q' nan]
S    644
C    168
Q     77
Name: Embarked, dtype: int64


### 使用线性回归

In [7]:
from sklearn.linear_model import LinearRegression # 线性回归模块
from sklearn.cross_validation import KFold # 交叉验证模块

# 选择的特征数据列
predictors = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']

alg = LinearRegression()
kf = KFold(titanic.shape[0],n_folds=3,random_state=1)

predictions=[]
for train,test in kf:
    train_predictors = titanic[predictors].iloc[train,:] # 得到训练集x
    train_target= titanic['Survived'].iloc[train] # 得到训练集y
    alg.fit(train_predictors,train_target)
    test_predictions = alg.predict(titanic[predictors].iloc[test,:])
    predictions.append(test_predictions)



In [8]:
pd.DataFrame(predictions)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,287,288,289,290,291,292,293,294,295,296
0,0.092779,0.962681,0.594186,0.93148,0.054268,0.166699,0.368953,0.107816,0.521526,0.879052,...,0.126444,0.219484,0.699182,1.02406,1.075274,0.294193,0.605627,0.115332,0.536823,0.15929
1,1.137402,0.431448,0.98609,0.663307,0.077926,0.153533,0.830249,0.087484,0.64733,1.029306,...,0.127335,1.024678,0.139506,0.243315,0.13942,0.087484,0.051021,0.804148,-0.029833,0.642868
2,0.173848,0.017564,0.774556,-0.008229,0.141669,0.312422,0.729144,0.094207,0.416776,0.016286,...,0.076234,0.656955,0.269676,0.120079,0.676307,0.274279,1.000917,0.578559,0.48833,0.1762


In [9]:
predictions = np.concatenate(predictions,axis=0)

predictions[predictions>.5]=1
predictions[predictions<=.5]=0

pd.DataFrame(predictions.astype(int))

accuracy = sum(predictions[predictions==titanic['Survived']])/len(predictions)
print(accuracy)

0.26038159371492703


### 使用逻辑回归进行测试

In [10]:
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression

alg = LogisticRegression(random_state=1)
scores = cross_validation.cross_val_score(alg,titanic[predictors],titanic['Survived'],cv=3)

print(scores.mean())


0.7901234567901234


### 使用随机森林