## 泰坦尼克号乘客生存预测

In [92]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

# Load the Titanic training set and preview its first three rows.
data = pd.read_csv(filepath_or_buffer='titanic_train.csv')
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [93]:
# Print the training frame's schema: column dtypes and non-null counts.
data.info()     # get dataset information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [94]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [95]:
# Replace missing ages with the median age of the training set,
# then re-print the schema to confirm 'Age' has no nulls left.
age_median = data['Age'].median()
data['Age'] = data['Age'].fillna(value=age_median)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## 线性回归算法

In [96]:
from sklearn.linear_model import LinearRegression
# `sklearn.cross_validation` is no longer shipped by scikit-learn; KFold now
# lives in `sklearn.model_selection`.
from sklearn.model_selection import KFold

predictors = ['Pclass', 'Age', 'Fare']
alg = LinearRegression()
# Three contiguous, unshuffled folds (the default of the old KFold API this
# cell was written against). `random_state` is omitted because it has no
# effect when shuffle=False and recent scikit-learn versions reject it.
kf = KFold(n_splits=3, shuffle=False)

# Out-of-fold predictions: for each split, fit on the training rows and
# predict the held-out rows. Folds are contiguous and in order, so
# concatenating the per-fold arrays later lines up with the rows of `data`.
predictions = []
for train, test in kf.split(data[predictors]):
    train_predictors = data[predictors].iloc[train, :]
    train_target = data['Survived'].iloc[train]
    alg.fit(train_predictors, train_target)
    test_predictions = alg.predict(data[predictors].iloc[test, :])
    predictions.append(test_predictions)

In [97]:
# Stitch the per-fold outputs into one array, threshold the regression
# scores at 0.5 to obtain 0/1 labels, and report the share of correct labels.
raw_scores = np.concatenate(predictions, axis=0)
predictions = np.where(
    raw_scores >= 0.5,
    1.0,
    np.where(raw_scores < 0.5, 0.0, raw_scores),  # anything not comparable stays as-is
)
is_correct = predictions == data['Survived']
accuracy = is_correct.sum() / len(predictions)
print('准确率：', accuracy)

准确率： 0.7003367003367004


## 逻辑回归

In [98]:
from sklearn import model_selection 
from sklearn.linear_model import LogisticRegression 

# Logistic regression on the current `predictors`: fit once on the full
# training frame, then report the mean of a 3-fold cross-validation score.
features = data[predictors]
target = data['Survived']
lgr = LogisticRegression(random_state=1).fit(features, target)  # fit() returns the estimator
scores = model_selection.cross_val_score(estimator=lgr, X=features, y=target, cv=3)
print('准确率：', np.mean(scores))

准确率： 0.6936026936026937


### (逻辑回归)增加其他特征

In [99]:
# Encode sex numerically: male -> 0, female -> 1.
# Rows that already hold 0/1 match neither label, so re-running changes nothing.
for label, code in (('male', 0), ('female', 1)):
    data.loc[data['Sex'] == label, 'Sex'] = code

In [100]:
# Same logistic-regression experiment with a wider feature set
# (adds sex and the two family-size counts).
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
features = data[predictors]
target = data['Survived']
# This instance, fitted on the whole training frame, is the one used to
# predict the test set at the end of the notebook.
lgr = LogisticRegression().fit(features, target)
scores = model_selection.cross_val_score(estimator=lgr, X=features, y=target, cv=3)
print('准确率：', np.mean(scores))

准确率： 0.7901234567901234


## 随机森林算法

In [116]:
from sklearn import model_selection 
from sklearn.ensemble import RandomForestClassifier 
 
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
# 10 trees; stopping rules: a node needs at least 2 samples to be split,
# and every leaf must keep at least 2 samples.
alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=2)
# Unshuffled 3-fold split. `random_state` is deliberately not passed: it has
# no effect when shuffle=False, and recent scikit-learn versions raise a
# ValueError for that combination. The folds themselves are unchanged.
kf = model_selection.KFold(n_splits=3, shuffle=False)
scores = model_selection.cross_val_score(alg, data[predictors], data['Survived'], cv=kf)
print(scores)
print('准确率：', scores.mean())

[0.7979798  0.85185185 0.84511785]
准确率： 0.8316498316498316


In [117]:
# 30 trees; stopping rules: a node needs at least 4 samples to be split,
# and every leaf must keep at least 2 samples.
alg = RandomForestClassifier(random_state=1, n_estimators=30, min_samples_split=4, min_samples_leaf=2)
# Unshuffled 4-fold split. `random_state` is deliberately not passed: it has
# no effect when shuffle=False, and recent scikit-learn versions raise a
# ValueError for that combination. The folds themselves are unchanged.
kf = model_selection.KFold(n_splits=4, shuffle=False)
scores = model_selection.cross_val_score(alg, data[predictors], data['Survived'], cv=kf)
print(scores)
print(scores.mean())

[0.80717489 0.85650224 0.83408072 0.83783784]
0.8338989213428676


## 对测试集数据进行处理，并进行结果预测

In [78]:
# Load the unlabeled test set and print its schema (dtypes and non-null counts).
data_test = pd.read_csv(filepath_or_buffer='titanic_test.csv')
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [85]:
# Fill the gaps in the test set with medians taken from the TRAINING frame.
# The model was fitted on training data imputed with training statistics, so
# the test set must go through the same transformation; deriving the fill
# values from the test set itself would preprocess the two sets differently.
for column in ('Fare', 'Age'):
    data_test[column] = data_test[column].fillna(data[column].median())
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [86]:
# Encode sex in the test set the same way as in training: male -> 0, female -> 1.
for label, code in (('male', 0), ('female', 1)):
    data_test.loc[data_test['Sex'] == label, 'Sex'] = code

In [118]:
# Predict survival for the test passengers with the fitted logistic
# regression and attach the labels as a new 'Survived' column.
test_features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']
test_inputs = data_test.loc[:, test_features]
test_predictors = lgr.predict(test_inputs)  # holds predicted labels, despite the name
data_test['Survived'] = test_predictors
data_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S,1
