In [1]:
# /usr/bin/python
# -*- encoding:utf-8 -*-

import xgboost as xgb
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import csv



In [5]:
def show_accuracy(a, b, tip):
    acc = a.ravel() == b.ravel()
    acc_rate = 100 * float(acc.sum()) / a.size
    print '%s正确率：%.3f%%' % (tip, acc_rate)
    return acc_rate


def load_data(file_name, is_train):
    data = pd.read_csv(file_name)  # 数据文件路径
    # print 'data.describe() = \n', data.describe()

    # 性别
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # 补齐船票价格缺失值
    if len(data.Fare[data.Fare.isnull()]) > 0:
        fare = np.zeros(3)
        for f in range(0, 3):
            fare[f] = data[data.Pclass == f + 1]['Fare'].dropna().median()
        for f in range(0, 3):  # loop 0 to 2
            data.loc[(data.Fare.isnull()) & (data.Pclass == f + 1), 'Fare'] = fare[f]

    # 年龄：使用均值代替缺失值
    # mean_age = data['Age'].dropna().mean()
    # data.loc[(data.Age.isnull()), 'Age'] = mean_age
    if is_train:
        # 年龄：使用随机森林预测年龄缺失值
        print '随机森林预测缺失年龄：--start--'
        data_for_age = data[['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]  # 年龄不缺失的数据
        age_null = data_for_age.loc[(data.Age.isnull())]
        # print age_exist
        x = age_exist.values[:, 1:]
        y = age_exist.values[:, 0]
        rfr = RandomForestRegressor(n_estimators=1000)
        rfr.fit(x, y)
        age_hat = rfr.predict(age_null.values[:, 1:])
        # print age_hat
        data.loc[(data.Age.isnull()), 'Age'] = age_hat
        print '随机森林预测缺失年龄：--over--'
    else:
        print '随机森林预测缺失年龄2：--start--'
        data_for_age = data[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
        age_exist = data_for_age.loc[(data.Age.notnull())]  # 年龄不缺失的数据
        age_null = data_for_age.loc[(data.Age.isnull())]
        # print age_exist
        x = age_exist.values[:, 1:]
        y = age_exist.values[:, 0]
        rfr = RandomForestRegressor(n_estimators=1000)
        rfr.fit(x, y)
        age_hat = rfr.predict(age_null.values[:, 1:])
        # print age_hat
        data.loc[(data.Age.isnull()), 'Age'] = age_hat
        print '随机森林预测缺失年龄2：--over--'

    # 起始城市
    data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'  # 保留缺失出发城市
    # data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2, 'U': 0}).astype(int)
    # print data['Embarked']
    embarked_data = pd.get_dummies(data.Embarked)
    print embarked_data
    # embarked_data = embarked_data.rename(columns={'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown', 'U': 'UnknownCity'})
    embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
    data = pd.concat([data, embarked_data], axis=1)
    print data.describe()
    data.to_csv('New_Data.csv')

    x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S']]
    # x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
    y = None
    if 'Survived' in data:
        y = data['Survived']

    x = np.array(x)
    y = np.array(y)

    # 思考：这样做，其实发生了什么？
    x = np.tile(x, (5, 1))
    y = np.tile(y, (5,))
    if is_train:
        return x, y
    return x, data['PassengerId']


def write_result(c, c_type):
    file_name = '../dataset/Titanic.test.csv'
    x, passenger_id = load_data(file_name, False)

    if type == 3:
        x = xgb.DMatrix(x)
    y = c.predict(x)
    y[y > 0.5] = 1
    y[~(y > 0.5)] = 0

    predictions_file = open("Prediction_%d.csv" % c_type, "wb")
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(passenger_id, y))
    predictions_file.close()

In [7]:
x, y = load_data('../dataset/Titanic.train.csv', True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)
#
lr = LogisticRegression(penalty='l2')
lr.fit(x_train, y_train)
y_hat = lr.predict(x_test)
lr_acc = accuracy_score(y_test, y_hat)
# write_result(lr, 1)
print 'Logistic回归：%.3f%%' % lr_acc

随机森林预测缺失年龄：--start--
随机森林预测缺失年龄：--over--
     C  Q  S  U
0    0  0  1  0
1    1  0  0  0
2    0  0  1  0
3    0  0  1  0
4    0  0  1  0
5    0  1  0  0
6    0  0  1  0
7    0  0  1  0
8    0  0  1  0
9    1  0  0  0
10   0  0  1  0
11   0  0  1  0
12   0  0  1  0
13   0  0  1  0
14   0  0  1  0
15   0  0  1  0
16   0  1  0  0
17   0  0  1  0
18   0  0  1  0
19   1  0  0  0
20   0  0  1  0
21   0  0  1  0
22   0  1  0  0
23   0  0  1  0
24   0  0  1  0
25   0  0  1  0
26   1  0  0  0
27   0  0  1  0
28   0  1  0  0
29   0  0  1  0
..  .. .. .. ..
861  0  0  1  0
862  0  0  1  0
863  0  0  1  0
864  0  0  1  0
865  0  0  1  0
866  1  0  0  0
867  0  0  1  0
868  0  0  1  0
869  0  0  1  0
870  0  0  1  0
871  0  0  1  0
872  0  0  1  0
873  0  0  1  0
874  1  0  0  0
875  1  0  0  0
876  0  0  1  0
877  0  0  1  0
878  0  0  1  0
879  1  0  0  0
880  0  0  1  0
881  0  0  1  0
882  0  0  1  0
883  0  0  1  0
884  0  0  1  0
885  0  1  0  0
886  0  0  1  0
887  0  0  1  0
888  0  0  1  0

In [8]:
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(x_train, y_train)
y_hat = rfc.predict(x_test)
rfc_acc = accuracy_score(y_test, y_hat)
# write_result(rfc, 2)
print '随机森林：%.3f%%' % rfc_acc

随机森林：0.983%


In [9]:
# XGBoost
data_train = xgb.DMatrix(x_train, label=y_train)
data_test = xgb.DMatrix(x_test, label=y_test)
watch_list = [(data_test, 'eval'), (data_train, 'train')]
param = {'max_depth': 6, 'eta': 0.8, 'silent': 1, 'objective': 'binary:logistic'}
# 'subsample': 1, 'alpha': 0, 'lambda': 0, 'min_child_weight': 1}
bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)
y_hat = bst.predict(data_test)
# write_result(bst, 3)
y_hat[y_hat > 0.5] = 1
y_hat[~(y_hat > 0.5)] = 0
xgb_acc = accuracy_score(y_test, y_hat)

print 'XGBoost：%.3f%%' % xgb_acc

[0]	eval-error:0.151706	train-error:0.129003
[1]	eval-error:0.124776	train-error:0.108051
[2]	eval-error:0.118492	train-error:0.093685
[3]	eval-error:0.091562	train-error:0.068243
[4]	eval-error:0.085278	train-error:0.058366
[5]	eval-error:0.081688	train-error:0.055073
[6]	eval-error:0.073609	train-error:0.048788
[7]	eval-error:0.06912	train-error:0.045795
[8]	eval-error:0.06553	train-error:0.042502
[9]	eval-error:0.06912	train-error:0.042802
[10]	eval-error:0.05386	train-error:0.028435
[11]	eval-error:0.058348	train-error:0.028435
[12]	eval-error:0.050269	train-error:0.026639
[13]	eval-error:0.046679	train-error:0.023346
[14]	eval-error:0.045781	train-error:0.023646
[15]	eval-error:0.047576	train-error:0.024544
[16]	eval-error:0.047576	train-error:0.024544
[17]	eval-error:0.037702	train-error:0.018857
[18]	eval-error:0.032316	train-error:0.016163
[19]	eval-error:0.030521	train-error:0.015265
[20]	eval-error:0.030521	train-error:0.015265
[21]	eval-error:0.025135	train-error:0.012571
[2