1.数据读入及预处理

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the training data.
data = pd.read_csv("data/train.csv")
# Print column dtypes and non-null counts.
data.info()

# Encode Sex as 1 for 'male' and 0 for every other value.
data['Sex'] = (data['Sex'] == 'male').astype(int)
# Replace every missing value (in any column) with 0.
data = data.fillna(0)
# Features used for classification.
dataset_X = data[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]
# .values replaces DataFrame.as_matrix(), which is deprecated and was removed
# in pandas 1.0; it yields the same NumPy array.
dataset_X = dataset_X.values

# The two classes are "survived" and "deceased"; 'Survived' labels one of them.
# Add a 'Deceased' column, the logical negation of 'Survived', so that the
# label for each record is a two-column one-hot row [Deceased, Survived].
data['Deceased'] = data['Survived'].apply(lambda s : int(not s))
dataset_Y = data[['Deceased', 'Survived']]
dataset_Y = dataset_Y.values
# Hold out 20% of the training data for validation; random_state fixes the split.
X_train, X_val, y_train, y_val = train_test_split(dataset_X, dataset_Y, test_size=0.2, random_state=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


  app.launch_new_instance()


2.构建计算图，采用逻辑回归进行构建

In [2]:

import tensorflow as tf
# Input placeholders.
# The first dimension is None so any number of records can be fed at once;
# each record has 6 features and a 2-element one-hot label.
X = tf.placeholder(tf.float32, shape=[None, 6])
y = tf.placeholder(tf.float32, shape=[None, 2])

# Trainable parameters: weight matrix W and bias vector.
W = tf.Variable(tf.random_normal([6, 2]), name='weights')
bias = tf.Variable(tf.zeros([2]), name='bias')

# Forward pass: linear scores followed by softmax to get class probabilities.
logits = tf.matmul(X, W) + bias
y_pred = tf.nn.softmax(logits)

# Cost: per-record cross entropy, averaged over the batch.
# The 1e-10 term keeps tf.log away from log(0).
# `axis` replaces the deprecated `reduction_indices` keyword.
cross_entropy = -tf.reduce_sum(y * tf.log(y_pred + 1e-10), axis=1)
cost = tf.reduce_mean(cross_entropy)

# Optimizer: gradient descent with learning rate 0.001.
train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)

  from ._conv import register_converters as _register_converters


3.构建训练迭代过程

In [3]:
# Checkpoint writer/reader for the graph variables.
saver = tf.train.Saver()

NUM_EPOCHS = 100
CHECKPOINT_PATH = "save/model.ckpt"

with tf.Session() as sess:
    tf.global_variables_initializer().run()

    # Training loop: NUM_EPOCHS passes over the training set,
    # feeding one record per optimization step.
    for epoch in range(NUM_EPOCHS):
        total_loss = 0.
        for i in range(len(X_train)):
            feed = {X: [X_train[i]], y: [y_train[i]]}
            # sess.run executes the update op and fetches the loss for this record.
            _, loss = sess.run([train_op, cost], feed_dict=feed)
            total_loss += loss
        print('Epoch: %04d, total loss=%.9f' % (epoch + 1, total_loss))
    print('Training complete!')

    # Validation accuracy: fraction of records whose arg-max predicted class
    # matches the arg-max of the one-hot label.
    pred = sess.run(y_pred, feed_dict={X: X_val})
    correct = np.equal(np.argmax(pred, 1), np.argmax(y_val, 1))
    accuracy = np.mean(correct.astype(np.float32))
    print("Accuracy on validation set: %.9f" % accuracy)

    # Saver.save raises if the parent directory is missing, so create it first
    # (MakeDirs succeeds if the directory already exists).
    tf.gfile.MakeDirs("save")
    save_path = saver.save(sess, CHECKPOINT_PATH)
    

Epoch: 0001, total loss=2185.383065543
Epoch: 0002, total loss=1399.061143801
Epoch: 0003, total loss=1381.790588486
Epoch: 0004, total loss=1367.578045154
Epoch: 0005, total loss=1355.553322729
Epoch: 0006, total loss=1344.684032992
Epoch: 0007, total loss=1334.926935668
Epoch: 0008, total loss=1325.748763263
Epoch: 0009, total loss=1316.905705124
Epoch: 0010, total loss=1308.393036272
Epoch: 0011, total loss=1300.311208090
Epoch: 0012, total loss=1292.830575945
Epoch: 0013, total loss=1286.138685631
Epoch: 0014, total loss=1280.230724424
Epoch: 0015, total loss=1274.808053114
Epoch: 0016, total loss=1269.679954715
Epoch: 0017, total loss=1264.822800308
Epoch: 0018, total loss=1260.253172231
Epoch: 0019, total loss=1255.990858019
Epoch: 0020, total loss=1252.050416565
Epoch: 0021, total loss=1248.434137513
Epoch: 0022, total loss=1245.128064301
Epoch: 0023, total loss=1242.101214024
Epoch: 0024, total loss=1239.303256493
Epoch: 0025, total loss=1236.671945274
Epoch: 0026, total loss=1

4.预测测试数据

In [4]:
# Load the test set and apply the same preprocessing as for the training data:
# fill missing values with 0 and encode Sex as 1 ('male') / 0 (otherwise).
testdata = pd.read_csv('data/test.csv')
testdata = testdata.fillna(0)

testdata['Sex'] = testdata['Sex'].apply(lambda s: 1 if s== 'male' else 0)
X_test = testdata[['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']]

with tf.Session() as sess2:
    # Restore the trained variables from the checkpoint. No separate
    # global_variables_initializer() run is needed: restore assigns every
    # variable tracked by the saver.
    saver.restore(sess2, save_path)
    # Forward pass; feed a plain ndarray, as was done for the training data.
    # The arg-max column index is the predicted class (column 1 = Survived).
    predictions = np.argmax(sess2.run(y_pred, feed_dict={X: X_test.values}), 1)

# Build the submission table and write it as CSV (no session needed for this).
submission = pd.DataFrame({
    "PassengerId": testdata["PassengerId"],
    "Survived": predictions
})
submission.to_csv("data/titanic_submission.csv", index=False)

INFO:tensorflow:Restoring parameters from save/model.ckpt
