In [332]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
import matplotlib.pyplot as plt
from sklearn import preprocessing  

In [333]:
# Load the training and test CSV files (paths are relative to the notebook's working directory).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('input/train.csv', 'input/test.csv')
)
# Keep the test-set passenger ids aside; the column selection below drops them from test_data.
PassengerId = test_data['PassengerId']

In [334]:
# Columns to keep. The test frame has no 'Survived' label, so the training
# column list is the shared feature list with the label prepended.
cols2 = ['Name','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
cols = ['Survived', *cols2]
test_data = test_data[cols2]
train_data = train_data[cols]

In [335]:
# Hold out roughly 10% of the labelled rows as a local test split.
# Each row is kept for training with probability 0.9, so the exact sizes vary
# with the seed. A seeded generator makes the partition identical on every
# Restart & Run All; the previous unseeded np.random.rand call produced a
# different split each time the kernel was restarted.
SPLIT_SEED = 42
msk = np.random.default_rng(SPLIT_SEED).random(len(train_data)) < 0.9
train_data2 = train_data[msk]
test_data2 = train_data[~msk]

In [336]:
# Row counts: full labelled set, training split, held-out split (one per line).
print(len(train_data), len(train_data2), len(test_data2), sep='\n')

891
797
94


In [337]:

# Data-preparation helper: replaces the string columns with numbers and
# returns the feature matrix together with the labels.
def PreprocessData(data):
    """Turn a raw passenger frame into a scaled feature matrix plus labels.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Name', 'Age', 'Fare', 'Sex', 'Embarked'
        and 'Pclass'. 'Survived' is optional. Every other column is passed
        through unchanged as a feature. The caller's frame is not modified.

    Returns
    -------
    scaled_features : numpy.ndarray
        One row per input row. Columns are the pass-through features in
        their original order, followed by Embarked_C, Embarked_Q,
        Embarked_S, Pclass_1, Pclass_2, Pclass_3, each min-max scaled to
        [0, 1] using this frame's own minimum and maximum.
    Labels : pandas.Series or list
        The 'Survived' column when present, otherwise an empty list.

    Notes
    -----
    The one-hot columns are generated from fixed category sets rather than
    from the values that happen to occur in ``data``. Without that, a small
    split lacking one port or class would yield fewer columns and no longer
    line up with the other splits or with the model's input width.
    Values outside the fixed sets (including missing 'Embarked') are encoded
    as all zeros.

    NOTE(review): the imputation means and the scaler are fitted separately
    on every call, so different splits are scaled with different statistics.
    """
    # 'Name' is not used as a feature. drop() returns a new frame, so the
    # column assignments below do not touch the caller's data.
    data = data.drop(['Name'], axis=1)

    # Some 'Age' and 'Fare' values are missing; fill them with the column mean.
    age_mean = data['Age'].mean()
    data['Age'] = data['Age'].fillna(age_mean)
    fare_mean = data['Fare'].mean()
    data['Fare'] = data['Fare'].fillna(fare_mean)

    # Encode the 'Sex' strings as integers.
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Separate the label column when the frame has one.
    if 'Survived' in data.columns:
        Labels = data['Survived']
        _data = data.drop('Survived', axis=1)
    else:
        Labels = []
        _data = data

    # Pin the category sets so get_dummies always emits the same six one-hot
    # columns in the same order, regardless of which values this frame contains.
    _data = _data.assign(
        Embarked=pd.Categorical(_data['Embarked'], categories=['C', 'Q', 'S']),
        Pclass=pd.Categorical(_data['Pclass'], categories=[1, 2, 3]),
    )
    # dtype=float keeps the resulting matrix purely numeric instead of mixing
    # boolean dummy columns with the numeric features.
    data_OneHot = pd.get_dummies(
        data=_data, columns=['Embarked', 'Pclass'], dtype=float
    )

    # Plain ndarray of feature values.
    Features = data_OneHot.values

    # Rescale every feature column to the [0, 1] range.
    minmax_scaled = preprocessing.MinMaxScaler(feature_range=(0, 1))
    scaled_features = minmax_scaled.fit_transform(Features)

    return scaled_features, Labels

In [338]:
# Preprocess each frame with its own PreprocessData call: the training split,
# the held-out split, and the unlabelled test set.
train_features,train_labels = PreprocessData(train_data2)
test_features,test_labels = PreprocessData(test_data2)
# NOTE(review): this rebinds test_data from the raw DataFrame to the returned
# feature array, so re-running this cell without reloading the data presumably
# fails inside PreprocessData (it calls data.drop) — TODO consider a new name.
test_data, _= PreprocessData(test_data)

In [339]:
# Fully connected binary classifier; layers are listed in forward order.
model = Sequential([
    # Hidden layer 1: 40 ReLU units fed by the 11 input features, uniform weight initialisation.
    Dense(units=40, input_dim=11, kernel_initializer='uniform', activation='relu'),
    # Hidden layer 2: 30 ReLU units.
    Dense(units=30, kernel_initializer='uniform', activation='relu'),
    # Output layer: a single sigmoid unit.
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
])

In [340]:
# Training configuration: binary cross-entropy loss, Adam optimizer,
# accuracy reported as the metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit for 30 epochs with batches of 64 rows, using 10% of the training rows
# for validation; verbose=2 prints one summary line per epoch.
train_history = model.fit(
    train_features,
    train_labels,
    validation_split=0.1,
    epochs=30,
    batch_size=64,
    verbose=2,
)


Epoch 1/30
12/12 - 1s - loss: 0.6921 - accuracy: 0.6025 - val_loss: 0.6903 - val_accuracy: 0.6125 - 669ms/epoch - 56ms/step
Epoch 2/30
12/12 - 0s - loss: 0.6886 - accuracy: 0.6165 - val_loss: 0.6851 - val_accuracy: 0.6125 - 36ms/epoch - 3ms/step
Epoch 3/30
12/12 - 0s - loss: 0.6808 - accuracy: 0.6165 - val_loss: 0.6723 - val_accuracy: 0.6125 - 36ms/epoch - 3ms/step
Epoch 4/30
12/12 - 0s - loss: 0.6658 - accuracy: 0.6165 - val_loss: 0.6476 - val_accuracy: 0.6125 - 34ms/epoch - 3ms/step
Epoch 5/30
12/12 - 0s - loss: 0.6391 - accuracy: 0.6290 - val_loss: 0.6108 - val_accuracy: 0.6500 - 37ms/epoch - 3ms/step
Epoch 6/30
12/12 - 0s - loss: 0.6065 - accuracy: 0.6569 - val_loss: 0.5727 - val_accuracy: 0.6625 - 34ms/epoch - 3ms/step
Epoch 7/30
12/12 - 0s - loss: 0.5731 - accuracy: 0.6890 - val_loss: 0.5410 - val_accuracy: 0.7750 - 35ms/epoch - 3ms/step
Epoch 8/30
12/12 - 0s - loss: 0.5461 - accuracy: 0.7490 - val_loss: 0.5186 - val_accuracy: 0.7875 - 33ms/epoch - 3ms/step
Epoch 9/30
12/12 - 0s 

In [341]:
# Score the model on the held-out split. With the single 'accuracy' metric
# configured at compile time, index 1 of the result is the accuracy.
scores = model.evaluate(test_features, test_labels)
test_accuracy = scores[1]
print(test_accuracy)

0.8085106611251831


In [342]:
# Run the trained model on the preprocessed test features; the single sigmoid
# output unit produces one value per input row.
result = model.predict(test_data)



In [343]:
# Flatten the predictions, threshold them at 0.5 to get 0/1 labels, and pair
# them with the passenger ids saved when the test file was loaded.
survived = (result.reshape(-1) >= 0.5).astype('int64')
df = pd.DataFrame({'PassengerId': PassengerId, 'Survived': survived})

In [344]:
# Display the PassengerId / Survived prediction frame.
df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [345]:
# Save the predictions to predict.csv, omitting the DataFrame index column.
df.to_csv('predict.csv',index=False)