In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
#import pandas_profiling as ppf  #eda
from sklearn.preprocessing import LabelEncoder #标签编码
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split #数据集的划分
from sklearn.neural_network import MLPClassifier #多层感知机
from sklearn import metrics
from sklearn.metrics import mean_absolute_error #评估函数
%matplotlib inline

In [2]:
def read_and_process(file):
    data = pd.read_csv(file)
    # 缺失值处理
    data['Age'] = data['Age'].fillna(np.mean(data['Age']))
    data['Fare'] = data['Fare'].fillna(np.mean(data['Fare']))
    
    #将字符串做成onehot数值型
    data_dummy = pd.get_dummies(data[['Sex', 'Embarked']])
    data = data.drop(labels=['Embarked', 'Sex'], axis=1)
    data = data.join(data_dummy)
    
    #删除没必要的列
    data = data.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1)
    
    #数据归一化
    stdsc = StandardScaler()
    data['Age'] = stdsc.fit_transform(np.array(data['Age']).reshape(-1, 1))
    data['Fare'] = stdsc.fit_transform(np.array(data['Fare']).reshape(-1, 1))
    
    return data

In [3]:
test = pd.read_csv('data/test.csv')
test_PassengerId = test['PassengerId']

In [4]:
train = read_and_process('data/train.csv')
test = read_and_process('data/test.csv')

## 数据建模
### 1.划分数据集

In [5]:
x = train.drop('Survived', axis=1)
y = train['Survived']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [6]:
x_train.shape, y_train.shape

((712, 10), (712,))

### 2.搭建模型，训练

In [44]:
model = MLPClassifier(hidden_layer_sizes=(90,), max_iter=10000, 
                      solver='adam', tol=1e-4, early_stopping=True, 
                      learning_rate='adaptive')
model.fit(x_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(90,), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=10000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [45]:
print('train score ', model.score(x_train, y_train)) #过拟合
print('test score ', model.score(x_test, y_test))

train score  0.8089887640449438
test score  0.776536312849162


In [46]:
metrics.f1_score(y_test,model.predict(x_test))#查准率、精准率等等

0.7014925373134329

In [47]:
metrics.accuracy_score(y_test,model.predict(x_test))

0.776536312849162

通过对分类阈值θ（默认0.5）从大到小或者从小到大依次取值，我们可以得到很多组TPR和FPR的值，将其在图像中依次画出就可以得到一条ROC曲线，阈值θ取值范围为[0,1]。

　　ROC曲线在图像上越接近左上角(0,1)模型越好，即ROC曲线下面与横轴和直线FPR = 1围成的面积（AUC值）越大越好。直观上理解，纵坐标TPR就是recallpositive值，横坐标FPR就是(1 - recallnegative)，前者越大越好，后者整体越小越好，在图像上表示就是曲线越接近左上角(0,1)坐标越好。

　　图展示了一个模型的ROC曲线，AUC值由定义通过计算ROC曲线、横轴和直线FPR = 1三者围成的面积即可得到。

In [48]:
metrics.roc_auc_score(y_test,model.predict(x_test), average='macro', sample_weight=None)

0.7587615283267457

### 3.提交

In [44]:
predict = model.predict(test)

In [45]:
submission = pd.DataFrame({'PassengerId':test_PassengerId, 'Survived': predict})

In [46]:
submission.to_csv('submission.csv', index=False)

In [47]:
pd.read_csv('submission.csv')

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
