In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
#import pandas_profiling as ppf  #eda
from sklearn.preprocessing import LabelEncoder #标签编码
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split #数据集的划分
from sklearn.neural_network import MLPClassifier #多层感知机
from sklearn import metrics
from sklearn.metrics import mean_absolute_error #评估函数
%matplotlib inline

In [2]:
def read_and_process(file):
    data = pd.read_csv(file)
    # 缺失值处理
    data['Age'] = data['Age'].fillna(np.mean(data['Age']))
    data['Fare'] = data['Fare'].fillna(np.mean(data['Fare']))
    
    #将字符串做成onehot数值型
    data_dummy = pd.get_dummies(data[['Sex', 'Embarked']])
    data = data.drop(labels=['Embarked', 'Sex'], axis=1)
    data = data.join(data_dummy)
    
    #删除没必要的列
    data = data.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1)
    
    #数据归一化
    stdsc = StandardScaler()
    data['Age'] = stdsc.fit_transform(np.array(data['Age']).reshape(-1, 1))
    data['Fare'] = stdsc.fit_transform(np.array(data['Fare']).reshape(-1, 1))
    
    return data

In [3]:
test = pd.read_csv('data/test.csv')
test_PassengerId = test['PassengerId']

In [4]:
train = read_and_process('data/train.csv')
test = read_and_process('data/test.csv')

## 数据建模
### 1.划分数据集

In [5]:
x = train.drop('Survived', axis=1)
y = train['Survived']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [6]:
x_train.shape, y_train.shape

((712, 10), (712,))

### 2.搭建模型，训练

In [22]:
model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, 
                      solver='adam', tol=1e-4, early_stopping=True)
model.fit(x_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=1000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [23]:
print('train score ', model.score(x_train, y_train)) #过拟合
print('test score ', model.score(x_test, y_test))

train score  0.8033707865168539
test score  0.7932960893854749


### 3.优化模型 交叉验证调参

In [38]:
from sklearn.model_selection import GridSearchCV
import joblib # 模型保存模块

In [39]:
hidden_layer_sizes=[(a,) for a in range(10, 200, 30)]
param_grid = {"hidden_layer_sizes":hidden_layer_sizes}##字典形式

In [40]:
#模型（单个也可以是多个，参数范围，交叉验证五次）
model = GridSearchCV(MLPClassifier(max_iter=1000, solver='adam', 
                                   tol=1e-4, early_stopping=True),
                     param_grid,cv=5)
model.fit(x_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=True,
                                     epsilon=1e-08, hidden_layer_sizes=(100,),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_fun=15000,
                                     max_iter=1000, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=None, shuffle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=False,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=

In [41]:
model.best_params_,model.best_score_

({'hidden_layer_sizes': (160,)}, 0.7963360583078894)

In [42]:
model.score(x_test,y_test)

0.7653631284916201

In [43]:
metrics.f1_score(y_test,model.predict(x_test))#查准率、精准率等等

0.6666666666666666

### 3.提交

In [44]:
predict = model.predict(test)

In [45]:
submission = pd.DataFrame({'PassengerId':test_PassengerId, 'Survived': predict})

In [46]:
submission.to_csv('submission.csv', index=False)

In [47]:
pd.read_csv('submission.csv')

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
