In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder #标签编码
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split #数据集的划分
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics
%matplotlib inline

In [13]:
def read_and_process(file):
    data = pd.read_csv(file)
    # 缺失值处理
    data['Age'] = data['Age'].fillna(np.mean(data['Age']))
    data['Fare'] = data['Fare'].fillna(np.mean(data['Fare']))
    
    #将字符串做成onehot数值型
    data_dummy = pd.get_dummies(data[['Sex', 'Embarked']])
    data = data.drop(labels=['Embarked', 'Sex'], axis=1)
    data = data.join(data_dummy)
    
    #删除没必要的列
    data = data.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1)
    
    #数据归一化
    stdsc = StandardScaler()
    data['Age'] = stdsc.fit_transform(np.array(data['Age']).reshape(-1, 1))
    data['Fare'] = stdsc.fit_transform(np.array(data['Fare']).reshape(-1, 1))
    
    return data

In [14]:
test = pd.read_csv('data/test.csv')
test_PassengerId = test['PassengerId']

In [15]:
train = read_and_process('data/train.csv')
test = read_and_process('data/test.csv')

## 数据建模
### 1.划分数据集

In [16]:
x = train.drop('Survived', axis=1)
y = train['Survived']

In [17]:
x.shape, y.shape

((891, 10), (891,))

### 降维

In [26]:
# 基于树模型的特征选择
clf = RandomForestClassifier(max_depth=5, min_impurity_decrease=0.001).fit(x, y)
clf.feature_importances_
x_clf = SelectFromModel(clf, prefit=True).transform(x)
test_clf = SelectFromModel(clf,prefit=True).transform(test)
x_clf.shape #(891, 4)

(891, 4)

In [27]:
x_input = x_clf
x_train,x_test,y_train,y_test = train_test_split(x_input, y, test_size=0.2)

### 2.搭建模型，训练

In [29]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=5, min_impurity_decrease=0.001)
model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.001, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [98]:
print('train score ', model.score(x_train, y_train)) #过拟合
print('test score ', model.score(x_test, y_test))

train score  0.8300561797752809
test score  0.888268156424581


### 3.优化模型 交叉验证调参

In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
import joblib # 模型保存模块

In [57]:
depth_list = np.arange(4, 8)
min_inpurity_list = np.linspace(0, 0.1, 10)
param_grid = {"max_depth":depth_list, 
              "min_impurity_decrease":min_inpurity_list,
             }

In [58]:
# ll_list = [0.1, 0.01, 0.001]
# estimator_list = np.arange(10, 60, 10)
# depth_list = [3, 5, 8]
# param_grid = {"max_depth":depth_list, 
#               "n_estimators":estimator_list,
#               "learning_rate": ll_list
#              }

In [59]:
#模型（单个也可以是多个，参数范围，交叉验证五次）
model = GridSearchCV(RandomForestClassifier(),param_grid,cv=5)
# model = GridSearchCV(GradientBoostingClassifier(), param_grid)
model.fit(x_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [60]:
model.best_params_,model.best_score_

({'max_depth': 5, 'min_impurity_decrease': 0.0}, 0.8047966118388654)

In [61]:
model.score(x_test,y_test)

0.8379888268156425

In [62]:
metrics.f1_score(y_test,model.predict(x_test))#查准率、精准率等等

0.7010309278350515

### 3.提交

In [110]:
predict = model.predict(test_clf)

In [111]:
submission = pd.DataFrame({'PassengerId':test_PassengerId, 'Survived': predict})

In [112]:
submission.to_csv('submission.csv', index=False)

In [113]:
pd.read_csv('submission.csv')

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
