In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
#import pandas_profiling as ppf  #eda
from sklearn.preprocessing import LabelEncoder #标签编码
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split #数据集的划分
from sklearn.svm import SVR, SVC
from sklearn import metrics
from sklearn.metrics import mean_absolute_error #评估函数
%matplotlib inline

In [20]:
def read_and_process(file):
    data = pd.read_csv(file)
    # 缺失值处理
    data['Age'] = data['Age'].fillna(np.mean(data['Age']))
    data['Fare'] = data['Fare'].fillna(np.mean(data['Fare']))
    
    #将字符串做成onehot数值型
    data_dummy = pd.get_dummies(data[['Sex', 'Embarked']])
    data = data.drop(labels=['Embarked', 'Sex'], axis=1)
    data = data.join(data_dummy)
    
    #删除没必要的列
    data = data.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1)
    
    #数据归一化
    stdsc = StandardScaler()
    data['Age'] = stdsc.fit_transform(np.array(data['Age']).reshape(-1, 1))
    data['Fare'] = stdsc.fit_transform(np.array(data['Fare']).reshape(-1, 1))
    
    return data

In [21]:
test = pd.read_csv('data/test.csv')
test_PassengerId = test['PassengerId']

In [22]:
train = read_and_process('data/train.csv')
test = read_and_process('data/test.csv')

## 数据建模
### 1.划分数据集

In [23]:
x = train.drop('Survived', axis=1)
y = train['Survived']
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [24]:
x_train.shape, y_train.shape

((712, 10), (712,))

### 2.搭建模型，训练

In [25]:
model = SVC()
model.fit(x_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [26]:
print('train score ', model.score(x_train, y_train)) #过拟合
print('test score ', model.score(x_test, y_test))

train score  0.8328651685393258
test score  0.8100558659217877


### 3.优化模型 交叉验证调参

In [27]:
from sklearn.model_selection import GridSearchCV
import joblib # 模型保存模块

In [28]:
degree=[1,2,3,4,5,6,7]
param_grid = {"degree":degree}##字典形式

In [29]:
#模型（单个也可以是多个，参数范围，交叉验证五次）
model = GridSearchCV(SVC(),param_grid,cv=5)
model.fit(x_train,y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'degree': [1, 2, 3, 4, 5, 6, 7]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [30]:
model.best_params_,model.best_score_

({'degree': 1}, 0.8272431793558553)

In [31]:
model.score(x_test,y_test)

0.8100558659217877

In [32]:
metrics.f1_score(y_test,model.predict(x_test))#查准率、精准率等等

0.7424242424242424

### 3.提交

In [33]:
predict = model.predict(test)

In [34]:
submission = pd.DataFrame({'PassengerId':test_PassengerId, 'Survived': predict})

In [35]:
submission.to_csv('submission.csv', index=False)

In [36]:
pd.read_csv('submission.csv')

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
