## from sklearn.svm import SVC/SVR
https://scikit-learn.org/stable/modules/svm.html
![image.png](attachment:image.png)

In [15]:
from sklearn.svm import SVR
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [7]:
data = load_boston()
data.data.shape,data.target.shape

((506, 13), (506,))

In [11]:
X_train,X_test,y_train,y_test = train_test_split(data.data,data.target,test_size=0.2)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((404, 13), (102, 13), (404,), (102,))

In [17]:
model = SVR()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
metrics.mean_absolute_error(y_test,y_pred)

6.454223832487549

![image.png](attachment:image.png)

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm  详细的参数

## 这里开始对Titanic号的存活率的预测

In [19]:
import pandas as pd
import numpy as np

def read_dataset(fname):
#     指定第一列作为行索引
    data = pd.read_csv(fname, index_col=0)
#     丢弃无用数据
    data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
#     处理性别数据
    lables = data['Sex'].unique().tolist()#变成列表的意思
    data['Sex'] = [*map(lambda x: lables.index(x) , data['Sex'])]
#     处理登船港口数据## sklearn里面的labelencoder
    lables = data['Embarked'].unique().tolist()
    data['Embarked'] = data['Embarked'].apply(lambda n: lables.index(n))
#     处理缺失数据填充0
    data = data.fillna(0)
    return data
train = read_dataset("data/train.csv")
import warnings
warnings.filterwarnings("ignore")#过滤掉警告的意思

In [20]:
from sklearn.model_selection import train_test_split#数据拆分的函数

y = train['Survived'].values#标签
X = train.drop(['Survived'], axis=1).values#特征

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)#随机选择百分之二十作为验证测试用
print("X_train_shape:", X_train.shape, " y_train_shape:", y_train.shape)
print("X_test_shape:", X_test.shape,"  y_test_shape:", y_test.shape)

X_train_shape: (712, 7)  y_train_shape: (712,)
X_test_shape: (179, 7)   y_test_shape: (179,)


In [21]:
from sklearn.svm import SVC#支持向量机分类器
model = SVC()#实例化
model.fit(X_train,y_train)#拟合
print("train score:", model.score(X_train, y_train))#过拟合
print("test score:", model.score(X_test, y_test))#

train score: 0.9129213483146067
test score: 0.7094972067039106


## 交叉验证调参

In [22]:
from sklearn.model_selection import GridSearchCV#网格搜索最优参数
from sklearn import metrics#评分模块
from sklearn.externals import joblib##模型保存模块

In [23]:
degree=[1,2,3,4,5,6,7]
param_grid = {"degree":degree}##字典形式

In [24]:
model = GridSearchCV(SVC(),param_grid,cv=5)#模型（单个也可以是多个，参数范围，交叉验证五次）

In [25]:
model.fit(X_train,y_train)#模型的训练

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'degree': [1, 2, 3, 4, 5, 6, 7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [57]:
model.best_params_,model.best_score_

({'degree': 1}, 0.6685393258426966)

In [58]:
model.score(X_test,y_test)

0.6480446927374302

In [64]:
metrics.accuracy_score(y_test,model.predict(X_test))

0.6480446927374302

In [65]:
metrics.f1_score(y_test,model.predict(X_test))#查准率、精准率等等

0.5467625899280575

## 开始真正的做测试

In [59]:
test = read_dataset("data/test.csv")#测试集一样的要做处理

In [60]:
model.predict(test)

array([0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,

In [61]:
submission = pd.DataFrame({'PassengerId': test.index, 'Survived': model.predict(test)})#以字典的形式来建立da

In [62]:
submission.to_csv("submission.csv",index=False)#转换成csv文件

In [63]:
pd.read_csv("submission.csv")#提交文件要科学上网

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,0
4,896,0
5,897,1
6,898,0
7,899,1
8,900,1
9,901,1
