# 在iris数据集上sklearn分类器比较
&emsp;&emsp;默认参数下sklearn分类器的准确度表：
* 最耗时的是LogisticRegressionCV(逻辑回归)、AdaBoostClassifier(弱分类投票)、GradientBoostingClassifier(迭代决策树)
* 准确率最低的是PassiveAggressiveClassifier(被动进攻)、SGDClassifier(随机梯度下降)
* 准确率最高的是SVC(支持向量机)、KNeighborsClassifier(K近邻)、DecisionTreeClassifier(决策树)、GaussianProcessClassifier(高斯过程)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import *
from sklearn.tree import *
from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.gaussian_process import *
from sklearn.svm import *
from sklearn.naive_bayes import *
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import pandas as pd
import time

class dataset(object):
    """Describe a data file and which of its columns are used for learning.

    Attributes:
        filename: path of the data file.
        columns: names of all columns in the file.
        classcol: name of the column holding the class label.
        datacol: names of the columns used as features.
    """

    def __init__(self, filename, columns, classcol, datacol):
        # Where the data lives and how its columns are named.
        self.filename, self.columns = filename, columns
        # Which column is the target and which columns are the inputs.
        self.classcol, self.datacol = classcol, datacol

# Classifiers under comparison, every one constructed with default parameters.
clfs = [
    LogisticRegression(),
    LogisticRegressionCV(),
    LinearSVC(),
    NuSVC(),
    SVC(),
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    KNeighborsClassifier(),
    NearestCentroid(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),      # single randomized tree (sklearn.tree)
    AdaBoostClassifier(),
    BaggingClassifier(),
    # The original list repeated ExtraTreeClassifier here; the ensemble
    # variant is used instead so each entry is a distinct estimator.
    ExtraTreesClassifier(),     # forest of randomized trees (sklearn.ensemble)
    GradientBoostingClassifier(),
    RandomForestClassifier(),
    GaussianProcessClassifier(),
    BernoulliNB(),
    GaussianNB(),
]

# One pipeline per classifier, in the same order as ``clfs``, so that every
# model gets the same StandardScaler preprocessing step before classifying.
pipes = [Pipeline([('sc', StandardScaler()), ('clf', clf)]) for clf in clfs]

def learn_from_data(dataset):
    """Score every pipeline on one train/test split of a data file.

    The file ``dataset.filename`` is read as CSV without a header row and
    its columns are renamed to ``dataset.columns``. ``dataset.classcol``
    selects the labels and ``dataset.datacol`` the features. One split is
    drawn, holding out 10% of the rows for testing, and it is reused for
    all classifiers.

    Each entry of the module-level ``pipes`` is fitted and scored
    ``testnum`` times on that split. For every classifier one line is
    printed with its type, the lowest accuracy observed and the elapsed
    time of all repetitions (formatted as whole seconds).
    """
    path, names, label_col, feature_cols = (
        dataset.filename,
        dataset.columns,
        dataset.classcol,
        dataset.datacol,
    )

    frame = pd.read_csv(path, header=None)
    frame.columns = names
    labels = frame[label_col].values
    features = frame[feature_cols].values

    # Hold out 10% of the samples as the test set.
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=0.1)

    for clf, pipe in zip(clfs, pipes):
        # Track the worst accuracy over all repetitions; it is the figure
        # of interest for later tuning.
        worst = 1.0
        began = time.time()
        for _ in range(testnum):
            pipe.fit(train_X, train_y)
            score = accuracy_score(test_y, pipe.predict(test_X))
            worst = min(worst, score)
        elapsed = time.time() - began
        print('Accuraty:%s score=%.2f time=%d' % (type(clf), worst, elapsed))

# Number of fit/score repetitions per classifier; read by learn_from_data.
testnum = 100

# Iris data file: four measurement columns followed by the class label.
irisdataset = dataset(
    filename='../dataset/iris_dataset/iris.data',
    columns=['Sepal length', 'Sepal width', 'Petal length', 'Petal width', 'Class label'],
    classcol='Class label',
    datacol=['Sepal length', 'Sepal width', 'Petal length', 'Petal width']
)

# Run the comparison only when executed as a script, not on import.
if __name__ == '__main__':
    learn_from_data(irisdataset)

Accuraty:<class 'sklearn.linear_model.logistic.LogisticRegression'> score=1.00 time=0
Accuraty:<class 'sklearn.linear_model.logistic.LogisticRegressionCV'> score=1.00 time=17
Accuraty:<class 'sklearn.svm.classes.LinearSVC'> score=1.00 time=0
Accuraty:<class 'sklearn.svm.classes.NuSVC'> score=1.00 time=0
Accuraty:<class 'sklearn.svm.classes.SVC'> score=1.00 time=0
Accuraty:<class 'sklearn.linear_model.passive_aggressive.PassiveAggressiveClassifier'> score=0.47 time=0
Accuraty:<class 'sklearn.linear_model.ridge.RidgeClassifier'> score=0.93 time=0
Accuraty:<class 'sklearn.linear_model.ridge.RidgeClassifierCV'> score=0.93 time=0
Accuraty:<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> score=0.67 time=0
Accuraty:<class 'sklearn.neighbors.classification.KNeighborsClassifier'> score=1.00 time=0
Accuraty:<class 'sklearn.neighbors.nearest_centroid.NearestCentroid'> score=0.93 time=0
Accuraty:<class 'sklearn.tree.tree.DecisionTreeClassifier'> score=0.93 time=0
Accuraty:<class 's