# 在adult数据集上sklearn分类器比较
&emsp;&emsp;默认参数下sklearn分类器的准确度表：
* 最耗时的是SVC(支持向量机)
* 准确率最低的是PassiveAggressiveClassifier、NearestCentroid
* 准确率最高的是AdaBoostClassifier、GradientBoostingClassifier、RandomForestClassifier 

In [12]:
#encoding=utf8
from sklearn.pipeline import Pipeline
from sklearn.linear_model import *
from sklearn.tree import *
from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.gaussian_process import *
from sklearn.svm import *
from sklearn.naive_bayes import *
from sklearn.preprocessing import *
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import *
from sklearn.model_selection import *

import matplotlib.pyplot as plt
import pandas as pd
import time

class dataset(object):
    """Describe a CSV dataset and the role each of its columns plays.

    The object only stores the values it is given; nothing is read from
    disk here.
    """

    def __init__(self, filename, columns, classcol, datacol, encodecol=None):
        """Store the dataset description.

        Args:
            filename: Path of the CSV file.
            columns: Names of all columns, in file order.
            classcol: Name of the class (label) column.
            datacol: Names of the columns used as features.
            encodecol: Features that must be label-encoded. As used by
                ``learn_from_data`` these are positional indices into
                ``datacol``. Defaults to a new empty list.
        """
        self.filename = filename
        self.columns = columns      # names of all columns
        self.classcol = classcol    # name of the class column
        self.datacol = datacol      # names of the feature columns
        # Build a fresh list per instance: a ``[]`` default in the signature
        # would be shared, so appending to one dataset's ``encodecol`` would
        # silently change every other dataset created without the argument.
        self.encodecol = [] if encodecol is None else encodecol  # features to encode
# Each entry is [estimator, param_grid]. The second member is the parameter
# grid used for tuning; an empty dict leaves nothing to search over.
clfs = [
    [ LogisticRegression(max_iter=1000), {'C': [0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0], 'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']} ],
    [ LogisticRegressionCV(), {} ],
    [ LinearSVC(), {'C': [0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0]} ],
#   [ NuSVC(), {} ],
    [ SVC(), {'C': [0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0]} ],
    [ PassiveAggressiveClassifier(), {} ],
    [ RidgeClassifier(), {} ],
    [ RidgeClassifierCV(), {} ],
    [ SGDClassifier(), {} ],
    [ KNeighborsClassifier(n_neighbors=20), {} ],
    [ NearestCentroid(), {} ],
    [ DecisionTreeClassifier(), {} ],
    [ ExtraTreeClassifier(), {} ],
    [ AdaBoostClassifier(), {} ],
    [ BaggingClassifier(), {} ],
    # The ensemble ExtraTreesClassifier, not a second copy of the single-tree
    # ExtraTreeClassifier that is already listed above.
    [ ExtraTreesClassifier(), {} ],
    [ GradientBoostingClassifier(), {} ],
    [ RandomForestClassifier(), {} ],
#   [ GaussianProcessClassifier(), {} ],
    [ BernoulliNB(), {} ],
    [ GaussianNB(), {} ],
    [ MLPClassifier(), {} ],
]

# One pipeline per entry of clfs, so every estimator gets the same input
# scaling: standardise the features, then grid-search the estimator over its
# parameter grid.
pipes = [
    Pipeline([
        ['sc', StandardScaler()],
        ['clf', GridSearchCV(estimator, param_grid=grid)],
    ])
    for estimator, grid in clfs
]

def learn_from_data(dataset):
    """Fit every pipeline in ``pipes`` on one CSV dataset and print its score.

    The file ``dataset.filename`` is read without a header row and given
    ``dataset.columns`` as column names. ``dataset.classcol`` selects the
    labels and ``dataset.datacol`` the features; the feature columns whose
    positions are listed in ``dataset.encodecol`` are label-encoded. The
    rows are split once, with 10% held out for testing.

    Each pipeline is fitted and scored ``testnum`` times on that split, and
    the lowest accuracy is printed with the elapsed time. Relies on the
    module-level ``clfs``, ``pipes`` and ``testnum``. Returns nothing.
    """
    frame = pd.read_csv(dataset.filename, header=None)
    frame.columns = dataset.columns
    y = frame[dataset.classcol].values
    X = frame[dataset.datacol].values

    # Overwrite each listed feature column with its integer label codes.
    for col in dataset.encodecol:
        X[:, col] = LabelEncoder().fit_transform(X[:, col])

    # The test set is 10% of the rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    for index, entry in enumerate(clfs):
        pipe = pipes[index]
        worst = 1.0  # lowest accuracy seen so far, kept for later tuning
        started = time.time()
        for _ in range(testnum):
            pipe.fit(X_train, y_train)
            predicted = pipe.predict(X_test)
            worst = min(worst, accuracy_score(y_test, predicted))
        elapsed = time.time() - started
        print('Accuraty:%s score=%.2f time=%d' % (type(entry[0]), worst, elapsed))

# Number of times each pipeline is fitted and scored; read by learn_from_data.
testnum = 1

# Column names of the adult CSV in file order; the last one is the label.
_ADULT_COLUMNS = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'salary',
]

adultdataset = dataset(
    filename='../dataset/adult_dataset/adult.data',
    columns=_ADULT_COLUMNS,
    classcol='salary',
    # Every column except the trailing label is used as a feature.
    datacol=_ADULT_COLUMNS[:-1],
    # Positions (within the feature columns) of the string-valued features.
    encodecol=[1, 3, 5, 6, 7, 8, 9, 13],
)

if __name__ == '__main__':
    learn_from_data(adultdataset)

Accuraty:<class 'sklearn.linear_model.logistic.LogisticRegression'> score=0.81 time=3
Accuraty:<class 'sklearn.linear_model.logistic.LogisticRegressionCV'> score=0.81 time=0
Accuraty:<class 'sklearn.svm.classes.LinearSVC'> score=0.80 time=4
Accuraty:<class 'sklearn.svm.classes.SVC'> score=0.83 time=10
Accuraty:<class 'sklearn.linear_model.passive_aggressive.PassiveAggressiveClassifier'> score=0.70 time=0
Accuraty:<class 'sklearn.linear_model.ridge.RidgeClassifier'> score=0.79 time=0
Accuraty:<class 'sklearn.linear_model.ridge.RidgeClassifierCV'> score=0.79 time=0
Accuraty:<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> score=0.75 time=0
Accuraty:<class 'sklearn.neighbors.classification.KNeighborsClassifier'> score=0.81 time=1
Accuraty:<class 'sklearn.neighbors.nearest_centroid.NearestCentroid'> score=0.72 time=0
Accuraty:<class 'sklearn.tree.tree.DecisionTreeClassifier'> score=0.79 time=0
Accuraty:<class 'sklearn.tree.tree.ExtraTreeClassifier'> score=0.75 time=0
Accura