In [2]:
#encoding=utf8
from sklearn.pipeline import Pipeline
from sklearn.linear_model import *
from sklearn.tree import *
from sklearn.neighbors import *
from sklearn.ensemble import *
from sklearn.gaussian_process import *
from sklearn.svm import *
from sklearn.naive_bayes import *
from sklearn.preprocessing import *
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import *
from sklearn.model_selection import *
from sklearn.preprocessing import Imputer

import matplotlib.pyplot as plt
import pandas as pd
import time

class dataset(object):
    """Describe a header-less CSV file and how to split it into features and label.

    Args:
        filename: path of the CSV file to load.
        columns: names assigned to every column of the file, in file order.
        classcol: name of the column holding the class label.
        datacol: names of the columns used as features.
        encodecol: columns that must be label-encoded before training;
            ``learn_from_data`` uses each entry as a positional index into
            the feature matrix.  Defaults to a new empty list.
    """

    def __init__(self, filename, columns, classcol, datacol, encodecol=None):
        self.filename = filename
        self.columns = columns      # names of all columns
        self.classcol = classcol    # name of the class (label) column
        self.datacol = datacol      # names of the feature columns in use
        # Build a fresh list per instance: a literal ``[]`` default is created
        # once and would be shared (and mutated) across every dataset object.
        self.encodecol = [] if encodecol is None else encodecol

# Classifiers to benchmark.  Each entry is [estimator, param_grid]; the grid is
# handed to GridSearchCV, and an empty dict means "use the estimator as
# configured here".
clfs = [
    # Linear models and support-vector machines
    [ LogisticRegression(max_iter=1000), {} ],
    [ LogisticRegressionCV(), {} ],
    [ LinearSVC(), {'C': [0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0]} ],
#   [ NuSVC(), {} ],
    [ SVC(), {'C': [0.1, 0.2, 0.5, 1.0, 2.0, 5.0, 10.0]} ],
    [ PassiveAggressiveClassifier(), {} ],
    [ RidgeClassifier(), {} ],
    [ RidgeClassifierCV(), {} ],
    [ SGDClassifier(), {} ],
    # Neighbour-based models
    [ KNeighborsClassifier(n_neighbors=20), {} ],
    [ NearestCentroid(), {} ],
    # Single trees
    [ DecisionTreeClassifier(), {} ],
    [ ExtraTreeClassifier(), {} ],
    # Ensembles
    [ AdaBoostClassifier(), {} ],
    [ BaggingClassifier(), {} ],
    # Was a second ExtraTreeClassifier(), duplicating the single-tree entry
    # above; the ensemble variant is ExtraTreesClassifier.
    [ ExtraTreesClassifier(), {} ],
    [ GradientBoostingClassifier(), {} ],
    [ RandomForestClassifier(), {} ],
#   [ GaussianProcessClassifier(), {} ],
    # Naive Bayes and neural network
    [ BernoulliNB(), {} ],
    [ GaussianNB(), {} ],
    [ MLPClassifier(), {} ],
]

def _as_pipeline(estimator, grid):
    """Wrap one classifier in the shared impute -> scale -> grid-search chain."""
    return Pipeline([
        # Replace NaN entries with the column mean.
        # NOTE(review): Imputer was removed in scikit-learn 0.22 in favour of
        # sklearn.impute.SimpleImputer; kept as-is to match the file's imports.
        ['imputer', Imputer(strategy="mean")],
        ['sc', StandardScaler()],
        ['clf', GridSearchCV(estimator, param_grid=grid)],
    ])


# One pipeline per entry of clfs, in the same order, so that every classifier
# gets identical preprocessing.
pipes = [_as_pipeline(pair[0], pair[1]) for pair in clfs]

def learn_from_data(dataset, repeats=None):
    """Train and score every configured classifier pipeline on one dataset.

    The CSV file named by ``dataset.filename`` is read without a header row
    and its columns are renamed to ``dataset.columns``.  The data is split
    once into 90% training / 10% test rows, and each pipeline in ``pipes`` is
    fitted ``repeats`` times on that same split.  For every classifier one
    line is printed with the lowest test accuracy observed and the total
    elapsed time in whole seconds.

    Args:
        dataset: object exposing ``filename``, ``columns``, ``classcol``,
            ``datacol`` and ``encodecol`` attributes.
        repeats: number of fit/predict rounds per classifier.  When omitted,
            the module-level ``testnum`` is used, as before.

    Returns:
        A list of ``(classifier_type, min_accuracy, elapsed_seconds)`` tuples,
        one per entry of ``clfs`` and in the same order.
    """
    if repeats is None:
        # Looked up at call time, so the global only has to exist by then.
        repeats = testnum

    data = pd.read_csv(dataset.filename, header=None)
    data.columns = dataset.columns
    y = data[dataset.classcol].values
    X = data[dataset.datacol].values

    # Label-encode the requested columns; entries are positions within X.
    for col in dataset.encodecol:
        X[:, col] = LabelEncoder().fit_transform(X[:, col])

    # Hold out 10% of the rows as the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

    results = []
    for (estimator, _grid), pipe in zip(clfs, pipes):
        # Track the worst accuracy across rounds for later tuning.
        minscore = 1.0
        start = time.time()
        for _ in range(repeats):
            pipe.fit(X_train, y_train)
            accuracy = accuracy_score(y_test, pipe.predict(X_test))
            minscore = min(minscore, accuracy)
        elapsed = time.time() - start
        print('Accuraty:%s score=%.2f time=%d' % (type(estimator), minscore, elapsed))
        results.append((type(estimator), minscore, elapsed))
    return results

# Number of fit/predict rounds learn_from_data runs per classifier.
testnum = 1

# Column labels for wisconsin.data, in file order.
_wisconsin_columns = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Bland Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]

breastcancerdataset1 = dataset(
    filename='../dataset/breastcancer_dataset/wisconsin.data',
    columns=_wisconsin_columns,
    classcol='Class',
    # Every column except the first ('Sample code number') and last ('Class').
    datacol=_wisconsin_columns[1:-1],
    encodecol=[],  # no columns are label-encoded
)

# The 30 feature labels of wdbc.data are ten base names, each suffixed with
# 1, 2 and 3.  ' texture' keeps its leading space so the generated labels are
# identical to the ones used previously.
_wdbc_base_names = [
    'radius', ' texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal dimension',
]
_wdbc_features = [
    base + str(suffix) for suffix in (1, 2, 3) for base in _wdbc_base_names
]

breastcancerdataset2 = dataset(
    filename='../dataset/breastcancer_dataset/wdbc.data',
    columns=['ID number', 'Diagnosis'] + _wdbc_features,
    classcol='Diagnosis',
    datacol=list(_wdbc_features),
    encodecol=[],  # no columns are label-encoded
)

# wpbc.data uses ten base names, each suffixed with 1, 2 and 3, framed by
# 'Time' in front and two extra columns behind.  ' texture' keeps its leading
# space so the generated labels are identical to the ones used previously.
_wpbc_base_names = [
    'radius', ' texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal dimension',
]
_wpbc_features = (
    ['Time']
    + [base + str(suffix) for suffix in (1, 2, 3) for base in _wpbc_base_names]
    + ['Tumor size', 'Lymph node status']
)

breastcancerdataset3 = dataset(
    filename='../dataset/breastcancer_dataset/wpbc.data',
    columns=['ID number', 'Outcome'] + _wpbc_features,
    classcol='Outcome',
    datacol=list(_wpbc_features),
    encodecol=[],  # no columns are label-encoded
)

if __name__ == '__main__':
    # Benchmark every classifier on each breast-cancer dataset in turn.
    for _bc_dataset in (breastcancerdataset1,
                        breastcancerdataset2,
                        breastcancerdataset3):
        learn_from_data(_bc_dataset)

Accuraty:<class 'sklearn.linear_model.logistic.LogisticRegression'> score=0.94 time=0
Accuraty:<class 'sklearn.linear_model.logistic.LogisticRegressionCV'> score=0.93 time=0
Accuraty:<class 'sklearn.svm.classes.LinearSVC'> score=0.94 time=0
Accuraty:<class 'sklearn.svm.classes.SVC'> score=0.96 time=0
Accuraty:<class 'sklearn.linear_model.passive_aggressive.PassiveAggressiveClassifier'> score=0.86 time=0
Accuraty:<class 'sklearn.linear_model.ridge.RidgeClassifier'> score=0.90 time=0
Accuraty:<class 'sklearn.linear_model.ridge.RidgeClassifierCV'> score=0.90 time=0
Accuraty:<class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> score=0.86 time=0
Accuraty:<class 'sklearn.neighbors.classification.KNeighborsClassifier'> score=0.91 time=0
Accuraty:<class 'sklearn.neighbors.nearest_centroid.NearestCentroid'> score=0.91 time=0
Accuraty:<class 'sklearn.tree.tree.DecisionTreeClassifier'> score=0.91 time=0
Accuraty:<class 'sklearn.tree.tree.ExtraTreeClassifier'> score=0.96 time=0
Accurat