In [1]:
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import grid_search



# Генерация данных 

In [2]:
bioresponce = pd.read_csv('bioresponse.csv', header=0, sep=',')

In [3]:
bioresponce.head(5)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


# Задание датасета

In [4]:
y = bioresponce.Activity.values

In [5]:
X = bioresponce.iloc[:, 1:]

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Задание моделей

In [7]:
classifierSVC = LinearSVC(random_state=0)
classifierKNeigh = KNeighborsClassifier()
classifierDecTree = DecisionTreeClassifier()
classifierRanForest= RandomForestClassifier()
classifierGradBoost = GradientBoostingClassifier()

# Генерация сетки

In [8]:
classifierKNeigh.get_params().keys()

dict_keys(['algorithm', 'leaf_size', 'metric', 'metric_params', 'n_jobs', 'n_neighbors', 'p', 'weights'])

In [9]:
param_grid_SVC ={
    'multi_class': ['ovr', 'crammer_singer'],
    'max_iter': range(600, 1401, 200),
    'C' : [0.5, 1., 2.],
    'tol': [0.0001, 0.0005, 0.001]
}
param_grid_KNeigh ={
    'n_neighbors' : range(3, 7),
    'weights' : ['uniform'],
    'algorithm' : ['auto'],
    'leaf_size' : range(20, 41, 5),
    'p' : [1, 2],
    'n_jobs' : [-1]
}
param_grid_DecTree={
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_features': [None, 'sqrt', 'log2'],
    'min_samples_split': range(2, 5)
}
param_grid_RanForest = {
   'n_estimators': range(70, 111, 10),
    'criterion': ['gini', 'entropy'],
    'max_features': [None, 'sqrt', 'log2'],
    'min_samples_leaf': range(1, 5),
    'n_jobs' : [-1]
}
param_grid_GradBoost ={
   'loss': ['deviance', 'exponential'],
    'n_estimators': range(70, 111, 10),
    'criterion': ['friedman_mse', 'mse', 'mae'],
    'max_features': [None, 'sqrt', 'log2'],
}

In [10]:
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter =10, test_size=0.2, random_state=0)

In [13]:
grid_cv_SVC = grid_search.GridSearchCV(classifierSVC, param_grid_SVC, scoring = 'accuracy', cv = cv)
grid_cv_KNeigh = grid_search.GridSearchCV(classifierKNeigh, param_grid_KNeigh, scoring = 'accuracy', cv = cv)
grid_cv_DecTree = grid_search.GridSearchCV(classifierDecTree, param_grid_DecTree, scoring = 'accuracy', cv = cv)
grid_cv_RanForest = grid_search.GridSearchCV(classifierRanForest, param_grid_RanForest, scoring = 'accuracy', cv = cv)
grid_cv_GradBoost = grid_search.GridSearchCV(classifierGradBoost, param_grid_GradBoost, scoring = 'accuracy', cv = cv)

In [14]:
%%time
grid_cv_SVC.fit(X_train, y_train)
grid_cv_KNeigh.fit(X_train, y_train)
grid_cv_DecTree.fit(X_train, y_train)
grid_cv_RanForest.fit(X_train, y_train)
grid_cv_GradBoost.fit(X_train, y_train)

ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

In [77]:
grid_cv_SVC.best_estimator_
grid_cv_SVC.best_estimator_
grid_cv_KNeigh.best_estimator_
grid_cv_DecTree.best_estimator_
grid_cv_RanForest.best_estimator_
grid_cv_GradBoost.best_estimator_

LinearSVC(C=0.90000000000000002, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
     max_iter=1000, multi_class='ovr', penalty='l2', random_state=0,
     tol=0.0001, verbose=0)

In [30]:
%%time

models = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100), 
    GradientBoostingClassifier(n_estimators=100)
]

for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(accuracy_score(preds, y_test), model)

0.723747980614 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.721324717286 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.730210016155 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.797253634895 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
 