#  sklearn

### Данные


По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается.

### Возьмём данные с семинара

In [6]:
import pandas as pd

# Load the training sample; the first line of the file holds the column names.
bioresponce = pd.read_csv(
    'bioresponse.csv',
    sep=',',
    header=0,
)

In [7]:
# Preview the first five rows of the loaded table.
bioresponce.head(n=5)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Target vector: the 'Activity' column as a NumPy array.
y = bioresponce['Activity'].values
# Bare expression so the notebook displays the array.
y

array([1, 1, 1, ..., 0, 1, 0])

In [9]:
# Feature matrix: every column except the first one (positional slice).
# NOTE(review): this presumes the target 'Activity' is the first column of the
# CSV — confirm against the file; otherwise the target would leak into X.
X = bioresponce.iloc[:, 1:]


## Вспомним разные модели с семинара

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import cross_val_score

In [11]:
# Baseline classifiers to compare; everything is left at library defaults
# except n_estimators=100 for the two ensemble models.
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100), 
    GradientBoostingClassifier(n_estimators=100)
]

for model in models:
    # %time is an IPython magic (notebook-only syntax): it times the statement
    # that prints the model and its mean score over 5 cross-validation folds.
    %time print(model, '\nAccuracy:', cross_val_score(model, X, y, cv=5).mean())
    print()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 
Accuracy: 0.7518013365357094
CPU times: user 7.88 s, sys: 365 ms, total: 8.24 s
Wall time: 4.29 s

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 
Accuracy: 0.7480751086372301
CPU times: user 48.8 s, sys: 161 ms, total: 48.9 s
Wall time: 47.9 s

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') 
Accuracy: 0.72406797

# Задание

Используя класс для перебора параметров по сетке [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html), напишите перебор параметров LogisticRegression:
- Параметр C: степени десяти от 10^-5 до 10^5 включительно (1e-5, 1e-4, …, 1e5)
- Параметр max_iter: от 100 до 500 включительно с шагом 100
- Параметр penalty: либо 'l1', либо 'l2'
- Параметр random_state: обязательно 42
- Параметр fit_intercept: либо False, либо True


In [10]:
from sklearn.model_selection import GridSearchCV

In [8]:
# Grid search with the regularisation strength fixed at C = 1e-05.
param_grid = {
    'C': [1e-05],
    'max_iter': list(range(100, 501, 100)),
    'penalty': ['l1', 'l2'],
    'random_state': [42],
    'fit_intercept': [True, False],
}

# 5-fold cross-validated search; print the best mean CV score to 4 decimals.
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print(round(grid_search.best_score_, 4))

0.5423


In [9]:
# Grid search with the regularisation strength fixed at C = 0.0001.
param_grid = {
    'C': [0.0001],
    'max_iter': list(range(100, 501, 100)),
    'penalty': ['l1', 'l2'],
    'random_state': [42],
    'fit_intercept': [True, False],
}

# 5-fold cross-validated search; print the best mean CV score to 4 decimals.
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print(round(grid_search.best_score_, 4))

0.5985


In [10]:
# Grid search with the regularisation strength fixed at C = 0.001.
param_grid = {
    'C': [0.001],
    'max_iter': list(range(100, 501, 100)),
    'penalty': ['l1', 'l2'],
    'random_state': [42],
    'fit_intercept': [True, False],
}

# 5-fold cross-validated search; print the best mean CV score to 4 decimals.
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print(round(grid_search.best_score_, 4))

0.7289


In [11]:
# Grid search with the regularisation strength fixed at C = 0.01.
param_grid = {
    'C': [0.01],
    'max_iter': list(range(100, 501, 100)),
    'penalty': ['l1', 'l2'],
    'random_state': [42],
    'fit_intercept': [True, False],
}

# 5-fold cross-validated search; print the best mean CV score to 4 decimals.
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print(round(grid_search.best_score_, 4))

0.7593


In [12]:
# Grid search with the regularisation strength fixed at C = 0.1.
param_grid = {
    'C': [0.1],
    'max_iter': list(range(100, 501, 100)),
    'penalty': ['l1', 'l2'],
    'random_state': [42],
    'fit_intercept': [True, False],
}

# 5-fold cross-validated search; print the best mean CV score to 4 decimals.
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print(round(grid_search.best_score_, 4))

0.7627


In [13]:
# Grid search with the regularisation strength fixed at C = 1.
param_grid = {
    'C': [1],
    'max_iter': list(range(100, 501, 100)),
    'penalty': ['l1', 'l2'],
    'random_state': [42],
    'fit_intercept': [True, False],
}

# 5-fold cross-validated search; print the best mean CV score to 4 decimals.
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print(round(grid_search.best_score_, 4))

0.7537


In [14]:
# Grid search with the regularisation strength fixed at C = 10.
param_grid = {
    'C': [10],
    'max_iter': list(range(100, 501, 100)),
    'penalty': ['l1', 'l2'],
    'random_state': [42],
    'fit_intercept': [True, False],
}

# 5-fold cross-validated search; print the best mean CV score to 4 decimals.
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print(round(grid_search.best_score_, 4))

0.7379


In [15]:
# Grid search with the regularisation strength fixed at C = 100.
param_grid = {
    'C': [100],
    'max_iter': list(range(100, 501, 100)),
    'penalty': ['l1', 'l2'],
    'random_state': [42],
    'fit_intercept': [True, False],
}

# 5-fold cross-validated search; print the best mean CV score to 4 decimals.
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print(round(grid_search.best_score_, 4))

0.7289


In [12]:
# Full grid required by the task statement:
#   C             - powers of ten from 1e-5 to 1e5 inclusive (11 values;
#                   the earlier version stopped at 100 and missed 1e3..1e5)
#   max_iter      - 100..500 inclusive, step 100
#   penalty       - 'l1' or 'l2'
#   random_state  - always 42
#   fit_intercept - True or False
param_grid = dict(C = [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000],
                  max_iter = [100, 200, 300, 400, 500],
                  penalty = ['l1', 'l2'],
                  random_state = [42],
                  fit_intercept = [True, False])

# Pin solver='liblinear' (the default in the scikit-learn version whose output
# is recorded in this notebook). Newer releases default to 'lbfgs', which does
# not support penalty='l1', so half of the grid would fail to fit there.
# 5-fold cross-validated search; print the best mean CV score to 4 decimals.
print(round(GridSearchCV(
   LogisticRegression(solver='liblinear'),  param_grid, cv=5
).fit(X, y).best_score_, 4))

0.7627


Полученное число введите в форму для ответов