In [32]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_breast_cancer

import warnings; warnings.simplefilter('ignore')

from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler

In [33]:
data = load_breast_cancer()
#display(data.data,data.target)

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.33, random_state=42)

### Model Comparison

In [35]:
model_dict ={'LogisticRegression': LogisticRegression(penalty='l2', C = 0.1),
             'SVC':SVC(C=0.2),
             'DecisionTreeClassifier': DecisionTreeClassifier(),
             'RandomForestClassifier':RandomForestClassifier(n_estimators=100,n_jobs=-1),
             'AdaBoostClassifier': AdaBoostClassifier(n_estimators=100,learning_rate=0.1),
             'GradientBoostingClassifier':GradientBoostingClassifier(learning_rate=0.1,n_estimators=100),
             'ExtraTreesClassifier':ExtraTreesClassifier(n_estimators=100,n_jobs=-1),
             'Lasso':Lasso(alpha=0.1)
            }

In [36]:
for clf_name, clf in model_dict.items():
    pipeline = Pipeline([('scaler', StandardScaler()), (clf_name, clf)])
    pipeline.fit(X_train,y_train)
    print(clf_name,':',np.array(cross_val_score(pipeline, X_test, y_test, cv=3)).mean())

LogisticRegression : 0.9625336021505376
SVC : 0.941364247311828
DecisionTreeClassifier : 0.9361559139784946
RandomForestClassifier : 0.9411962365591399
AdaBoostClassifier : 0.9411962365591399
GradientBoostingClassifier : 0.9411962365591399
ExtraTreesClassifier : 0.9519489247311829
Lasso : 0.6435556214583623


### LogisticRegression fine tuning

In [57]:
clf_name, clf = 'LogisticRegression', LogisticRegression()
pipeline = Pipeline([('scaler', StandardScaler()), ('LR', clf)])
parameters = {'LR__penalty':('l1','l2'), 'LR__C':np.arange(1,10)}
gridsearch = GridSearchCV(pipeline,parameters,cv=3)
gridsearch.fit(X_train, y_train)
display(gridsearch.best_params_)
display('Best Parameters gives accuracy of ',np.array(cross_val_score(gridsearch.best_estimator_, X_test, y_test, cv=3)).mean())

{'LR__C': 1, 'LR__penalty': 'l2'}

'Best Parameters gives accuracy of '

0.9679099462365591

### ExtraTreesClassifier fine tuning

In [64]:
clf = ExtraTreesClassifier(n_jobs=-1)
pipeline = Pipeline([('scaler', StandardScaler()), ('ETR', clf)])
parameters = {'ETR__n_estimators':range(20,401,20), 'ETR__min_samples_split':np.arange(0.1,1.1,0.1)}
gridsearch = GridSearchCV(pipeline,parameters,cv=3)
gridsearch.fit(X_train, y_train)
display(gridsearch.best_params_)
display('Best Parameters gives accuracy of ',np.array(cross_val_score(gridsearch.best_estimator_, X_test, y_test, cv=3)).mean())

{'ETR__min_samples_split': 0.1, 'ETR__n_estimators': 100}

'Best Parameters gives accuracy of '

0.9627016129032259

***Logistic Regression outperforms every other model for breast cancer prediction***