In [77]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV

In [12]:
from sklearn.preprocessing import LabelEncoder

# Fetching Breast Cancer Wisconsin Dataset

In [50]:
# Load the WDBC data file straight from the UCI repository. The file is read
# as headerless, so the column labels are built here: an id, the diagnosis
# label, then 30 feature columns (10 base measurements repeated in 3 groups).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
features_per_nucleus = ["radius", "texture", "perimeter", "area", "smoothness", "compactness",
        "concavity", "concave_points", "symmetry", "fractal_dimension"]
# Group-major order: all ten "_1" columns, then all "_2", then all "_3".
# NOTE(review): the dataset description suggests the three groups are
# mean / standard error / worst value rather than separate nuclei - confirm
# before renaming the suffixes; the labels are kept as-is here.
features = [
    feature + "_" + str(nucleus_id)
    for nucleus_id in range(1, 4)
    for feature in features_per_nucleus
]
names = ["id", "diagnosis"] + features
# header=None states explicitly that the first line of the file is data.
dataset = pd.read_csv(url, header=None, names=names)

## Encoding Binary Label

In [51]:
# Encode the `diagnosis` column as integer class indices. The fitted encoder
# is kept in `le` so the original labels can be recovered via `le.classes_`.
le = LabelEncoder()
dataset = dataset.assign(diagnosis=le.fit_transform(dataset["diagnosis"]))

## Drop ID column

In [54]:
# Remove the identifier column so it is not used as a feature.
# `axis` is passed by keyword because recent pandas releases no longer accept
# it positionally; errors="ignore" keeps the cell safe to re-run after `id`
# has already been dropped.
dataset = dataset.drop("id", axis="columns", errors="ignore")

## Train-Test-Split

In [62]:
# Hold-out configuration: 20% of the rows go to the test set, fixed seed.
validation_size = 0.2
seed = 1

# Column 0 is used as the label and every remaining column as a feature, so
# this relies on `diagnosis` being the first column of `dataset`.
array = dataset.values
Y, X = array[:, 0], array[:, 1:]

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)

# Create Pipeline

In [89]:
# First model: standardise the features, reduce them to two principal
# components, then classify with logistic regression.
sc = StandardScaler()
pca = PCA(n_components=2)
clf = LogisticRegression(random_state=1)
pipeline = Pipeline(steps=[
    ("Standard scaler", sc),
    ("PCA", pca),
    ("logistic regression", clf),
])

In [90]:
# Fit every pipeline step on the training split only.
pipeline.fit(X=X_train, y=Y_train)

Pipeline(steps=[('Standard scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('PCA', PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('logistic regression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [92]:
# Score the fitted pipeline on the held-out test split.
pipeline.score(X=X_test, y=Y_test)

0.94736842105263153

## Recursive Feature Elimination

In [96]:
# Second model: swap the PCA step for cross-validated recursive feature
# elimination. This rebinds `sc`, `clf` and `pipeline`, so the earlier
# pipeline is no longer reachable under those names.
sc = StandardScaler()
clf = LogisticRegression(random_state=1)
# The same `clf` instance is handed to RFECV as its estimator and reused as
# the final classification step of the pipeline.
rfecv = RFECV(estimator=clf)
pipeline = Pipeline(steps=[
    ("Standard scaler", sc),
    ("Recursive Feature Elimination", rfecv),
    ("logistic regression", clf),
])

In [97]:
# Fit the scaler, the feature-elimination step and the classifier on the
# training split only.
pipeline.fit(X=X_train, y=Y_train)

Pipeline(steps=[('Standard scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('Recursive Feature Elimination', RFECV(cv=None,
   estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
 ...nalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [98]:
# Score the fitted feature-elimination pipeline on the held-out test split.
pipeline.score(X=X_test, y=Y_test)

0.98245614035087714

In [105]:
# Report how many features the fitted RFECV step retained; `rfecv` is the
# same object that was fitted as a step of `pipeline`.
print("Optimal number of features: %d" % (rfecv.n_features_,))

Optimal number of features: 25
