## Load Packages

In [None]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

## Load Data and define Features X and Target y

In [52]:
cancer = load_breast_cancer()

In [53]:
X = cancer['data']
y = cancer['target']

In [66]:
X.shape

(569, 30)

In [65]:
#percentage of target values = 1; dataset relatively well balanced
np.mean(y)

0.6274165202108963

## Train Test Data Split (70/30)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

## 1) Use default Logistic Regression Classifier

In [69]:
#max_iter to avoid converge warning
clf = LogisticRegression(max_iter=100000)

In [70]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [71]:
round(clf.score(X_train, y_train), 2)

0.98

In [72]:
round(clf.score(X_test, y_test), 2)

0.94

In [73]:
pred = clf.predict(X_test)

In [74]:
#10 wrong predictions out of 171 test samples
print(confusion_matrix(y_test, pred))

[[ 59   7]
 [  3 102]]


In [75]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.95      0.89      0.92        66
           1       0.94      0.97      0.95       105

    accuracy                           0.94       171
   macro avg       0.94      0.93      0.94       171
weighted avg       0.94      0.94      0.94       171



## 2) Use a more complex Model with a higher C parameter

In [76]:
#max_iter to avoid converge warning
clf = LogisticRegression(C=100, max_iter=100000)

In [77]:
clf.fit(X_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [78]:
#same training score
round(clf.score(X_train, y_train), 2)

0.98

In [79]:
#higher test score
round(clf.score(X_test, y_test), 2)

0.96

In [80]:
pred = clf.predict(X_test)

In [81]:
#Only 7 wrong predictions out of 171 test samples with a more complex model
print(confusion_matrix(y_test, pred))

[[ 60   6]
 [  1 104]]


## 3) Use L1 Penalty (use only most important features) and more complex Model

In [84]:
#max_iter to avoid converge warning
clf = LogisticRegression(C=100, max_iter=100000, penalty='l1', solver='liblinear')

In [85]:
clf.fit(X_train, y_train)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100000,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [86]:
#higher train score
round(clf.score(X_train, y_train), 2)

0.99

In [87]:
#same test score
round(clf.score(X_test, y_test), 2)

0.96

In [88]:
pred = clf.predict(X_test)

In [89]:
#Only 7 wrong predictions out of 171 test samples with a more complex model and a L1 penalty (only use most important features)
print(confusion_matrix(y_test, pred))

[[ 63   3]
 [  4 101]]
