### Logistische Regression am Beispiel der Breast-Cancer-Daten

#### a) Daten laden

In [1]:
import pandas as pd
from math import ceil

# Render at most six columns per DataFrame; the remaining ones are shown as "...".
pd.options.display.max_columns = 6

# Location of the raw breast-cancer CSV file on GitHub.
data_url = r'https://github.com/tplusone/hanser_ml_zeitreihen/blob/master/Daten/breast_cancer_wisconsin.csv?raw=true'

# Download the CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(data_url)
df.head(n=5)

Unnamed: 0,id,clump thickness,uniformity cell size,...,normal nucleoli,mitoses,label
0,1000025,5,1,...,1,1,benign
1,1002945,5,4,...,2,1,benign
2,1015425,3,1,...,1,1,benign
3,1016277,6,8,...,7,1,benign
4,1017023,4,1,...,1,1,benign


#### b) x- und y-Daten selektieren

In [2]:
# Columns that serve as model inputs.
feature_columns = [
    'clump thickness',
    'uniformity cell size',
    'uniformity cell shape',
    'marginal adhesion',
    'epithelial cell size',
    'bare nuclei',
    'bland chromatin',
    'normal nucleoli',
    'mitoses',
]
X = df[feature_columns]

# Target column holding the class label of each row.
y = df['label']

#### c) Train-Test-Split durchführen

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the rows as test data; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

# Display the shapes of all four partitions to check the split sizes.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((546, 9), (137, 9), (546,), (137,))

#### d) Logistische Regression aufsetzen und anlernen

In [4]:
from sklearn.linear_model import LogisticRegression

# Create a logistic regression with default settings and train it on the
# training partition; fit() returns the estimator itself.
model = LogisticRegression().fit(X_train, y_train)

# Show the fitted estimator as the cell output.
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

#### e) Evaluation durchführen
- Accuracy

In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix

# First run predictions on the test data, then compare them with the true
# labels to obtain the share of correctly classified samples.
y_test_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)
accuracy

0.948905109489051

- Confusion Matrix

In [6]:
# Cross-tabulate true labels (rows) against predicted labels (columns) for the
# test set.
matrix = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
matrix

array([[79,  2],
       [ 5, 51]], dtype=int64)

- Methode: predict_proba (predicted probabilities)

In [7]:
# One new sample, with its values given in the column order of the training data.
# The model was fitted on a DataFrame, so the sample is wrapped in a DataFrame
# carrying the same column names: this ties every value to its feature
# explicitly and avoids scikit-learn's (>= 1.0) "X does not have valid feature
# names" warning that a bare nested list would trigger.
X_pred = pd.DataFrame(
    [[6., 3., 4., 1., 5., 2., 3., 9., 1.]],
    columns=X_train.columns,
)

# Class probabilities (one column per entry of model.classes_) and the
# resulting predicted label for the sample.
y_pred_proba = model.predict_proba(X_pred)
y_pred = model.predict(X_pred)
y_pred_proba, y_pred

(array([[0.56285278, 0.43714722]]), array(['benign'], dtype=object))

#### f) Entscheidungsschwelle für die Zuordnung zu benign anpassen
Als gutartig (benign) sollen nur Fälle klassifiziert werden, bei denen die vorhergesagte Wahrscheinlichkeit für benign über 99 % liegt; alle übrigen Fälle werden als malignant eingestuft.

In [8]:
# Stricter decision rule: a sample is only called 'benign' when the model's
# probability for that class exceeds the threshold; everything else is treated
# as 'malignant'.
BENIGN_THRESHOLD = .99

# predict_proba orders its columns like model.classes_, so look the 'benign'
# column up instead of assuming it is column 0.
benign_col = list(model.classes_).index('benign')

y_test_pred_proba = model.predict_proba(X_test)
y_test_pred99 = [
    'benign' if prob[benign_col] > BENIGN_THRESHOLD else 'malignant'
    for prob in y_test_pred_proba
]

accuracy99 = accuracy_score(y_test, y_test_pred99)
# Passing the labels explicitly keeps the matrix 2x2 and in a fixed row/column
# order even if the strict threshold leaves one class without predictions.
matrix99 = confusion_matrix(y_test, y_test_pred99, labels=model.classes_)
print('neue Accuracy:', accuracy99)
matrix99

neue Accuracy: 0.8686131386861314


array([[63, 18],
       [ 0, 56]], dtype=int64)