# 3.2.3 Logistische Regression (Breast Cancer Data)

### 01 - Daten laden

In [1]:
from os.path import join
import numpy as np
import pandas as pd

# Build the data directory from separate components instead of the literal
# '..\Data': '\D' is an invalid escape sequence in a normal string literal
# (recent Python versions warn about it), and a hard-coded backslash only
# acts as a path separator on Windows.
path = join('..', 'Data')
file = 'breast_cancer_wisconsin.csv'

# Read the CSV and discard the 'id' column in one expression, so re-running
# the cell always starts from the file contents.
df = pd.read_csv(join(path, file)).drop(columns='id')
df.head()

Unnamed: 0,clump thickness,uniformity cell size,uniformity cell shape,marginal adhesion,epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses,label
0,5,1,1,1,2,1.0,3,1,1,benign
1,5,4,4,5,7,10.0,3,2,1,benign
2,3,1,1,1,2,2.0,3,1,1,benign
3,6,8,8,1,3,4.0,3,7,1,benign
4,4,1,1,3,2,1.0,3,1,1,benign


### 02 - X- und y-Variablen separieren

In [2]:
# The nine predictor columns, listed one per line.
feature_names = [
    'clump thickness',
    'uniformity cell size',
    'uniformity cell shape',
    'marginal adhesion',
    'epithelial cell size',
    'bare nuclei',
    'bland chromatin',
    'normal nucleoli',
    'mitoses',
]

# Predictors go into X, the 'label' column into y.
X = df[feature_names]
y = df['label']

X.shape, y.shape

((683, 9), (683,))

### 03 - Trainings- und Testpartitionen separieren

In [3]:
from sklearn.model_selection import train_test_split

# Put 20 % of the rows into the test partition; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

# Show the shape of each of the four partitions.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((546, 9), (137, 9), (546,), (137,))

### 05 - Logistisches Regressionsmodell instanziieren und anlernen

In [4]:
from sklearn.linear_model import LogisticRegression

# Create the classifier with its default settings and train it on the
# training partition (fit() hands back the fitted estimator).
model = LogisticRegression().fit(X_train, y_train)

# Show the intercept and the coefficients, rounded to three decimals
np.round(model.intercept_, 3), np.round(model.coef_, 3)

(array([-9.701]),
 array([[0.501, 0.133, 0.272, 0.324, 0.109, 0.345, 0.422, 0.157, 0.477]]))

### 06 - Schätzungen durchführen

In [5]:
# Wrap the single sample in a DataFrame that carries the training column
# names. The model was fitted on a DataFrame, so a bare nested list makes
# newer scikit-learn versions warn about missing feature names and ties the
# values to the column order only implicitly.
x_pred = pd.DataFrame(
    [[5., 4., 4., 5., 7., 10., 3., 2., 1.]],
    columns=X_train.columns,
)

# Predicted class label and the per-class probabilities for the sample.
y_pred = model.predict(x_pred)
y_pred_prob = model.predict_proba(x_pred)

# NOTE(review): the message assumes the probability columns are ordered
# benign, malignant - confirm against model.classes_.
print('predicted label:', y_pred[0], '\npredicted probability for benign [0] and malignant [1]:', y_pred_prob.round(3))

predicted label: malignant 
predicted probability for benign [0] and malignant [1]: [[0.09 0.91]]


### 07 - Qualität des Modells eruieren
*Confusion Matrix* und *Accuracy* ausgeben 

In [6]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Training partition: confusion matrix and accuracy of the fitted model.
y_train_pred = model.predict(X_train)
matrix = confusion_matrix(y_train, y_train_pred)
accuracy = accuracy_score(y_train, y_train_pred)

# Held-out partition: the same metrics on rows the model has not seen during
# fitting. Training-set figures alone are optimistic, and the test split
# created earlier was otherwise never used.
y_test_pred = model.predict(X_test)
matrix_test = confusion_matrix(y_test, y_test_pred)
accuracy_test = accuracy_score(y_test, y_test_pred)

# Built-in round() works for both a plain Python float and a NumPy scalar,
# whereas the .round() method exists only on the latter.
print('confusion matrix:\n', matrix,  '\naccuracy:', round(accuracy, 3))
print('confusion matrix (test):\n', matrix_test, '\naccuracy (test):', round(accuracy_test, 3))

confusion matrix:
 [[349   9]
 [  8 180]] 
accuracy: 0.969
