# Projekt Python
Tomasz Ancukiewicz 127219, Mikołaj Leśny 127218

## Init libs

In [1]:
import pandas as pd
from scipy.io import arff
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
import numpy as np
from collections import Counter

## Load data

In [2]:
SEED = 42

# data, meta = arff.loadarff("data/breast-cancer.arff")
# df = pd.DataFrame(data)
df = pd.read_csv("data/breast-cancer.data", sep=',', 
                 names=['Class', 'age', 'menopause', 'tumor-size', 
                        'inv-nodes', 'node-caps', 'deg-malig', 
                        'breast', 'breast-quad', 'irradiat'])
# df = df.astype('str')

df[["age-begin", "age-end"]] = df.pop('age').astype('str').str.split("-", n=1, expand=True)
df[["tumor-size-begin", "tumor-size-end"]] = df.pop('tumor-size').astype('str').str.split("-", n=1, expand=True)
df[["inv-nodes-begin", "inv-nodes-end"]] = df.pop('inv-nodes').astype('str').str.split("-", n=1, expand=True)

df[["age-begin", "age-end", "tumor-size-begin", 
    "tumor-size-end", "inv-nodes-begin", "inv-nodes-end"]] = \
    df[["age-begin", "age-end", "tumor-size-begin", 
        "tumor-size-end", "inv-nodes-begin", "inv-nodes-end"]].astype(int)

y = df.pop('Class')
X = df
X = X.apply(lambda x: LabelEncoder().fit_transform(x))

## Prepared data

In [3]:
df.head()

Unnamed: 0,menopause,node-caps,deg-malig,breast,breast-quad,irradiat,age-begin,age-end,tumor-size-begin,tumor-size-end,inv-nodes-begin,inv-nodes-end
0,premeno,no,3,left,left_low,no,30,39,30,34,0,2
1,premeno,no,2,right,right_up,no,40,49,20,24,0,2
2,premeno,no,2,left,left_low,no,40,49,20,24,0,2
3,ge40,no,2,right,left_up,no,60,69,15,19,0,2
4,premeno,no,2,right,right_low,no,40,49,0,4,0,2


In [4]:
y.value_counts()

no-recurrence-events    201
recurrence-events        85
Name: Class, dtype: int64

## Make test and train sets

In [5]:
rus = RandomUnderSampler(random_state=SEED)
X_resampled, y_resampled = rus.fit_resample(X, y)
print(sorted(Counter(y_resampled).items()))

[('no-recurrence-events', 85), ('recurrence-events', 85)]


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=SEED)

## Classification

In [7]:
# model = DecisionTreeClassifier(
#     criterion='entropy',
#     max_depth=7,
#     min_samples_leaf=7,
#     random_state=SEED,
#     class_weight={"no-recurrence-events": 1.0, "recurrence-events": 5.0}
# )

# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)

# tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# print(f'confusion matrix\n{tp, fp}\n{fn, tn}')
# print(f'accuracy: {accuracy_score(y_test, y_pred)}')
# recal_recorrence_events = classification_report(y_test, y_pred, output_dict=True)["recurrence-events"]["recall"]
# print(f'recall[recurrence-events]: {recal_recorrence_events}')

In [8]:
param_grid = {'criterion':['gini', 'entropy'],
              'max_depth': np.arange(5, 12),
              'min_samples_leaf': np.arange(5, 12),
              'class_weight':[{"recurrence-events": w} for w in [2, 4, 6, 10]]}

tree = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring='roc_auc')

tree.fit(X_train, y_train)
y_pred_proba = tree.predict_proba(X_test)[:, 1]
y_pred = tree.best_estimator_.predict(X_test)
tree_performance = roc_auc_score(y_test, y_pred_proba)

print('DecisionTree: Area under the ROC curve = {}'.format(tree_performance))
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'confusion matrix\n{tp, fp}\n{fn, tn}')
print(f'accuracy: {accuracy_score(y_test, y_pred)}')
recal_recorrence_events = classification_report(y_test, y_pred, output_dict=True)["recurrence-events"]["recall"]
print(f'recall[recurrence-events]: {recal_recorrence_events}')

DecisionTree: Area under the ROC curve = 0.67
confusion matrix
(21, 15)
(4, 11)
accuracy: 0.6274509803921569
recall[recurrence-events]: 0.84
