# Demo

An example of how to use the custom classifiers in `src/myml/classifiers`.

In [57]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from myml.classifiers.balancedbagging import RatioBaggingClassifier, BaggingClassifier

In [38]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
def pos_ratio(targets):
    """Return the sum of `targets` as a percentage of its length.

    For a 0/1 label vector this is the percentage of positive samples.
    An empty `targets` raises ZeroDivisionError.
    """
    total = len(targets)
    positives = sum(targets)
    fraction = positives / total
    return fraction * 100

# Prep the dataset

In [17]:
# Load the bundled breast-cancer dataset, then pull out the feature matrix
# and the label vector separately.
data = load_breast_cancer()
X = data.data
y = data.target

The breast cancer dataset is a classic and very easy binary classification dataset.

|Classes | Samples per class | Samples total| Dimensionality| Features|
| --- | :-----------:| ---:|---:|--------:|
|2 | 212(M),357(B) |569 |30 |real, positive|

In [18]:
# Percentage of samples labelled 1 in the dataset as loaded.
pos_ratio(y)

62.741652021089635

Let's make the dataset a little more imbalanced by dropping half of the negatives (`outcome == 0`).

In [19]:
# Put features and labels side by side in a single frame; the label column
# is appended last and named 'outcome'.
feature_and_label_values = np.column_stack([data.data, data.target])
column_labels = list(data.feature_names) + ['outcome']
df = pd.DataFrame(data=feature_and_label_values, columns=column_labels)
df['outcome'].value_counts()

1.0    357
0.0    212
Name: outcome, dtype: int64

In [20]:
# Keep every outcome == 1 row and a random half of the outcome == 0 rows.
# The fixed random_state makes the down-sampling deterministic, so the same
# rows are kept on every Restart & Run All.
sampled_df = pd.concat(
    [
        df[df.outcome == 1],
        df[df.outcome == 0].sample(frac=0.5, random_state=42),
    ],
    ignore_index=True,
)
sampled_df['outcome'].value_counts()

1.0    357
0.0    106
Name: outcome, dtype: int64

Let's invert the outcome values, since we want to predict the minority class.

In [21]:
# Flip the labels: rows that were 0 become 1 and everything else becomes 0.
# NOTE: this overwrites the column it reads, so re-running the cell flips
# the labels back again.
was_zero = sampled_df['outcome'] == 0
sampled_df['outcome'] = was_zero.astype('int64')

Split again into features and outcomes.

In [22]:
# Shuffle the rows, then separate the feature columns from the label column.
# The fixed random_state makes the permutation deterministic; without it the
# row order (and anything derived from it) changes on every run.
sampled_df = sampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
X = sampled_df[data.feature_names]
y = sampled_df['outcome']

In [23]:
# Percentage of samples labelled 1 in the resampled 'outcome' column.
pos_ratio(y)

22.894168466522675

# Fit the models

In [109]:
# Hold out a test partition; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [112]:
# Baseline: a single, deliberately small (5-estimator) gradient-boosting model.
model1 = GradientBoostingClassifier(n_estimators=5)
# Left as the cell's last expression so the fitted estimator's repr is shown.
model1.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=5,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [113]:
# Wrap the same small gradient-boosting model in the custom RatioBaggingClassifier.
classifier = GradientBoostingClassifier(n_estimators=5)
# NOTE(review): `ratios` presumably holds per-class sampling ratios, one entry
# per sub-model (three here) — confirm against RatioBaggingClassifier.
model2 = RatioBaggingClassifier(classifier, 
                                 ratios=([1, 0.5, 0.25], [1, 1, 1])
                                )
model2.fit(X_train, y_train)

Model 1 trained
Model 2 trained
Model 3 trained


In [114]:
# Wrap the same small gradient-boosting model in the custom BaggingClassifier
# with 10 sub-models.
classifier = GradientBoostingClassifier(n_estimators=5)
# NOTE(review): `d_ratios` presumably maps class label -> sampling ratio —
# confirm against BaggingClassifier.
model3 = BaggingClassifier(classifier, n_estimators=10, d_ratios={0:1, 1:1})
model3.fit(X_train, y_train)

Model 0 trained
Model 1 trained
Model 2 trained
Model 3 trained
Model 4 trained
Model 5 trained
Model 6 trained
Model 7 trained
Model 8 trained
Model 9 trained


# Compare results

In [115]:
# Average precision on the held-out split, keyed by each model's class name.
# NOTE: two models of the same class would overwrite each other's entry.
d_scores = {}
for model in (model1, model2, model3):
    # Column 1 of predict_proba is used as the score for label 1
    # (presumably the positive class for the custom classifiers too — confirm).
    test_scores = model.predict_proba(X_test)[:, 1]
    average_precision = average_precision_score(y_test, test_scores)
    d_scores[type(model).__name__] = average_precision

In [127]:
# One row per model (indexed by class name), one column holding its score.
pd.DataFrame.from_dict(
    d_scores,
    orient='index',
    columns=['Av_prec'],
)

Unnamed: 0,Av_prec
GradientBoostingClassifier,0.862273
RatioBaggingClassifier,0.976594
BaggingClassifier,0.893997


Great! Our ensembles of ensembles outperformed the base classifier. (Note: in this context, a single GradientBoostingClassifier with more estimators would produce results comparable to RatioBaggingClassifier, but this is not always true!)