In [1]:
# This notebook shows a working example of creating an ensemble model using the 'binsembler' package.
# Binsembler is an ensembling technique in which the final prediction is a weighted average of the probabilities predicted by
# each individual model. The weights are the values of a specified evaluation metric (such as Accuracy or F1 Score), calculated
# separately for each bin of predicted probabilities; bins have a specified size (default is 10).
# Steps to create an ensemble model from the models passed to the function:
#    1.  Based on the predicted probabilities of a model, create bins of fixed size (say 10)
#    2.  For each bin, calculate the confusion matrix (TP, FP, TN, FN) and derive other metrics such as Accuracy, AUC, F1 score, Precision, Recall
#    3.  Repeat steps 1 and 2 for every other model
#    4.  Calculate the final probability as a weighted average of the models' probabilities,
#        using any one of the metrics as the weights. E.g. with F1 score and two models m1 and m2:
#        (m1 F1 score * m1 predicted probability + m2 F1 score * m2 predicted probability) / (m1 F1 score + m2 F1 score)

# The package can ensemble any number of models. The user can either supply the models and the training data, or
# provide the probabilities predicted by each model for training.

In [2]:
# Load the packages
from binsembler import Classifier

In [3]:
# Option 1: Providing the predicted probabilities of each model

# Toy data: made-up predicted probabilities and actual labels, one pair per model
y1_preds = [0.25, 0.5, 0.75, 0.9]
y1_acts = [0, 0, 1, 1]
y2_preds = [0.3, 0.6, 0.8, 0.9]
y2_acts = [0, 1, 1, 1]

# Create the ensembler
cl = Classifier()

# Fit the ensembler: one list of probabilities and one list of labels per model
cl.train(
    y_pred_probs=[y1_preds, y2_preds],
    y_acts=[y1_acts, y2_acts],
)
print("Model Coefficients: ")
print(cl.w_)

# Ensemble new probabilities; the outer list again holds one inner list per model
new_probs = [[0.87, 0.90, 0.1], [0.6, 0.7, 0.2]]
preds = cl.predict(y_pred_probs=new_probs)
print(preds)

Model Coefficients: 
  m1_bins  m1_TP  m1_TN  m1_FP  m1_FN  m1_Accuracy  m1_F1_Score  m1_Precision  \
0   20-30      0      1      0      0          1.0          0.0           0.0   
1   40-50      0      0      1      0          0.0          0.0           0.0   
2   70-80      1      0      0      0          1.0          1.0           1.0   
3   80-90      1      0      0      0          1.0          1.0           1.0   

   m1_Recall  m1_AUC  ...  m2_TP m2_TN  m2_FP  m2_FN  m2_Accuracy  \
0        0.0       0  ...      0     1      0      0          1.0   
1        0.0       0  ...      1     0      0      0          1.0   
2        1.0       0  ...      1     0      0      0          1.0   
3        1.0       0  ...      1     0      0      0          1.0   

   m2_F1_Score  m2_Precision  m2_Recall  m2_AUC  m2_PR_AUC  
0          0.0           0.0        0.0       0          0  
1          1.0           1.0        1.0       0          0  
2          1.0           1.0        1.0     

In [4]:
# Option 2: Providing the models and the training data

# Everything needed to build the two example classifiers, imported up front
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Fetch the breast-cancer dataset
data = load_breast_cancer()

# Pull the individual pieces out of the dataset bundle
label_names = data['target_names']
labels = data['target']
feature_names = data['feature_names']
features = data['data']

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.33, random_state=42
)

# First base model: Gaussian Naive Bayes, fitted on the training split
gnb = GaussianNB()
model1 = gnb.fit(X_train, y_train)

# Second base model: logistic regression with default settings
# NOTE(review): with unscaled features the default solver settings may not
# converge on recent scikit-learn versions -- confirm before reusing this model.
logreg = LogisticRegression()
model2 = logreg.fit(X_train, y_train)





In [5]:
# Build a fresh ensembler for the model-based workflow
cl = Classifier()

# Fit the ensembler from the two trained models and the training split
cl.train(
    models=[model1, model2],
    targetcols=['target', 'target'],
    X_train=X_train,
    y_train=y_train,
)

# Show only the bin labels and the per-bin accuracy of each model
accuracy_cols = ['m1_bins', 'm1_Accuracy', 'm2_Accuracy']
print("Model Coefficients: ")
print(cl.w_[accuracy_cols])

# Score the held-out test split with the ensembler and preview the first rows
preds = cl.predict(models=[model1, model2], test_data=X_test)
print(preds.head())

Model Coefficients: 
  m1_bins  m1_Accuracy  m2_Accuracy
0    0-10     0.938017     0.995098
1   10-20     1.000000     0.947368
2   30-40     0.000000     0.500000
3   60-70     0.333333     0.600000
4   80-90     1.000000     0.333333
5  90-100     0.962121     1.000000
   ensmbl_preds  ensmbl_pred_probs  ensmbl_pred_probs0
0             0           0.091431        9.085692e-01
1             1           1.000000        8.953394e-12
2             1           0.999997        3.323592e-06
3             0           0.002090        9.979101e-01
4             0           0.000490        9.995101e-01
