In [1]:
import sklearn
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from hybridboost import HybridBoost 
from smote import SMOTEBoost 
from imblearn.over_sampling import SMOTE, ADASYN

In [2]:
df = pd.read_csv("pima-indians-diabetes.csv")
df.head()

Unnamed: 0,num_pregnant,plasma_glucose,blood_pressure,skin_fold_thickness,serum_insulin,bmi,diabetes_pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Outcomes distribution
df["class"].value_counts()

0    500
1    268
Name: class, dtype: int64

In [4]:
clf1 = HybridBoost(random_state=0, n_samples=232)
X, y= df.iloc[:,:-1].values, df["class"].values

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=0)

In [6]:
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[148,  57],
       [ 23,  80]])

In [7]:
roc_auc_score(y_test, y_pred)

0.7493251243192044

In [8]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, output_dict=False))

              precision    recall  f1-score   support

           0       0.87      0.72      0.79       205
           1       0.58      0.78      0.67       103

   micro avg       0.74      0.74      0.74       308
   macro avg       0.72      0.75      0.73       308
weighted avg       0.77      0.74      0.75       308



In [9]:
clf2 = SMOTEBoost(random_state=0, n_samples=232)
X, y= df.iloc[:,:-1].values, df["class"].values

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=0)

In [11]:
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 90, 115],
       [  9,  94]])

In [12]:
roc_auc_score(y_test, y_pred)

0.6758228747336017

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.44      0.59       205
           1       0.45      0.91      0.60       103

   micro avg       0.60      0.60      0.60       308
   macro avg       0.68      0.68      0.60       308
weighted avg       0.76      0.60      0.60       308

