In [17]:
import sklearn
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from hybridboost import HybridBoost 
from smote import SMOTEBoost 
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.tree import DecisionTreeClassifier as DT

In [19]:
class smoteBoost(object):
    def __init__(self, base_learner=DT(max_depth=2),
                 n_estimators=3,
                 smote_ratio=10,
                 class_numsamples_dict=False,
                 df_smote=False,
                 smote_decay='linear'):
        self.m = base_learner
        self.T = n_estimators
        self.smote_ratio = smote_ratio
        self.class_numsamples_dict = class_numsamples_dict
        self.df_smote = df_smote
        self.smote_decay = smote_decay

    def fit(self, x, y):
        # initialize list to hold models and model weights (alphas)
        self.models = []
        self.alphas = []

        xconstant = x.copy()
        yconstant = y.copy()

        # initialize weights
        w0 = 1.0 / x.shape[0]
        w = pd.Series([w0]*x.shape[0], index=x.index)
        w.name = 'w'
        w_index = w.index.values

        # iterate T times
        for i in range(self.T):
            # modify the distribution by creating N synthetic examples from minority class
            synthetic_indices = []
            for c in self.class_numsamples_dict:
                df_c = self.df_smote[self.df_smote.cl == c]
                synthetic_ind = random.sample(df_c.index, \
                    np.minimum(int(self.class_numsamples_dict[c]*self.smote_ratio), df_c.shape[0]))
                for i in synthetic_ind:
                    synthetic_indices.append(i)
            synthetic_df_round = self.df_smote.loc[synthetic_indices]
            xsynthetic = synthetic_df_round.drop(['cl','filepath'], axis=1)
            ysynthetic = synthetic_df_round.cl
            xsmote = pd.concat([x, xsynthetic])
            ysmote = pd.concat([y, ysynthetic])

            # train a weak learner
            clf = self.m.fit(xsmote, ysmote)
            self.models.append(clf)

            # make predictions and compute loss
            predtrain = clf.predict(xconstant)
            df_err = pd.DataFrame({'pred':predtrain, 'actual':y})
            h_t = (df_err['pred'] != df_err['actual'])*1
            e = 0.5*h_t.values.dot(w.values).sum()

            # update alpha
            alpha = np.log((1-e)/e)
            self.alphas.append(alpha)

            # update weights
            h = h_t.replace(0, -1).values
            w = w.values*(np.exp(-alpha*h))
            w = w / w.sum()
            w = pd.Series(w, index=w_index)         

            # update the data frame
            new_indices = np.random.choice(w.index,
                                           size=xconstant.shape[0],
                                           replace=True,
                                           p=w.values)
            x = xconstant.loc[new_indices]
            y = yconstant.loc[new_indices]  

    def predict(self, x):
        # iterate through each model and stuff predictions in predictions_df
        predictions = []
        for m in self.models:
            # create a prediction
            pred = m.predict(x)
            predictions.append(pred)
        predictions = pd.DataFrame(predictions).transpose()

        # predict the class with the largest sum of alphas
        weighted_predictions = []
        for index, value in predictions.iterrows():
            predictions_prob_dict = {}
            for i in range(len(self.alphas)):
                v = value.iloc[i]
                if v not in predictions_prob_dict.keys():
                    predictions_prob_dict[v] = self.alphas[i]
                else:
                    predictions_prob_dict[v] += self.alphas[i]
            weighted_predictions.append(max(predictions_prob_dict, key=predictions_prob_dict.get))

        return weighted_predictions

    def predict_proba(self, x):
        predictions = []
        for m in self.models:
            pred_proba = m.predict_proba(x)
            predictions.append(pred_proba)
        weighted_predictions = []
        for i in range(len(predictions)):
            weighted_predictions.append(self.alphas[i]*predictions[i])
        predict_proba = sum(weighted_predictions)
        return predict_proba

In [2]:
df = pd.read_csv("pima-indians-diabetes.csv")
df.head()

Unnamed: 0,num_pregnant,plasma_glucose,blood_pressure,skin_fold_thickness,serum_insulin,bmi,diabetes_pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Outcomes distribution
df["class"].value_counts()

0    500
1    268
Name: class, dtype: int64

In [4]:
clf1 = HybridBoost(random_state=0, n_samples=232)
X, y= df.iloc[:,:-1].values, df["class"].values

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=0)

In [6]:
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[148,  57],
       [ 23,  80]])

In [7]:
roc_auc_score(y_test, y_pred)

0.7493251243192044

In [8]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.72      0.79       205
           1       0.58      0.78      0.67       103

   micro avg       0.74      0.74      0.74       308
   macro avg       0.72      0.75      0.73       308
weighted avg       0.77      0.74      0.75       308



In [22]:
clf2 = smoteBoost()
X, y= df.iloc[:,:-1].values, df["class"].values

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=0)

In [28]:
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
confusion_matrix(y_test, y_pred)

TypeError: 'bool' object is not iterable

In [12]:
roc_auc_score(y_test, y_pred)

0.6758228747336017

In [13]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.44      0.59       205
           1       0.45      0.91      0.60       103

   micro avg       0.60      0.60      0.60       308
   macro avg       0.68      0.68      0.60       308
weighted avg       0.76      0.60      0.60       308



In [14]:
X_resampled, y_resampled = SMOTE().fit_resample(X_train, y_train)

AttributeError: 'SMOTE' object has no attribute 'fit_resample'

In [None]:
import platform; print(platform.platform())
import sys; print("Python", sys.version)
import numpy; print("NumPy", numpy.__version__)
import scipy; print("SciPy", scipy.__version__)
import sklearn; print("Scikit-Learn", sklearn.__version__)
import sklearn; print("imblearn", sklearn.__version__)