In [30]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from SIRSYNBoosting import SirSynXGB
from imblearn.over_sampling import SMOTE
import xgboost as xgb
# Synthetic binary classification data with a strong class imbalance:
# `weights` asks for roughly 95% of samples in class 0 and 5% in class 1.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=2,
    n_redundant=10,
    weights=[0.95, 0.05],
    random_state=42,
)

# 70/30 train/test split; `stratify=y` keeps the class proportions the same
# in both parts, which matters with so few positive samples.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Wrap both splits in XGBoost's DMatrix container, labels attached.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


In [33]:
def proposed_sample(X, y, random_state=0):
    """
    Resample (X, y) with SMOTE and return the resampled rows split by class.

    This function is the plug-in point for the synthetic data generation
    algorithm you want to adapt: replace the SMOTE call with your own
    generator, then separate the y=1 samples from the y=0 samples and
    return them as shown here.

    Parameters
    ----------
    X : feature matrix handed to ``SMOTE.fit_resample``.
    y : binary labels aligned with the rows of ``X``.
    random_state : seed forwarded to ``SMOTE``.

    Returns
    -------
    (z1, z0) : rows of the resampled features whose label is above 0.5,
        followed by the rows whose label is below 0.5.
    """
    oversampler = SMOTE(random_state=random_state)
    X_res, y_res = oversampler.fit_resample(X, y)

    # Labels are compared against 0.5 rather than tested for equality, so
    # both integer and float encodings of the 0/1 labels are accepted.
    is_positive = y_res > 0.5
    is_negative = y_res < 0.5
    return X_res[is_positive, :], X_res[is_negative, :]


# XGBoost booster parameters, passed to SirSynXGB.fit via `params`.
params = {
    "objective": "binary:logistic",  # binary classification, probability output
    "max_depth": 5,                  # maximum depth of each tree
    "eta": 0.1,                      # learning rate (step size shrinkage)
    "reg_lambda": 1,                 # L2 regularization on leaf weights
    "verbosity": 0,                  # silent
    "seed": 324,                     # random seed
    "eval_metric": "aucpr",          # area under the precision-recall curve
}

In [34]:
# Booster that obtains its synthetic samples through `proposed_sample`.
# The keyword spellings `n_estimator` and `ealry_stoping` are kept exactly as
# written here; presumably they match the SirSynXGB signature -- verify
# against the library before "correcting" them.
model = SirSynXGB(n_estimator=100, proposed_sampler=proposed_sample)

# NOTE(review): `dtest` is handed to fit() together with the early-stopping
# setting and is also the set scored below. If fit() uses `dtest` to decide
# when to stop, the confusion matrix is optimistic -- confirm, and consider a
# separate validation split.
model.fit(
    dtrain=dtrain,
    dtest=dtest,
    ealry_stoping=10,
    params=params,
)

# Presumably predicted probabilities of class 1 (objective is binary:logistic);
# rounding to the nearest integer turns them into hard 0/1 labels.
prob_y1 = model.predict(dtest=dtest)
y_pred = np.round(prob_y1)

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[282,   2],
       [  4,  12]])