In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import classification_report, confusion_matrix
import shap

def SplitTrainTestValid(dataset, testRatio=0.2, validRatio=0.2):
    """Split a DataFrame into ordered training, validation and test parts.

    Rows are kept in their existing order (no shuffling): the leading rows
    become the training set, the rows after them the validation set, and the
    trailing rows the test set. The last column is treated as the label and
    every other column as a feature.

    Sizes are truncated to whole rows, so the validation set receives every
    row that is in neither the training nor the test set and may be slightly
    larger than ``validRatio`` suggests.

    Args:
        dataset (pd.DataFrame): the whole dataset to split (label in the last column)
        testRatio (float): ratio of the test data in the whole dataset
        validRatio (float): ratio of the validation data in the whole dataset

    Return:
        pd.DataFrame: train_X, features of the training data
        numpy array: train_y, labels of the training data
        pd.DataFrame: valid_X, features of the validation data
        numpy array: valid_y, labels of the validation data
        pd.DataFrame: test_X, features of the test data
        numpy array: test_y, labels of the test data

    Raises:
        ValueError: if a ratio is negative or the two ratios sum to more than 1
    """
    # Small tolerance so that pairs such as 0.7 + 0.3 are not rejected
    # because of floating point rounding.
    if testRatio < 0 or validRatio < 0 or testRatio + validRatio > 1 + 1e-9:
        raise ValueError(
            "testRatio and validRatio must be non-negative and sum to at most 1, "
            "got testRatio=%r, validRatio=%r" % (testRatio, validRatio)
        )

    # find the size for each type of data
    totalLen = dataset.shape[0]
    trainLen = int(totalLen * (1 - testRatio - validRatio))
    testLen = int(totalLen * testRatio)

    # Slice from an explicit start index rather than with a negative offset:
    # when testLen is 0, `iloc[-0:]` would select *every* row as test data
    # and `iloc[:-0]` would leave nothing for training/validation.
    testStart = totalLen - testLen
    test = dataset.iloc[testStart:, :]
    train_valid = dataset.iloc[:testStart, :]

    train_X, train_y = train_valid.iloc[:trainLen, :-1], train_valid.iloc[:trainLen, -1].values
    valid_X, valid_y = train_valid.iloc[trainLen:, :-1], train_valid.iloc[trainLen:, -1].values
    test_X, test_y = test.iloc[:, :-1], test.iloc[:, -1].values

    return train_X, train_y, valid_X, valid_y, test_X, test_y



def XGBClassiferForSurvey(fileName, testRatio=0.2, validRatio=0.2, maxDepth=9, learningRate=0.15, subsample=0.9,
                          colsample_bytree=0.3, objective='binary:logistic', numIters=100, minChildWeight=1, gamma=0):
    """Predict whether the surveyee will be craving before the next survey using XGBoost.

    Reads the survey data, builds the label ``craving_lead`` (the value of
    ``is_craving`` in the following row), trains a booster on the leading
    rows, then evaluates it on the trailing rows: confusion matrix,
    classification report, ROC curve, feature importance and SHAP summary.

    Args:
        fileName (str): name of the parquet file to read the data from
        testRatio (float): ratio of the test data in the whole dataset
        validRatio (float): ratio of the validation data in the whole dataset
        maxDepth (int): maximum depth of tree  (optional: default = 9)
        learningRate (float): learning rate of XGB model, range [0, 1]  (optional: default = 0.15)
        subsample (float): fraction of observations to be randomly sampled for each tree (optional: default = 0.9)
        colsample_bytree (float): fraction of columns to be randomly sampled for each tree (optional: default = 0.3)
        objective (str): loss function  (optional: default='binary:logistic')
        numIters (int): number of boosting iterations for the XGB model  (optional: default = 100)
        minChildWeight (float): controls when the tree building should terminate, based on comparing the sum
            of the instance weights to minChildWeight.
        gamma (float): minimum loss reduction required to make a further partition on a leaf node of the tree.

    Return:
        int: 1 if craving is predicted for the last row of the test split
            (i.e. for the next survey), 0 otherwise
    """
    # roc_curve / auc are not part of the module-level sklearn imports,
    # so bring them into scope here.
    from sklearn.metrics import auc, roc_curve

    # read in the raw data
    # we will not sample data here because it is unlikely that we will have over 100k of data
    df = pd.read_parquet(fileName, engine='auto')

    # TODO: decide how to handle NaN values after EDA (nothing is done yet)

    # transform the y variable to 0 and 1 (xgb cannot handle factors well).
    # Both the boolean False and the string 'False' map to 0; comparing only
    # against the string would turn every value of a boolean column into 1.
    df['is_craving'] = df['is_craving'].apply(lambda x: 0 if x in (False, 'False') else 1)

    # lead the y variable (it becomes the value we want to predict -- label).
    # The last row has no following survey, so its label is filled with 0.
    # Assigning the result (instead of a chained in-place fillna) makes sure
    # the fill is actually written back to the frame.
    df['craving_lead'] = df['is_craving'].shift(-1).fillna(0).astype(int)

    # maybe add some lags for other features

    # move craving_lead to the last column (the split treats the last column as the label)
    new_cols = [col for col in df.columns if col != 'craving_lead'] + ['craving_lead']
    df = df[new_cols]
    feature_columns = new_cols[:-1]

    # Split data
    train_X, train_y, valid_X, valid_y, test_X, test_y = SplitTrainTestValid(df, testRatio, validRatio)

    # Create XGBoost matrices
    Train = xgb.DMatrix(train_X, label=train_y)
    Valid = xgb.DMatrix(valid_X, label=valid_y)
    Test = xgb.DMatrix(test_X, label=test_y)

    # Setting XGBoost parameters
    parameter = {'learning_rate': learningRate,
                 'max_depth': maxDepth,
                 'colsample_bytree': colsample_bytree,
                 'min_child_weight': minChildWeight,
                 'subsample': subsample,
                 'gamma': gamma,
                 'eval_metric': "auc",
                 'objective': objective
    }

    # TODO: fine tune the above hyperparameters towards the validation dataset;
    # until then the parameters above are used as they are.
    optimalParams = parameter

    # Run XGBoost. Progress is monitored on the training and validation data
    # so that the test data stays untouched until the final evaluation.
    model = xgb.train(params=optimalParams, dtrain=Train, num_boost_round=numIters,
                      evals=[(Train, "train"), (Valid, "valid")], verbose_eval=50)

    # Predictions: keep the raw probabilities for the ROC curve and
    # threshold them at 0.5 for the class labels.
    probabilities = model.predict(Test)
    predictions = np.where(probabilities > 0.5, 1, 0)

    # Confusion Matrix
    cm = confusion_matrix(test_y, predictions)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title('Confusion Matrix')
    fig.colorbar(cax)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

    # Print report
    report = classification_report(test_y, predictions)
    print(report)

    # ROC Curve and AUC, computed from the probabilities: hard 0/1 labels
    # would collapse the curve to a single operating point.
    fpr, tpr, _ = roc_curve(test_y, probabilities)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([-0.02, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    plt.show()

    # feature importances
    xgb.plot_importance(model, max_num_features=10)
    plt.show()

    # Prepare and plot SHAP
    # NOTE(review): the original code stripped the first 4 bytes of
    # save_raw() unconditionally. This looks like the known workaround for
    # XGBoost builds that prefix the raw buffer with b"binf", which older
    # shap releases could not parse -- confirm against the pinned versions.
    # The strip is applied only when that prefix is actually present, so a
    # buffer without it is never corrupted.
    raw_model = model.save_raw()
    if raw_model[:4] == b'binf':
        model_bytearray = raw_model[4:]

        def myfun(self=None):
            return model_bytearray
        model.save_raw = myfun

    # plot SHAP
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(test_X)
    shap.summary_plot(shap_values,
                      test_X,
                      feature_names=feature_columns,
                      max_display=10)

    return predictions[-1]