In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import ParameterGrid

print ('done')

done


In [3]:
# load data

complete = pd.read_csv('../data/artificial_data_3features.csv')

print(complete.head())


   Class_label  Feature_1  Feature_2  Feature_3
0            0        NaN        NaN   -0.36098
1            1        NaN        NaN        NaN
2            1   6.422475   6.496080        NaN
3            0        NaN        NaN        NaN
4            1        NaN   5.673988    4.38316


In [4]:
# Function: train, test, holdout speration

def dataset_speration(complete):
    complete = complete.sample(frac=1).reset_index(drop=True) # shuffle
    total_row = complete.shape[0]
    train = complete.loc[0:int(total_row * 0.6)-1].reset_index(drop=True)
    test = complete.loc[int(total_row *0.6):int(total_row *0.8)-1].reset_index(drop=True)
    holdout = complete.loc[int(total_row *0.8):total_row].reset_index(drop=True)
    
    return train, test, holdout

# Function: Gerneral XGB model

def xgb_model(X_train, Y_train, X_test, Y_test, X_holdout, Y_holdout):

    XGB = xgb.XGBClassifier()
    # find the best parameter set
    param_grid = {"learning_rate": [0.1],
                  "objective":['binary:logistic'],
                  "reg_alpha": [0.1,1.],
                  "missing": [np.nan], 
                  "reg_lambda": [0.1,1.]}

    scores = np.zeros(len(ParameterGrid(param_grid)))

    for i in range(len(ParameterGrid(param_grid))):
        params = ParameterGrid(param_grid)[i]
        XGB.set_params(**params)
        eval_set = [(X_test, Y_test)]
        XGB.fit(X_train, Y_train, 
                early_stopping_rounds=50, 
                eval_metric="error", eval_set=eval_set,verbose=False)# with early stopping
        Y_test_pred = XGB.predict(X_test, ntree_limit=XGB.best_ntree_limit)
        scores[i] = accuracy_score(Y_test,Y_test_pred)

    best_params = np.array(ParameterGrid(param_grid))[scores == np.max(scores)]
    #print ('Test set max score and best parameters are:')
    #print (np.max(scores))
    #print (best_params)

    # test the model on the holdout set with best parameter set
    XGB.set_params(**best_params[0])
    XGB.fit(X_train,Y_train, 
            early_stopping_rounds=50, 
            eval_metric="error", eval_set=eval_set,verbose=False)
    Y_holdout_pred = XGB.predict(X_holdout, ntree_limit=XGB.best_ntree_limit)
    cnf_matrix = confusion_matrix(Y_holdout,Y_holdout_pred)

    print ('The accuracy is:',accuracy_score(Y_holdout,Y_holdout_pred))
    print ('The confusion matrix is:')
    print (cnf_matrix)
    
    return Y_holdout_pred


In [20]:
# Function: Reduced-feature XGB model
# all the inputs need to be pandas DataFrame

def reduced_feature_xgb(X_train, Y_train, X_test, Y_test, X_holdout, Y_holdout):
    
    # find all unique patterns of missing value in holdout set
    mask = X_holdout.isnull()
    unique_rows = np.array(np.unique(mask, axis=0))
    all_Y_holdout_pred = pd.DataFrame()
    
    print('there are', len(unique_rows), 'unique missing value patterns.')
    print(unique_rows)
    
    # divide holdout sets into subgroups according to the unique patterns
    for i in range(len(unique_rows)):
        print ('working on unique pattern', i)
        ## generate X_holdout subset that matches the unique pattern i
        sub_X_holdout = pd.DataFrame()
        sub_Y_holdout = pd.DataFrame()
        for j in range(len(mask)): # check each row in mask
            row_mask = np.array(mask.iloc[j])
            if np.array_equal(row_mask, unique_rows[i]): # if the pattern matches the ith unique pattern
                sub_X_holdout = sub_X_holdout.append(X_holdout.iloc[j])# append the according X_holdout row j to the subset
                sub_Y_holdout = sub_Y_holdout.append(Y_holdout.iloc[j]).astype(int) # append the according Y_holdout row j
        sub_X_holdout = sub_X_holdout[sub_X_holdout.columns[~unique_rows[i]]]
        
        ## choose the according reduced features for subgroups
        sub_X_train = pd.DataFrame()
        sub_Y_train = pd.DataFrame()
        sub_X_test = pd.DataFrame()
        sub_Y_test = pd.DataFrame()
        # 1.cut the feature columns that have nans in the according sub_X_holdout
        sub_X_train = X_train[X_train.columns[~unique_rows[i]]]
        sub_X_test = X_test[X_test.columns[~unique_rows[i]]]
        # 2.cut the rows in the sub_X_train and sub_X_test that have any nans
        sub_X_train = sub_X_train.dropna()
        sub_X_test = sub_X_test.dropna()   
        # 3.cut the sub_Y_train and sub_Y_test accordingly
        sub_Y_train = Y_train.iloc[sub_X_train.index]
        sub_Y_test = Y_test.iloc[sub_X_test.index]
        
        #print(sub_X_train)
        #print(sub_Y_train)
        #print(sub_X_test)
        #print(sub_Y_test)
        #print(sub_X_holdout)
        #print(sub_Y_holdout)
        
        ## check if sub_X_train or sub_X_test is empty:
        if (sub_X_train.size == 0 or 
            sub_X_test.size == 0 or 
            len(np.unique(sub_Y_train)) == 1):
            
            if (Y_train['Class_label'] == 0).sum() >= (Y_train['Class_label'] == 1).sum():
                sub_Y_holdout_pred = pd.DataFrame(0, index=np.arange(len(sub_Y_holdout)), columns=['sub_Y_holdout_pred'])
            else:
                sub_Y_holdout_pred = pd.DataFrame(1, index=np.arange(len(sub_Y_holdout)), columns=['sub_Y_holdout_pred'])
                        
        else:
            
            ## call xgb function for the subgroups: train and test by local reduced features
            sub_Y_train_list = sub_Y_train[sub_Y_train.columns[0]].tolist()
            sub_Y_test_list = sub_Y_test[sub_Y_test.columns[0]].tolist()
            sub_Y_holdout_list = sub_Y_holdout[sub_Y_holdout.columns[0]].tolist()
        
            sub_Y_holdout_pred = xgb_model(sub_X_train, sub_Y_train_list, sub_X_test, 
                                           sub_Y_test_list, sub_X_holdout, sub_Y_holdout_list)
            sub_Y_holdout_pred = pd.DataFrame(sub_Y_holdout_pred,columns=['sub_Y_holdout_pred'],
                                              index=sub_Y_holdout.index)
            
        all_Y_holdout_pred = all_Y_holdout_pred.append(sub_Y_holdout_pred)
        
    # rank the final Y_holdout_pred according to original Y_holdout index
    all_Y_holdout_pred = all_Y_holdout_pred.sort_index()
    print(all_Y_holdout_pred)
               
    # get global accuracy and the confusion matrix
    total_accuracy_score= accuracy_score(Y_holdout,all_Y_holdout_pred)
    total_cnf_matrix = confusion_matrix(Y_holdout,all_Y_holdout_pred)
    
    return total_accuracy_score, total_cnf_matrix
    

In [22]:
# reduced model test

train, test, holdout = dataset_speration(complete)
#print(train)
#print(test)
#print(holdout)

X_train = train.iloc[:,1:]
Y_train = pd.DataFrame(train['Class_label'])
X_test = test.iloc[:,1:]
Y_test = pd.DataFrame(test['Class_label'])
X_holdout = holdout.iloc[:,1:]
Y_holdout = pd.DataFrame(holdout['Class_label'])

#print(X_train)
#print(Y_train)
#print(X_test)
#print(Y_test)
#print(X_holdout)
#print(Y_holdout)

total_accuracy_score, total_cnf_matrix = reduced_feature_xgb(X_train, Y_train, X_test, Y_test, X_holdout, Y_holdout)
print('the total accuracy of the reduced feature model is:', total_accuracy_score)
print('the total confusion matrix is:')
print(total_cnf_matrix)


there are 7 unique missing value patterns.
[[False False  True]
 [False  True False]
 [False  True  True]
 [ True False False]
 [ True False  True]
 [ True  True False]
 [ True  True  True]]
working on unique pattern 0
The accuracy is: 1.0
The confusion matrix is:
[[1]]
working on unique pattern 1
The accuracy is: 1.0
The confusion matrix is:
[[2]]
working on unique pattern 2
The accuracy is: 1.0
The confusion matrix is:
[[1]]
working on unique pattern 3
The accuracy is: 1.0
The confusion matrix is:
[[2]]
working on unique pattern 4
The accuracy is: 1.0
The confusion matrix is:
[[2 0]
 [0 1]]
working on unique pattern 5
The accuracy is: 1.0
The confusion matrix is:
[[3 0]
 [0 1]]
working on unique pattern 6
    sub_Y_holdout_pred
0                    0
1                    0
1                    0
2                    0
2                    0
3                    1
3                    0
4                    0
5                    0
5                    1
6                    0
6      

In [17]:
a = np.array([1,1,1])
print (np.unique(a))

[1]
