In [2]:
from sklearn.datasets import load_breast_cancer

In [5]:
def load_breast_data():
    breast = load_breast_cancer()
    feature_names = list(breast.feature_names)
    X, y = pd.DataFrame(breast.data, columns=feature_names), breast.target
    dataset = {
        'problem': 'classification',
        'full': {
            'X': X,
            'y': y
        }
    }
    return dataset

def load_adult_data():
    df = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        header=None)
    df.columns = [
        "Age", "WorkClass", "fnlwgt", "Education", "EducationNum",
        "MaritalStatus", "Occupation", "Relationship", "Race", "Gender",
        "CapitalGain", "CapitalLoss", "HoursPerWeek", "NativeCountry", "Income"
    ]
    train_cols = df.columns[0:-1]
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]
    
    dataset = {
        'problem': 'classification',
        'full' : {
            'X': X_df,
            'y': y_df
        }
    }
    
    return dataset

def load_heart_data():
    # https://www.kaggle.com/ronitf/heart-disease-uci
    df = pd.read_csv('heart.csv')
    train_cols = df.columns[0:-1]
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]
    dataset = {
        'problem': 'classification',
        'full': {
            'X': X_df,
            'y': y_df
        }
    }
    
    return dataset

def load_credit_data():
    df = pd.read_csv('creditcard.csv')
    train_cols = df.columns[0:-1]
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]
    dataset = {
        'problem': 'classification',
        'full': {
            'X': X_df,
            'y': y_df
        }
    }

    return dataset

def load_telco_churn_data():
    df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
    train_cols = df.columns[1:-1]
    label = df.columns[-1]
    X_df = df[train_cols]
    y_df = df[label]
    dataset = {
        'problem': 'classification',
        'full': {
            'X': X_df,
            'y': y_df
        }
    }
    
    return dataset

In [4]:
import pyforest
from interpret.glassbox import ExplainableBoostingClassifier

In [17]:
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_validate

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.linear_model import SGDClassifier, LogisticRegression

from interpret.glassbox import ExplainableBoostingClassifier

def format_n(x):
    return "{0:.3f}".format(x)

def process_model(clf, name, X, y, n_splits=3):
    
    ss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=1337)
    scores = cross_validate(
        clf, X, y, scoring='roc_auc', cv=ss,
        n_jobs=None, return_estimator=True)
    
    record = dict()
    record['model_name'] = name
    record['fit_time_mean'] = format_n(np.mean(scores['fit_time']))
    record['fit_time_std'] = format_n(np.std(scores['fit_time']))
    record['test_score_mean'] = format_n(np.mean(scores['test_score']))
    record['test_score_std'] = format_n(np.std(scores['test_score']))
    
    return record

def benchmark_models(dataset_name, X, y, ct=None, n_splits=3, random_state=1337):
    if ct is None:
        is_cat = np.array([dt.kind == 'O' for dt in X.dtypes])
        cat_cols = X.columns.values[is_cat]
        num_cols = X.columns.values[~is_cat]
        
        cat_ohe_step = ('ohe', OneHotEncoder(sparse=False,
                                            handle_unknown='ignore'))
        
        cat_pipe = Pipeline([cat_ohe_step])
        num_pipe = Pipeline([('identity', FunctionTransformer())])
        transformers = [
            ('cat', cat_pipe, cat_cols),
            ('num', num_pipe, num_cols)
        ]
        ct = ColumnTransformer(transformers=transformers)
        
    records = []
    
    summary_record = {}
    summary_record['dataset_name'] = dataset_name
    print()
    print('-' * 78)
    print(dataset_name)
    print('-' * 78)
    print(summary_record)
    print()
    
    pipe = Pipeline([
        ('ct', ct),
        ('std', StandardScaler()),
        ('linear-sgd', SGDClassifier(random_state=random_state))
    ])
    record = process_model(pipe, 'linear-sgd', X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)
    
    pipe = Pipeline([
        ('ct', ct),
        ('std', StandardScaler()),
        ('lr', LogisticRegression(random_state=random_state))
    ])
    record = process_model(pipe, 'lr', X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)
    
    pipe = Pipeline([
        ('ct', ct),
        ('rf-100', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state )),  
    ])
    record = process_model(pipe, 'rf-100', X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)
    
    pipe = Pipeline([
        ('ct', ct),
        ('xgb', XGBClassifier(random_state=random_state))
    ])
    record = process_model(pipe, 'xgb', X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)
    
    ebm_main = ExplainableBoostingClassifier(n_jobs=-1, interactions=0, random_state=random_state)
    record = process_model(ebm_main, 'ebm main', X, y, n_splits=n_splits)
    print(record)
    record.update(summary_record)
    records.append(record)
    
    return records
    
    

In [18]:
results = []
n_splits = 3

dataset = load_heart_data()
result = benchmark_models('heart', dataset['full']['X'], dataset['full']['y'], n_splits=n_splits)
results.append(result)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


------------------------------------------------------------------------------
heart
------------------------------------------------------------------------------
{'dataset_name': 'heart'}



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'linear-sgd', 'fit_time_mean': '0.015', 'fit_time_std': '0.001', 'test_score_mean': '0.882', 'test_score_std': '0.020'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'lr', 'fit_time_mean': '0.016', 'fit_time_std': '0.005', 'test_score_mean': '0.895', 'test_score_std': '0.030'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'rf-100', 'fit_time_mean': '0.382', 'fit_time_std': '0.047', 'test_score_mean': '0.890', 'test_score_std': '0.008'}








<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'xgb', 'fit_time_mean': '0.115', 'fit_time_std': '0.014', 'test_score_mean': '0.851', 'test_score_std': '0.018'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'ebm main', 'fit_time_mean': '1.956', 'fit_time_std': '1.127', 'test_score_mean': '0.915', 'test_score_std': '0.007'}


In [19]:
dataset = load_breast_data()
result = benchmark_models('breast-cancer', dataset['full']['X'], dataset['full']['y'], n_splits=n_splits)
results.append(result)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


------------------------------------------------------------------------------
breast-cancer
------------------------------------------------------------------------------
{'dataset_name': 'breast-cancer'}



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'linear-sgd', 'fit_time_mean': '0.013', 'fit_time_std': '0.004', 'test_score_mean': '0.989', 'test_score_std': '0.008'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'lr', 'fit_time_mean': '0.038', 'fit_time_std': '0.007', 'test_score_mean': '0.994', 'test_score_std': '0.006'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'rf-100', 'fit_time_mean': '0.463', 'fit_time_std': '0.073', 'test_score_mean': '0.992', 'test_score_std': '0.009'}








<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'xgb', 'fit_time_mean': '0.123', 'fit_time_std': '0.010', 'test_score_mean': '0.992', 'test_score_std': '0.010'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'ebm main', 'fit_time_mean': '3.522', 'fit_time_std': '1.623', 'test_score_mean': '0.996', 'test_score_std': '0.005'}


In [21]:
dataset = load_credit_data()
result = benchmark_models('credit-fraud', dataset['full']['X'], dataset['full']['y'], n_splits=n_splits)
results.append(result)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


------------------------------------------------------------------------------
credit-fraud
------------------------------------------------------------------------------
{'dataset_name': 'credit-fraud'}



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'linear-sgd', 'fit_time_mean': '1.680', 'fit_time_std': '0.096', 'test_score_mean': '0.984', 'test_score_std': '0.001'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'lr', 'fit_time_mean': '4.412', 'fit_time_std': '0.203', 'test_score_mean': '0.979', 'test_score_std': '0.002'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'rf-100', 'fit_time_mean': '114.817', 'fit_time_std': '4.545', 'test_score_mean': '0.950', 'test_score_std': '0.007'}














<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'xgb', 'fit_time_mean': '100.249', 'fit_time_std': '5.739', 'test_score_mean': '0.981', 'test_score_std': '0.003'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'ebm main', 'fit_time_mean': '107.532', 'fit_time_std': '12.287', 'test_score_mean': '0.976', 'test_score_std': '0.004'}


In [23]:
dataset = load_telco_churn_data()
result = benchmark_models('telco-churn', dataset['full']['X'], dataset['full']['y'], n_splits=3)
results.append(result)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


------------------------------------------------------------------------------
telco-churn
------------------------------------------------------------------------------
{'dataset_name': 'telco-churn'}



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'linear-sgd', 'fit_time_mean': '7.546', 'fit_time_std': '2.102', 'test_score_mean': '0.800', 'test_score_std': '0.018'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'lr', 'fit_time_mean': '6.187', 'fit_time_std': '0.106', 'test_score_mean': '0.809', 'test_score_std': '0.014'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'rf-100', 'fit_time_mean': '14.133', 'fit_time_std': '0.279', 'test_score_mean': '0.824', 'test_score_std': '0.002'}














<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'xgb', 'fit_time_mean': '73.437', 'fit_time_std': '2.713', 'test_score_mean': '0.825', 'test_score_std': '0.003'}


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

{'model_name': 'ebm main', 'fit_time_mean': '7.757', 'fit_time_std': '1.757', 'test_score_mean': '0.851', 'test_score_std': '0.004'}


In [24]:
records = [item for result in results for item in result]
record_df = pd.DataFrame.from_records(records)[['dataset_name','model_name', 'test_score_mean', 'test_score_std']]
record_df.to_csv('ebm-perf-classification-overnight.csv')

<IPython.core.display.Javascript object>