In [1]:
import numpy as np
from sklearn.metrics import make_scorer, confusion_matrix, accuracy_score, precision_score, recall_score

def bkrclf_accuracy_score(y_true, y_pred, *, sample_weight=None):
    """Weighted per-class recall: 1/6 * recall(class 0) + 5/6 * recall(class 1).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 / 1 (the notebook scores class 1
        with ``pos_label=1``).
    y_pred : array-like
        Predicted labels, same encoding as ``y_true``.
    sample_weight : array-like, optional
        Per-sample weights forwarded to ``confusion_matrix``.

    Returns
    -------
    float
        The weighted recall. When one class has no true samples (e.g. a
        cross-validation fold without any positive), its recall is undefined;
        that class is left out and the remaining weight is renormalised, so the
        result is the recall of the class that is present. NaN is returned only
        when neither class has any true sample.
    """
    class_weights = np.array([1/6, 5/6])

    # Pin the label set: without it the matrix is sized from the labels that
    # happen to occur, and a single-class input yields a 1x1 matrix that cannot
    # be combined with the two class weights.
    C = confusion_matrix(y_true, y_pred, labels=[0, 1], sample_weight=sample_weight)
    support = C.sum(axis=1)
    with np.errstate(divide="ignore", invalid="ignore"):
        per_class = np.diag(C) / support

    present = support > 0
    if present.all():
        # Usual case: identical arithmetic to the plain weighted sum.
        return class_weights @ per_class
    if not present.any():
        return float("nan")

    # A class without true samples has recall 0/0; drop it instead of letting
    # the NaN poison the whole score.
    kept = class_weights[present]
    return (kept @ per_class[present]) / kept.sum()
# Scorer object wrapping the weighted-recall metric defined above, built with
# make_scorer so it can be handed to scikit-learn wherever a scorer is expected.
bkrclf_accuracy = make_scorer(score_func=bkrclf_accuracy_score)

In [2]:
import pandas as pd
import numpy as np
import os
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

def MyModel(X):
    """Train the bankruptcy classifier and return its predictions for ``X``.

    The function is self-contained: it reads ``train/5th_yr.csv`` from the data
    directory, fits the pipeline (median imputation -> percentile clipping ->
    standardisation -> row normalisation -> EasyEnsembleClassifier) on 90% of
    it, prints metrics measured on the remaining 10%, and finally predicts on
    the supplied frame.

    Parameters
    ----------
    X : pandas.DataFrame
        Examples to classify. An ``Id`` column is dropped when present and
        object-typed columns are coerced to numbers (unparseable entries become
        NaN and are imputed by the pipeline). The caller's frame is not
        modified.

    Returns
    -------
    array
        Predicted label for every row of ``X``, as returned by the pipeline's
        ``predict``.
    """
    name_ = 'Easy Ensemble APP'

    class OutlierClipper1():
        """Clip each feature to the [10th, 90th] percentile range seen in ``fit``."""

        def __init__(self):
            # One (low, high) pair per column, in column order.
            self.ther = []

        def fit(self, X, y=None):
            X_df = pd.DataFrame(X)
            # Rebuild the list on every fit: appending would keep the bounds of
            # an earlier fit in front, and transform would keep using those.
            self.ther = [(X_df[i].quantile(0.10), X_df[i].quantile(0.90))
                         for i in X_df.columns]
            return self

        def transform(self, X):
            values = pd.DataFrame(X).to_numpy()
            if values.shape[1] != len(self.ther):
                raise ValueError(
                    "OutlierClipper1 has thresholds for {} column(s) but received {}; "
                    "call fit on data with the same columns first".format(
                        len(self.ther), values.shape[1]))
            bounds = np.asarray(self.ther, dtype=float).reshape(-1, 2)
            lows, highs = bounds[:, 0], bounds[:, 1]
            # Thresholds are matched by position, so column labels do not matter;
            # np.where builds new arrays and leaves the input untouched.
            values = np.where(values > highs, highs, values)
            values = np.where(values < lows, lows, values)
            return values

    def _to_numeric_frame(frame):
        """Drop the Id column (if any) and coerce object columns to numeric."""
        frame = frame.drop('Id', axis=1, errors='ignore')  # returns a new frame
        for f in frame.columns:
            if frame[f].dtype == 'O':
                frame[f] = pd.to_numeric(frame[f], errors='coerce')
        return frame

    # convert numeric and drop features
    X = _to_numeric_frame(X)

    # build the pipeline
    transformers_ = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('clipper', OutlierClipper1()),
        ('scaler', StandardScaler()),
        ('normer', Normalizer()),
    ])

    clf_ = EasyEnsembleClassifier(n_estimators=5, random_state=0)
    model_ = Pipeline(steps=[
        ('transformers', transformers_),
        ('clf_final', clf_),
    ])

    # train the model
    DATA_DIR_ = "./Data"

    if not os.path.isdir(DATA_DIR_):
        DATA_DIR_ = "../resource/asnlib/publicdata/bankruptcy/data"

    data_file_ = "5th_yr.csv"
    data_ = _to_numeric_frame(pd.read_csv(os.path.join(DATA_DIR_, "train", data_file_)))

    # The first 64 columns (after dropping Id) are used as features.
    X_ = data_.iloc[:, :64]
    y_ = data_['Bankrupt']

    X_train_, X_test_, y_train_, y_test_ = train_test_split(
        X_.values, y_.values, test_size=0.1, random_state=42)

    model_.fit(X_train_, y_train_)

    # Metrics on the 10% split the model was not fitted on (the printed label
    # says "In Sample Test", kept verbatim).
    y_test_pred_ = model_.predict(X_test_)

    accuracy_test_ = accuracy_score(y_test_, y_test_pred_)
    recall_test_ = recall_score(y_test_, y_test_pred_, pos_label=1, average="binary")
    precision_test_ = precision_score(y_test_, y_test_pred_, pos_label=1, average="binary")
    bkrclf_test_ = bkrclf_accuracy_score(y_test_, y_test_pred_)

    print("\t{m:s} In Sample Test Accuracy: {a:3.1%}, Recall {r:3.1%}, Precision {p:3.1%}, Bkrclf Score {b:3.1%}".format(
        m=name_,
        a=accuracy_test_,
        r=recall_test_,
        p=precision_test_,
        b=bkrclf_test_,
    ))

    # The pipeline was fitted on a positional array, so the rows to predict
    # must present the same features in the same order. When the training
    # feature names are all available, select them explicitly; this also
    # discards any extra column (such as a label column). Otherwise fall back
    # to the frame as given.
    if X_.columns.isin(X.columns).all():
        X = X.loc[:, X_.columns]

    # make predictions
    predictions = model_.predict(X.values)

    return predictions


In [3]:
# Read the holdout examples for the 5th-year data set.
DATA_DIR = "./Data"
holdout_csv = os.path.join(DATA_DIR, "holdout", '5th_yr.csv')
X_hold = pd.read_csv(holdout_csv)

# Run MyModel on the holdout frame and show how many predictions came back.
y_hold_pred = MyModel(X_hold)
y_hold_pred.shape


	Easy Ensemble APP In Sample Test Accuracy: 77.8%, Recall 75.8%, Precision 20.2%, Bkrclf Score 76.1%


(1092,)