In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sporgboost.preprocessing import onehot_encode, shuffle
from sporgboost.forests import *
from sklearn.metrics import roc_auc_score

In [2]:
# Load the iris features and labels used to smoke-test the forests
X, y = load_iris(return_X_y=True)

# Seed NumPy's global RNG so the run can be reproduced
# (assumes `shuffle` draws from that global RNG -- TODO confirm)
np.random.seed(1234)

# Preprocessing: one-hot encode the labels, then shuffle X and y
y = onehot_encode(y)
X, y = shuffle(X, y)

# Hold out the last `n_test` rows for testing and train on everything before them
n_test = 50
X_train, X_test = X[:-n_test, :], X[-n_test:, :]
y_train, y_test = y[:-n_test, :], y[-n_test:, :]


In [14]:
# One estimator per algorithm under comparison, keyed by a short label
models = {
    'rf' : RandomForest(),
    'ab' : AdaBoost(),
    'sporf' : SPORF(d=2, s=3),
    'sporgboost' : SPORGBoost(d=2, s=3),
    'rrf' : RotationalRandomForest(K=2),
    'rotboost' : RotBoost(K=2)
}

# Fit every model on the training split. fit() is called purely for its effect
# on the estimator, so a plain loop is used instead of building a throwaway
# list; this also avoids rebinding `_`, which IPython uses for the last output.
for model in models.values():
    model.fit(X_train, y_train)


In [15]:
# Time one full pass that refits every estimator in `models` on the training
# split; the reported figure therefore covers all of the fits together.
%timeit _ = [m.fit(X_train, y_train) for m in models.values()]

1.03 s ± 4.84 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [198]:
import requests
import pandas as pd
import yaml
import numpy as np

def parse_dataset_metadata(path):
    """Read the dataset registry from a YAML file.

    The file is expected to hold a mapping of dataset name to an entry with
    the keys ``url``, ``missing_ind`` and ``columns``, where ``columns`` is
    itself a mapping of column name to a type label.

    Parameters
    ----------
    path : str or path-like
        Location of the YAML file.

    Returns
    -------
    dict
        ``{name: {'url', 'missing_ind', 'columns', 'dtype'}}`` where
        ``columns`` is the list of column names in file order and ``dtype``
        is the original column-name -> type-label mapping.

    Raises
    ------
    KeyError
        If an entry lacks one of the required keys.
    """
    # Use a context manager so the file handle is closed even if parsing fails
    with open(path, 'rb') as stream:
        registry = yaml.safe_load(stream)

    # Restructure each entry so it can be splatted into the dataset loader later
    out = {}
    for name, entry in registry.items():
        column_types = entry['columns']
        out[name] = {
            'url' : entry['url'],
            'missing_ind' : entry['missing_ind'],
            'columns' : list(column_types.keys()),
            'dtype' : column_types
        }

    return out

def get_dataset(url, columns, dtype, missing_ind, timeout=30.0):
    """Download a space-delimited text dataset and return it as a typed DataFrame.

    Parameters
    ----------
    url : str
        Location of the raw data file.
    columns : list of str
        Column names, in the order the fields appear on each row.
    dtype : dict
        Mapping of column name to one of the schema labels ``'ordinal'``,
        ``'target'``, ``'continuous'`` or ``'category'``.
    missing_ind : str
        Token used in the raw file to mark a missing value; replaced by NaN.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line of the file, cast to the mapped dtypes.

    Raises
    ------
    ValueError
        If ``dtype`` contains a label that is not in the schema mapping.
    requests.HTTPError
        If the server answers with an error status.
    """
    # Schema label -> pandas dtype
    dtype_mapping = {
        'ordinal' : 'float32',
        'target' : 'category',
        'continuous' : 'float32',
        "category" : "category"
    }

    # Validate the schema first so a bad config fails before any network traffic
    bad_dtypes = set(dtype.values()) - set(dtype_mapping)
    if len(bad_dtypes) > 0:
        raise ValueError(f"Bad dtypes found in schema: {bad_dtypes}")
    pd_dtypes = {col : dtype_mapping[t] for col, t in dtype.items()}

    # Grab data from web; a timeout stops a dead server from hanging the kernel
    response = requests.get(url, timeout=timeout)
    # Without this check an HTML error page would be parsed as if it were data
    response.raise_for_status()

    # Decode, split into lines, drop blank lines, then split each line on single spaces
    text = response.content.decode('utf-8')
    rows = [line.strip().split(" ") for line in text.split('\n') if line.strip() != ""]

    # Build the frame, mark missing values, and apply the schema
    df = pd.DataFrame(rows, columns = columns) \
    .replace(missing_ind, np.nan) \
    .astype(pd_dtypes)

    return df

# Read the dataset registry, then download every dataset it describes
meta = parse_dataset_metadata('datasets.yml')

dfs = {}
for name, info in meta.items():
    # Each registry entry's keys line up with get_dataset's keyword arguments
    dfs[name] = get_dataset(**info)

In [199]:
# Display the loaded datasets: a dict mapping each dataset name to its DataFrame
dfs

{'horse-colic':     Surgery  Age Hospital_Number  Rectal_Temperature  Pulse  Respiratory_Rate  \
 0         2  1.0          530101           38.500000   66.0              28.0   
 1         1  1.0          534817           39.200001   88.0              20.0   
 2         2  1.0          530334           38.299999   40.0              24.0   
 3         1  9.0         5290409           39.099998  164.0              84.0   
 4         2  1.0          530255           37.299999  104.0              35.0   
 ..      ...  ...             ...                 ...    ...               ...   
 295       1  1.0          533886                 NaN  120.0              70.0   
 296       2  1.0          527702           37.200001   72.0              24.0   
 297       1  1.0          529386           37.500000   72.0              30.0   
 298       1  1.0          530612           36.500000  100.0              24.0   
 299       1  1.0          534618           37.200001   40.0              20.0   
 

In [196]:
# get_dataset already replaces the missing-value token and applies the dtype
# cast, so each entry of `dfs` is a finished DataFrame and its schema can be
# read directly instead of rebuilding the frame from positional tuple fields.
dfs['horse-colic'].dtypes

Surgery                        category
Age                             float32
Hospital_Number                category
Rectal_Temperature              float32
Pulse                           float32
Respiratory_Rate                float32
Temperature_Extremities         float32
Peripheral_Pulse               category
Mucous_Membranes               category
Capilary_Refill_Time            float32
Pain                            float32
Peristalsis                     float32
Abdominal_Distension            float32
Nasogastric_Tube                float32
Nasogastric_Reflux             category
Nasogastric_Reflux_PH           float32
Rectal_Examination_Feces       category
Abdomen                        category
Packed_Cell_Volume              float32
Total_Protein                   float32
Abdominocentesis_Appearance     float32
Abdomcentesis_Total_Protein     float32
Outcome                        category
Surgical_Lesion                 float32
Lesion1                        category


In [56]:
# Process data from UCI
def get_data(url, timeout=30.0):
    """Download a space-delimited text file and split it into rows of fields.

    NOTE(review): the body of this function was missing (it returned an
    undefined name). It has been reconstructed to mirror the fetch-and-split
    step of ``get_dataset`` -- confirm this matches the original intent.

    Parameters
    ----------
    url : str
        Location of the raw data file.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    list of list of str
        One list of fields per non-blank line of the file.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error rather than parsing an error page as data
    response.raise_for_status()

    # Decode, split into lines, drop blank lines, then split each line on single spaces
    text = response.content.decode('utf-8')
    rows = [line.strip().split(" ") for line in text.split('\n') if line.strip() != ""]

    return rows

# url_data = "https://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.data"
# data_rows = get_data(url_data)
# NOTE(review): neither `get_metadata` nor `url_meta` is defined in the cells
# visible here, so this line presumably fails on a fresh kernel unless both
# come from elsewhere -- confirm and restore their definitions.
metadata = get_metadata(url_meta)

In [133]:
# Find the lines associated with the attribute information
import numpy as np
doc_lines = pd.Series(metadata.decode('utf-8').split("\n"), name='lines')
doc_lines = doc_lines.str.strip()
# meta = meta[meta != ''].reset_index(drop=True)

# The section headings are matched literally (regex=False): with the default
# regex matching, the "." in "7. Attribute" would match any character.
attr_start = doc_lines.index[doc_lines.str.contains("7. Attribute", regex=False)].values[0] + 1
attr_end = doc_lines.index[doc_lines.str.contains("8. Missing", regex=False)].values[0] - 1

# .loc slicing is label-based and inclusive, hence the +1 / -1 above to drop
# the two heading lines themselves
meta = doc_lines.loc[attr_start:attr_end]

# Get series names: keep the lines that start with "<number>:" and strip
# everything up to and including the last colon on the line
numbered = meta[meta.str.match("^[0-9]+:")]
cols = numbered.str.replace("^.*:", "", regex=True).str.strip().values
cols

array(['surgery?', 'Age', 'Hospital Number', 'rectal temperature',
       'pulse', 'respiratory rate', 'temperature of extremities',
       'peripheral pulse', 'mucous membranes', 'capillary refill time',
       "pain - a subjective judgement of the horse's pain level",
       'peristalsis', 'abdominal distension', 'nasogastric tube',
       'nasogastric reflux', 'nasogastric reflux PH',
       'rectal examination - feces', 'abdomen', 'packed cell volume',
       'total protein', 'abdominocentesis appearance',
       'abdomcentesis total protein', 'outcome', 'surgical lesion?',
       'cp_data'], dtype=object)

In [119]:
# Leftover interactive help lookup: the trailing `?` asks IPython for
# documentation instead of evaluating the expression. The captured output
# below appears to describe the built-in str.split rather than the pandas
# accessor -- confirm.
# NOTE(review): "^[0-9]:" only matches a single-digit prefix, unlike the
# "^[0-9]+:" pattern used when building `cols` -- confirm which is intended.
meta[meta.str.match("^[0-9]:")].str.split?

[1;31mSignature:[0m [0mstr[0m[1;33m.[0m[0msplit[0m[1;33m([0m[0mself[0m[1;33m,[0m [1;33m/[0m[1;33m,[0m [0msep[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mmaxsplit[0m[1;33m=[0m[1;33m-[0m[1;36m1[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Return a list of the words in the string, using sep as the delimiter string.

sep
  The delimiter according which to split the string.
  None (the default value) means split according to any whitespace,
  and discard empty strings from the result.
maxsplit
  Maximum number of splits to do.
  -1 (the default value) means no limit.
[1;31mType:[0m      method_descriptor


In [None]:
# Scratch expression from an unexecuted cell: evaluates to the `.str` accessor
# of the filtered lines.
# NOTE(review): "^[0-9]:" only matches a single-digit prefix, unlike the
# "^[0-9]+:" pattern used when building `cols` -- confirm which is intended.
meta[meta.str.match("^[0-9]:")].str

In [13]:
# Get AUC scores: one ROC AUC per fitted model, computed on the held-out split
# NOTE(review): y_test is sliced from the one-hot encoded labels built in the
# preprocessing cell; confirm that roc_auc_score honours multi_class='ovo' for
# a 2-D indicator target rather than scoring it as multilabel -- TODO verify.
auc = {}
for key, model in models.items():
    class_probs = model.predict_proba(X_test)
    auc[key] = roc_auc_score(y_test, class_probs, multi_class='ovo')
auc

{'rf': 0.998799148408656,
 'ab': 0.978074217802571,
 'sporf': 1.0,
 'sporgboost': 1.0,
 'rrf': 1.0,
 'rotboost': 0.9915940388605923}