In [13]:
import pandas as pd
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.tabular.metadata.sortinghat import SortingHatFeatureMetadataEngine
from sklearn.model_selection import train_test_split

In [14]:
metadata = pd.read_csv("./metadata/metadata.csv")

In [15]:
metadata

Unnamed: 0,name,link,target,truthVector
0,Cancer,https://archive.ics.uci.edu/ml/datasets/Breast...,Classification,"[0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Mfeat,https://www.openml.org/d/12,class,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Nursery,https://archive.ics.uci.edu/ml/datasets/nursery,class,"[1, 1, 1, 1, 1, 1, 1, 1]"
3,Audiology,https://www.openml.org/d/7,class,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
4,Hayes,https://archive.ics.uci.edu/ml/datasets/Hayes-...,class,"[7, 1.0, 1.0, 1.0, 1.0]"
5,Supreme,https://www.openml.org/d/728,binaryClass,"[1, 1, 1, 1, 1, 1, 1]"
6,Flares,https://archive.ics.uci.edu/ml/datasets/Solar+...,c,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."
7,Kropt,https://www.openml.org/d/184,game,"[1, 1, 1, 1, 1, 1]"
8,Boxing,https://www.openml.org/d/444,Winner,"[7,1,1,1]"
9,Flags,https://www.openml.org/d/1012,binaryClass,"[1, 1, 0,0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1..."


In [16]:
classif_data = {
    "NU" : ["Cancer", "MFeat"],
    "CA" : ["Nursery", "Audiology", "Hayes", "Supreme", "Flares", "Kropt", "Boxing"],
    "CA+NG" : ["Apnea2"],
    "NU+CA" : ["Flags", "Diggle", "Hearts", "Sleuth"],
    "NU+CA+ST" : ["Auto-MPG"],
    "NU+CA+ST+NG" : ["Clothing"],
    "NU+DT+NG" : ["IOT"],
    "NU+DT+EN" : ["NYC"],
    "ST" : ["BBC"],
    "DT+ST" : ["Articles"],
    "NG+CA" : ["Zoo"],
    "NU+CA+EN" : ["Churn"],
    "NU+CA+EN+NG" : ["PBCseq"],
    "NU+CA+LST+NG+CS" : ["Pokemon"],
    "NU+CA+DT+URL+NG+CS" : ["President"]
}

reg_data = {
    "CA" : ["MBA"],
    "NU+CA" : ["Vineyard", "Apnea"],
    "DT" : ["Accident"],
    "NU+CA+EN+NG" : ["Car Fuel"]
}

In [31]:
sh_engine = SortingHatFeatureMetadataEngine()

def pipeline(data_dict, category, metadata):
    res = []
    for name in data_dict[category]:
        file_name = name.lower().replace(' ', '_') + '.csv'
        df = pd.read_csv('./data/' + file_name)
        truth_vec = metadata.loc[metadata.name == name].iloc[0,3]
        label = list(df.columns)[-1]  # specifies which column do we want to predict
        data = TabularDataset(df)
        train_data, test_data = train_test_split(data, test_size=0.2, random_state=25)
        #perf0 = agl_downstream(df, train_data, test_data, label, predictor_type=0, truth_vec=truth_vec)
        perf0=None
        perf1 = agl_downstream(df, train_data, test_data, label, predictor_type=1)
        perf2 = agl_downstream(df, train_data, test_data, label, predictor_type=2)
        res.append([name, perf0, perf1, perf2])
    return pd.DataFrame(res, columns=['Name', 'Truth', 'AGL', 'AGL+SH'])

'''
predictor_type: {0, 1, 2}
0 represents using true feature types
1 represents using AutoGluon auto-inferred feature types
2 represents using SortingHat inferred feature types
'''
def agl_downstream(df, train_data, test_data, label, predictor_type=1, truth_vec=None):
    # exclude other tree based models
    excluded = ['CAT', 'GBM', 'XT', 'custom']
    if predictor_type == 0:
        true_feature_metadata = sh_engine.to_feature_metadata(df, truth_vec)
        predictor = TabularPredictor(label=label, eval_metric= "accuracy").fit(train_data, feature_metadata=true_feature_metadata, presets='best_quality', excluded_model_types=excluded)
    elif predictor_type == 2:
        predictor = TabularPredictor(label=label, eval_metric= "accuracy").fit(train_data, use_metadata_engine=True, presets='best_quality', excluded_model_types=excluded)
    else:
        predictor = TabularPredictor(label=label, eval_metric= "accuracy").fit(train_data, presets='best_quality', excluded_model_types=excluded)

    # results = predictor.fit_summary(show_plot=True)
    # Inference time:
    y_test = test_data[label]
    test_data = test_data.drop(labels=[label], axis=1)  # delete labels from test data since we wouldn't have them in practice
    print(test_data.head())
    y_pred = predictor.predict(test_data)
    perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)
    predictor.leaderboard()
    return perf

pipeline(classif_data, 'CA', metadata)

No path specified. Models will be saved in: "AutogluonModels/ag-20211120_063509/"
Presets specified: ['best_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20211120_063509/"
AutoGluon Version:  0.3.1b20211115
Train Data Rows:    10368
Train Data Columns: 8
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	5 unique label values:  ['very_recom', 'spec_prior', 'priority', 'not_recom', 'recommend']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Fraction of data from classes with at least 10 examples that will be kept for training models: 0.9998070987654321
Train Data Class Count: 4
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    1311.16 MB
	Train Da

           parents   has_nurs        form children     housing     finance  \
4865   pretentious     proper  incomplete        3  convenient  convenient   
6255   pretentious   improper    complete     more    critical      inconv   
10567   great_pret   improper    complete     more    critical  convenient   
7863   pretentious  very_crit    complete        2   less_conv      inconv   
11923   great_pret   critical      foster        1    critical  convenient   

              social       health  
4865   slightly_prob    not_recom  
6255         nonprob  recommended  
10567        nonprob     priority  
7863     problematic  recommended  
11923    problematic     priority  


Evaluation: accuracy on test data: 0.9996141975308642
Evaluations on test data:
{
    "accuracy": 0.9996141975308642,
    "balanced_accuracy": 0.9997119815668203,
    "mcc": 0.9994372287518047
}
No path specified. Models will be saved in: "AutogluonModels/ag-20211120_063539/"
Presets specified: ['best_quality']
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20211120_063539/"
AutoGluon Version:  0.3.1b20211115
Train Data Rows:    10368
Train Data Columns: 8
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	5 unique label values:  ['very_recom', 'spec_prior', 'priority', 'not_recom', 'recommend']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Fraction of data from classes with at least 10 examples that will be kept for training models: 0.

9 9
9 9 9 9
9


			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('category', []) : 8 | ['parents', 'has_nurs', 'form', 'children', 'housing', ...]
	Types of features in processed data (raw dtype, special dtypes):
		('category', [])  : 7 | ['parents', 'has_nurs', 'form', 'children', 'housing', ...]
		('int', ['bool']) : 1 | ['finance']
	0.6s = Fit runtime
	8 features in original data used to generate 8 features in processed data.
	Train Data (Processed) Memory Usage: 0.09 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.6s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accur

           parents   has_nurs        form children     housing     finance  \
4865   pretentious     proper  incomplete        3  convenient  convenient   
6255   pretentious   improper    complete     more    critical      inconv   
10567   great_pret   improper    complete     more    critical  convenient   
7863   pretentious  very_crit    complete        2   less_conv      inconv   
11923   great_pret   critical      foster        1    critical  convenient   

              social       health  
4865   slightly_prob    not_recom  
6255         nonprob  recommended  
10567        nonprob     priority  
7863     problematic  recommended  
11923    problematic     priority  


Evaluation: accuracy on test data: 0.9996141975308642
Evaluations on test data:
{
    "accuracy": 0.9996141975308642,
    "balanced_accuracy": 0.9997119815668203,
    "mcc": 0.9994372287518047
}


KeyboardInterrupt: 