In [3]:
# Create and activate the conda environment from conda.yaml to set up AutoGluon before running this code
import os
import numpy as np
import pandas as pd
import torch

from autogluon.tabular import TabularDataset, TabularPredictor

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# Helper for getting the data: converts a saved .pt file into a pandas DataFrame
# input_dataframe = pd.read_parquet('s3://cortex-dsc-2023-data/sprint_data/sprint_train.parquet')
def create_dataframe(pt_path):
    """Load a ``.pt`` file of ``(pid, features, label)`` records into a flat DataFrame.

    Parameters
    ----------
    pt_path : str or path-like
        Location of a file written with ``torch.save`` that holds a sequence
        of 3-item records ``(pid, features, label)``, where ``features`` is a
        1-D ``torch.Tensor``.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``pid``, ``label`` and
        ``feature_0 .. feature_{n-1}``; ``n`` is the length of the first
        record's feature tensor.

    Raises
    ------
    ValueError
        If the file contains no records.
    """
    # NOTE(security): torch.load unpickles the file, which can execute
    # arbitrary code. Only load .pt files that come from a trusted source.
    data = torch.load(pt_path)

    records = list(data)
    if not records:
        raise ValueError(f'No records found in {pt_path!r}')

    # Transpose the records into three parallel tuples.
    pids, feats, labels = zip(*records)

    # Column names are derived from the width of the first feature tensor.
    feature_columns = [f'feature_{i}' for i in range(len(feats[0]))]

    # detach() lets tensors that track gradients be converted to numpy;
    # cpu() moves tensors off an accelerator first if they live on one.
    feature_rows = [feat.detach().cpu().numpy().tolist() for feat in feats]

    meta_df = pd.DataFrame({'pid': pids, 'label': labels})
    feature_df = pd.DataFrame(feature_rows, columns=feature_columns)

    # Both frames share the default RangeIndex, so they align row by row.
    return pd.concat([meta_df, feature_df], axis=1)

In [18]:
import torch
# os.chdir('./deepnote-gnn-reproduced-main')
# Locations of the serialized train / test / validation splits.
train_path, test_path, val_path = (
    './data/discharge/train.pt',
    './data/discharge/test.pt',
    './data/discharge/val.pt',
)

# Build one DataFrame per split, loading them in train -> test -> val order.
train_df, test_df, val_df = map(
    create_dataframe, (train_path, test_path, val_path)
)

In [19]:
# Preview the first five rows of the validation split.
val_df.head(n=5)

Unnamed: 0,pid,label,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_758,feature_759,feature_760,feature_761,feature_762,feature_763,feature_764,feature_765,feature_766,feature_767
0,176176,0,-0.175461,-0.031054,0.046837,0.136903,-0.117701,-0.026094,0.385239,0.094383,...,0.03849,-0.164991,0.030843,0.220228,-0.002753,-0.044868,-0.083309,0.076162,-0.344506,-0.012195
1,176176,0,-0.224407,0.036126,-0.086208,0.157155,-0.013275,-0.059862,0.433127,0.156739,...,0.016694,-0.313061,0.206523,0.293353,-0.120235,0.003004,-0.026881,0.008423,-0.385576,0.023437
2,161160,1,-0.156343,-0.023782,0.068393,0.115051,-0.114274,-0.026454,0.369157,0.098631,...,0.027988,-0.165158,-0.019983,0.228936,-0.010385,-0.044965,-0.089852,0.107745,-0.332285,-0.018265
3,161160,1,-0.037761,-0.072406,0.355639,0.025613,-0.312891,0.028596,0.250725,0.000943,...,0.065428,0.033266,-0.341756,0.153609,0.139355,-0.052577,-0.161805,0.319533,-0.246552,-0.104527
4,161160,1,-0.169068,-0.04291,0.104977,0.132228,-0.105184,-0.033724,0.399281,0.078976,...,0.038849,-0.128436,-0.005195,0.239489,-0.0263,-0.040773,-0.107147,0.124502,-0.341689,-0.004925


In [25]:
%%time
predictor = TabularPredictor(
    label = 'label', # response variable
    problem_type = 'binary', 
    eval_metric = 'roc_auc' # other options listed: https://auto.gluon.ai/stable/api/autogluon.tabular.TabularPredictor.html
)

predictor.fit(
    presets = 'best_quality',
    train_data = train_df,
    time_limit = 60*10,
    # excluded_model_types = ['KNN','NN_TORCH','FASTAI'] # Optional
)

No path specified. Models will be saved in: "AutogluonModels\ag-20231128_213051\"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "AutogluonModels\ag-20231128_213051\"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22621
Disk Space Avail:   246.90 GB / 509.72 GB (48.4%)
Train Data Rows:    26128
Train Data Columns: 769
Label Column: label
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5180.92 MB
	Train Data (Original)  Memory Usage: 160.74 MB (3.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify specia

CPU times: total: 40min 10s
Wall time: 8min 5s


<autogluon.tabular.predictor.predictor.TabularPredictor at 0x2915fd12f70>

In [26]:
# silent=True returns the leaderboard DataFrame without also printing it,
# so the table is rendered once (as the cell output) instead of twice.
predictor.leaderboard(silent=True)

                     model  score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L2   0.772078      17.705130  391.154369                0.000000           1.339336            2       True          6
1    NeuralNetTorch_BAG_L1   0.771330       4.700446  327.186492                4.700446         327.186492            1       True          5
2  RandomForestEntr_BAG_L1   0.765574       6.385236   53.032672                6.385236          53.032672            1       True          2
3    ExtraTreesEntr_BAG_L1   0.763830       6.619447    9.595870                6.619447           9.595870            1       True          4
4    ExtraTreesGini_BAG_L1   0.762949       6.654202    9.213769                6.654202           9.213769            1       True          3
5  RandomForestGini_BAG_L1   0.759747       6.421000   32.764759                6.421000          32.764759            1       True          1

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.772078,17.70513,391.154369,0.0,1.339336,2,True,6
1,NeuralNetTorch_BAG_L1,0.77133,4.700446,327.186492,4.700446,327.186492,1,True,5
2,RandomForestEntr_BAG_L1,0.765574,6.385236,53.032672,6.385236,53.032672,1,True,2
3,ExtraTreesEntr_BAG_L1,0.76383,6.619447,9.59587,6.619447,9.59587,1,True,4
4,ExtraTreesGini_BAG_L1,0.762949,6.654202,9.213769,6.654202,9.213769,1,True,3
5,RandomForestGini_BAG_L1,0.759747,6.421,32.764759,6.421,32.764759,1,True,1


In [24]:
# Make the level-2 weighted ensemble the default model for later predictions.
predictor.set_model_best(model='WeightedEnsemble_L2')