<a href="https://colab.research.google.com/github/ArshanBhanage/Autogluon/blob/main/Tabular_and_Multimodel_Atuogluon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install autogluon

import numpy as np
import pandas as pd
import random
from datetime import datetime

from sklearn.datasets import make_regression
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

np.random.seed(1)
random.seed(1)

X, y = make_regression(
    n_samples=100,
    n_features=5,
    n_targets=1,
    random_state=1
)

dfx = pd.DataFrame(X, columns=['A', 'B', 'C', 'D', 'E'])
dfy = pd.DataFrame(y, columns=['label'])

dfx['B'] = dfx['B'].astype(int)

base_date = datetime(2000, 1, 1)
dfx['C'] = base_date + pd.to_timedelta(dfx['C'].astype(int), unit='D')

dfx['D'] = pd.cut(
    dfx['D'] * 10,
    bins=[-np.inf, -5, 0, 5, np.inf],
    labels=['v', 'w', 'x', 'y']
).astype('category')

choices = ["abc", "d", "ef", "ghi", "jkl"]
dfx['E'] = [
    ' '.join(random.choice(choices) for _ in range(4))
    for _ in range(len(dfx))
]

dataset = TabularDataset(dfx)
print(dfx)

auto_ml_pipeline_feature_generator = AutoMLPipelineFeatureGenerator()
featured_data = auto_ml_pipeline_feature_generator.fit_transform(X=dfx)
print("\n[Feature-Generated Head]")
print(featured_data.head())

df = pd.concat([dfx, dfy], axis=1)

predictor = TabularPredictor(label='label', problem_type='regression')
predictor.fit(
    train_data=df,
    hyperparameters={'GBM': {}},
    verbosity=2
)

missing_data = dfx.copy()
missing_data.iloc[0, 0] = np.nan
missing_data.iloc[1, 1] = np.nan
missing_data.iloc[2, 2] = np.nan

print("\n[Missing Data Head]")
print(missing_data.head())

auto_ml_pipeline_feature_generator = AutoMLPipelineFeatureGenerator()
featured_missing_data = auto_ml_pipeline_feature_generator.fit_transform(X=missing_data)
print("\n[Feature-Generated with Missing Values Head]")
print(featured_missing_data.head())


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.5/259.5 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.1/225.1 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.9/454.9 kB[0m [31m24.7 MB/s[0m eta

No path specified. Models will be saved in: "AutogluonModels/ag-20251101_235522"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          2
Memory Avail:       11.36 GB / 12.67 GB (89.7%)
Disk Space Avail:   62.17 GB / 107.72 GB (57.7%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.
	presets='best'    : Maximize accuracy. Recommended for most u

           A  B          C  D                E
0  -0.545774  0 2000-01-01  y     d jkl abc ef
1  -0.468674  0 2000-01-02  x  abc ghi ghi ghi
2   1.767960  0 1999-12-31  v    ghi d abc ghi
3  -0.118771  1 2000-01-01  y  abc ghi ghi jkl
4   0.630196  0 1999-12-31  w     abc ghi ef d
..       ... ..        ... ..              ...
95 -1.182318 -1 2000-01-01  v    ef abc ef abc
96  0.562761  0 2000-01-01  v   ghi abc jkl ef
97 -0.797270  0 2000-01-01  w     d ghi ef abc
98  0.502741  0 1999-12-31  y      ef d ef jkl
99  2.056356  0 1999-12-30  w      ef d ef abc

[100 rows x 5 columns]

[Feature-Generated Head]
          A  B  D    E                   C  C.year  C.month  C.day  \
0 -0.545774  0  3    2  946684800000000000    2000        1      1   
1 -0.468674  0  2  NaN  946771200000000000    2000        1      2   
2  1.767960  0  0    6  946598400000000000    1999       12     31   
3 -0.118771  1  3  NaN  946684800000000000    2000        1      1   
4  0.630196  0  1  NaN  946598400000

Beginning AutoGluon training ...
AutoGluon will save models to "/content/AutogluonModels/ag-20251101_235522"
Train Data Rows:    100
Train Data Columns: 5
Label Column:       label
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11633.49 MB
	Train Data (Original)  Memory Usage: 0.01 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
		Fitting DatetimeFeatureGenerator...
		Fitting TextSpecialFeatureGenerator...
			Fitting BinnedFeatureGenerator...
			Fitting DropDuplicatesFeatur


[Missing Data Head]
          A    B          C  D                E
0       NaN  0.0 2000-01-01  y     d jkl abc ef
1 -0.468674  NaN 2000-01-02  x  abc ghi ghi ghi
2  1.767960  0.0        NaT  v    ghi d abc ghi
3 -0.118771  1.0 2000-01-01  y  abc ghi ghi jkl
4  0.630196  0.0 1999-12-31  w     abc ghi ef d

[Feature-Generated with Missing Values Head]
          A    B  D    E                   C  C.year  C.month  C.day  \
0       NaN  0.0  3    2  946684800000000000    2000        1      1   
1 -0.468674  NaN  2  NaN  946771200000000000    2000        1      2   
2  1.767960  0.0  0    6  946688290909090944    2000        1      1   
3 -0.118771  1.0  3  NaN  946684800000000000    2000        1      1   
4  0.630196  0.0  1  NaN  946598400000000000    1999       12     31   

   C.dayofweek  E.char_count  E.symbol_ratio.   __nlp__.abc  __nlp__.ef  \
0            5             4                 3            1           1   
1            6             7                 0            1   

In [2]:
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import pandas as pd

cal = fetch_california_housing(as_frame=True)
df = cal.frame.copy()
df.columns = [c.replace(' ', '_') for c in df.columns]

label = 'MedHouseVal'

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_data = TabularDataset(train_df)
test_data  = TabularDataset(test_df)

display(train_data.head())
display(train_data[label].describe())

predictor = TabularPredictor(label=label, problem_type='regression').fit(train_data)

y_pred = predictor.predict(test_data.drop(columns=[label]))
display(y_pred.head())

predictor.evaluate(test_data, silent=True)
predictor.leaderboard(test_data)




Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
14196,3.2596,33.0,5.017657,1.006421,2300.0,3.691814,32.71,-117.03,1.03
8267,3.8125,49.0,4.473545,1.041005,1314.0,1.738095,33.77,-118.16,3.821
17445,4.1563,4.0,5.645833,0.985119,915.0,2.723214,34.66,-120.48,1.726
14265,1.9425,36.0,4.002817,1.033803,1418.0,3.994366,32.69,-117.11,0.934
2271,3.5542,43.0,6.268421,1.134211,874.0,2.3,36.78,-119.8,0.965


Unnamed: 0,MedHouseVal
count,16512.0
mean,2.071947
std,1.156226
min,0.14999
25%,1.198
50%,1.7985
75%,2.65125
max,5.00001


No path specified. Models will be saved in: "AutogluonModels/ag-20251101_235535"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          2
Memory Avail:       11.25 GB / 12.67 GB (88.7%)
Disk Space Avail:   62.15 GB / 107.72 GB (57.7%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.
	presets='best'    : Maximize accuracy. Recommended for most u

[1000]	valid_set's rmse: 0.490271
[2000]	valid_set's rmse: 0.480824
[3000]	valid_set's rmse: 0.479103
[4000]	valid_set's rmse: 0.47806
[5000]	valid_set's rmse: 0.477555
[6000]	valid_set's rmse: 0.478424


	-0.4775	 = Validation score   (-root_mean_squared_error)
	18.75s	 = Training   runtime
	1.14s	 = Validation runtime
Fitting model: LightGBM ...
	Fitting with cpus=1, gpus=0, mem=0.0/10.9 GB


[1000]	valid_set's rmse: 0.45723
[2000]	valid_set's rmse: 0.455822
[3000]	valid_set's rmse: 0.454135


	-0.454	 = Validation score   (-root_mean_squared_error)
	5.62s	 = Training   runtime
	0.55s	 = Validation runtime
Fitting model: RandomForestMSE ...
	Fitting with cpus=2, gpus=0
	-0.5303	 = Validation score   (-root_mean_squared_error)
	43.37s	 = Training   runtime
	0.38s	 = Validation runtime
Fitting model: CatBoost ...
	Fitting with cpus=1, gpus=0
	-0.4356	 = Validation score   (-root_mean_squared_error)
	132.99s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	Fitting with cpus=2, gpus=0
	-0.5307	 = Validation score   (-root_mean_squared_error)
	11.17s	 = Training   runtime
	0.22s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	Fitting with cpus=1, gpus=0, mem=0.0/10.7 GB
	-0.5457	 = Validation score   (-root_mean_squared_error)
	19.24s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: XGBoost ...
	Fitting with cpus=1, gpus=0
	-0.4589	 = Validation score   (-root_mean_squared_error)
	16.94s	 = Training   runtime
	1.01s	 

[1000]	valid_set's rmse: 0.453908
[2000]	valid_set's rmse: 0.452343
[3000]	valid_set's rmse: 0.452218
[4000]	valid_set's rmse: 0.452185


	-0.4522	 = Validation score   (-root_mean_squared_error)
	19.06s	 = Training   runtime
	0.99s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'CatBoost': 0.76, 'LightGBMLarge': 0.2, 'NeuralNetFastAI': 0.04}
	-0.4337	 = Validation score   (-root_mean_squared_error)
	0.01s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 427.68s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1599.8 rows/s (1652 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/content/AutogluonModels/ag-20251101_235535")


Unnamed: 0,MedHouseVal
20046,0.466455
3024,0.67895
15663,5.02021
20484,2.496017
9814,2.596785


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-0.427856,-0.433653,root_mean_squared_error,3.383083,1.032608,171.305957,0.012479,0.000557,0.014439,2,True,10
1,CatBoost,-0.431158,-0.43555,root_mean_squared_error,0.088172,0.017853,132.991499,0.088172,0.017853,132.991499,1,True,4
2,LightGBM,-0.438109,-0.454005,root_mean_squared_error,1.527592,0.548435,5.617006,1.527592,0.548435,5.617006,1,True,2
3,LightGBMLarge,-0.440919,-0.45215,root_mean_squared_error,3.171874,0.987152,19.058694,3.171874,0.987152,19.058694,1,True,9
4,XGBoost,-0.452747,-0.458902,root_mean_squared_error,3.143453,1.008186,16.942061,3.143453,1.008186,16.942061,1,True,7
5,LightGBMXT,-0.463101,-0.477455,root_mean_squared_error,3.397086,1.141433,18.752212,3.397086,1.141433,18.752212,1,True,1
6,RandomForestMSE,-0.506582,-0.530285,root_mean_squared_error,3.076966,0.377963,43.365276,3.076966,0.377963,43.365276,1,True,3
7,ExtraTreesMSE,-0.514481,-0.530657,root_mean_squared_error,1.014813,0.220269,11.173491,1.014813,0.220269,11.173491,1,True,5
8,NeuralNetTorch,-0.519059,-0.521245,root_mean_squared_error,0.054636,0.030756,150.762564,0.054636,0.030756,150.762564,1,True,8
9,NeuralNetFastAI,-0.545313,-0.545695,root_mean_squared_error,0.110559,0.027046,19.241324,0.110559,0.027046,19.241324,1,True,6


In [3]:
import os
import random
import numpy as np
import pandas as pd
from pathlib import Path
from PIL import Image

import torch
from torchvision import datasets

from autogluon.core.utils.loaders import load_zip
from autogluon.tabular import TabularPredictor, FeatureMetadata
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config

download_dir = './ag_mnist_multimodal'
img_dir_train = os.path.join(download_dir, 'train_images')
img_dir_dev   = os.path.join(download_dir, 'dev_images')
os.makedirs(img_dir_train, exist_ok=True)
os.makedirs(img_dir_dev, exist_ok=True)

train_raw = datasets.MNIST(root=download_dir, train=True, download=True)
dev_raw   = datasets.MNIST(root=download_dir, train=False, download=True)

def save_split_to_csv(split, img_out_dir, csv_path):
    paths, labels, descs, pix_means = [], [], [], []
    for i in range(len(split)):
        img, label = split[i]

        out_path = os.path.abspath(os.path.join(img_out_dir, f"img_{i}.png"))
        if not os.path.exists(out_path):
            img.save(out_path)

        arr = np.array(img, dtype=np.float32)
        pmean = float(arr.mean())

        shades = "dark" if pmean < 60 else "mid" if pmean < 120 else "light"
        desc = f"digit {label} with {shades} strokes"

        paths.append(out_path)
        labels.append(int(label))
        descs.append(desc)
        pix_means.append(pmean)

    df = pd.DataFrame({
        'Images': paths,
        'Description': descs,
        'PixelMean': pix_means,
        'AdoptionSpeed': labels,
    })
    df.to_csv(csv_path, index=True)
    return df

train_csv_path = os.path.join(download_dir, 'train.csv')
dev_csv_path   = os.path.join(download_dir, 'dev.csv')

train_data = save_split_to_csv(train_raw, img_dir_train, train_csv_path)
test_data  = save_split_to_csv(dev_raw,   img_dir_dev,   dev_csv_path)

dataset_path = download_dir
label = 'AdoptionSpeed'
image_col = 'Images'

display(train_data.head(3))

def path_expander(path, base_folder):
    path_l = path.split(';')
    return ';'.join([os.path.abspath(os.path.join(base_folder, p)) for p in path_l])

train_data[image_col] = train_data[image_col].apply(lambda p: path_expander(p, base_folder=''))
test_data[image_col]  = test_data[image_col].apply(lambda p: path_expander(p, base_folder=''))

train_data = train_data.sample(500, random_state=0)

feature_metadata = FeatureMetadata.from_df(train_data)
feature_metadata = feature_metadata.add_special_types({image_col: ['image_path']})
print(feature_metadata)

hyperparameters = get_hyperparameter_config('multimodal')
hyperparameters

predictor = TabularPredictor(label=label).fit(
    train_data=train_data,
    hyperparameters=hyperparameters,
    feature_metadata=feature_metadata,
    time_limit=900,
)

leaderboard = predictor.leaderboard(test_data)
leaderboard


100%|██████████| 9.91M/9.91M [00:00<00:00, 18.1MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 481kB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 4.45MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 9.75MB/s]


Unnamed: 0,Images,Description,PixelMean,AdoptionSpeed
0,/content/ag_mnist_multimodal/train_images/img_0.png,digit 5 with dark strokes,35.108418,5
1,/content/ag_mnist_multimodal/train_images/img_1.png,digit 0 with dark strokes,39.661991,0
2,/content/ag_mnist_multimodal/train_images/img_2.png,digit 4 with dark strokes,24.799746,4


No path specified. Models will be saved in: "AutogluonModels/ag-20251102_000352"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          2
Memory Avail:       10.48 GB / 12.67 GB (82.7%)
Disk Space Avail:   60.97 GB / 107.72 GB (56.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='extreme' : New in v1.4: Massively better than 'best' on datasets <30000 samples by using new models meta-learned on https://tabarena.ai: TabPFNv2, TabICL, Mitra, and TabM. Absolute best accuracy. Requires a GPU. Recommended 64 GB CPU memory and 32+ GB GPU memory.
	presets='best'    : Maximize accuracy. Recommended for most u

('float', [])              : 1 | ['PixelMean']
('int', [])                : 1 | ['AdoptionSpeed']
('object', ['image_path']) : 1 | ['Images']
('object', ['text'])       : 1 | ['Description']


Beginning AutoGluon training ... Time limit = 900s
AutoGluon will save models to "/content/AutogluonModels/ag-20251102_000352"
Train Data Rows:    500
Train Data Columns: 3
Label Column:       AdoptionSpeed
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == int, but few unique label-values observed).
	10 unique label values:  [np.int64(3), np.int64(6), np.int64(0), np.int64(2), np.int64(5), np.int64(8), np.int64(1), np.int64(9), np.int64(7), np.int64(4)]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       multiclass
Preprocessing data ...
Train Data Class Count: 10
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    10733.91 MB
	Train Data (Original)  Memory Usage: 0.09 MB (0.0% of avai

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.9904,0.98,accuracy,0.018466,0.00258,3.328425,0.018466,0.00258,3.328425,1,True,3
1,XGBoost,0.9892,0.98,accuracy,0.110466,0.005995,0.311056,0.110466,0.005995,0.311056,1,True,4
2,LightGBM,0.989,0.99,accuracy,0.360943,0.006402,0.769098,0.360943,0.006402,0.769098,1,True,1
3,WeightedEnsemble_L2,0.989,0.99,accuracy,0.367296,0.007701,0.829526,0.006353,0.001298,0.060428,2,True,7
4,LightGBMLarge,0.989,0.99,accuracy,0.68445,0.012477,2.418086,0.68445,0.012477,2.418086,1,True,6
5,LightGBMXT,0.9888,0.99,accuracy,1.868915,0.016653,0.664926,1.868915,0.016653,0.664926,1,True,2
6,NeuralNetTorch,0.2228,0.25,accuracy,0.073437,0.006744,1.618583,0.073437,0.006744,1.618583,1,True,5
