In [1]:
# i.4 — Multi-Label Tabular

# --- Clean & install compatible versions ---
!pip -q uninstall -y scikit-learn scikit-learn-intelex umap-learn >/dev/null
!pip -q install -U pip setuptools wheel >/dev/null
!pip -q install -U "scikit-learn==1.6.1" "autogluon==1.4.0" >/dev/null

# --- Imports ---
import numpy as np, pandas as pd, random
from sklearn.model_selection import train_test_split
from autogluon.tabular import TabularPredictor

# --- Minimal MultilabelPredictor (based on AutoGluon tutorial) ---
class MultilabelPredictor:
    """Maintain one TabularPredictor per label; optionally condition on earlier labels.

    With ``consider_labels_correlation=True`` the heads form a chain: the
    predictor for ``labels[i]`` is trained with ``labels[:i]`` available as
    input columns, and at inference time those columns are filled with the
    chain's own predictions. With ``False`` every head is trained with all
    other label columns dropped.

    Parameters
    ----------
    labels : iterable of str
        Names of the label columns, in chain order. At least two are required.
    consider_labels_correlation : bool, default True
        Whether later heads may use earlier labels as features.
    **tp_kwargs
        Forwarded to every ``TabularPredictor`` constructor. If ``path`` is
        given it is treated as a base directory and each head receives its own
        ``<path>/Predictor_<label>`` sub-directory, so the heads never share a
        model directory.

    Raises
    ------
    ValueError
        If ``labels`` is a single string or contains fewer than two entries.
    """

    def __init__(self, labels, consider_labels_correlation=True, **tp_kwargs):
        import os  # local import: only needed to build per-label model directories

        if isinstance(labels, str):
            # list("abc") would silently turn one column name into per-character labels.
            raise ValueError("labels must be a collection of column names, not a single string.")
        labels = list(labels)  # materialize first so generators/iterators are accepted too
        if len(labels) < 2:
            raise ValueError("Use TabularPredictor for a single label; this class is for MULTI-label.")
        self.labels = labels
        self.consider_labels_correlation = consider_labels_correlation

        # A shared `path` would make every head write into the same directory.
        base_path = tp_kwargs.pop("path", None)
        self.predictors = {}
        for lab in self.labels:
            head_kwargs = dict(tp_kwargs)
            if base_path is not None:
                head_kwargs["path"] = os.path.join(os.fspath(base_path), f"Predictor_{lab}")
            self.predictors[lab] = TabularPredictor(label=lab, **head_kwargs)

    def fit(self, train_data, tuning_data=None, **fit_kwargs):
        """Fit every per-label predictor in chain order and return ``self``.

        ``fit_kwargs`` are passed unchanged to each ``TabularPredictor.fit``
        call, so e.g. a ``time_limit`` applies to each head separately.
        """
        for i, lab in enumerate(self.labels):
            if self.consider_labels_correlation:
                # autoregressive: hide only the labels that come AFTER the current one
                drop_labs = self.labels[i + 1:]
            else:
                # independent heads: hide every other label
                drop_labs = [other for other in self.labels if other != lab]
            # DataFrame.drop returns a new frame, so the caller's data is never modified.
            td = train_data.drop(columns=drop_labs, errors="ignore")
            vd = None if tuning_data is None else tuning_data.drop(columns=drop_labs, errors="ignore")
            print(f"Fitting predictor for label: {lab}")
            self.predictors[lab].fit(train_data=td, tuning_data=vd, **fit_kwargs)
        return self

    def predict(self, data, **kwargs):
        """Return a DataFrame with one prediction column per label.

        ``data`` is copied, never modified. ``kwargs`` are forwarded to each
        ``TabularPredictor.predict`` call.
        """
        df = data.copy()
        out = pd.DataFrame(index=df.index)
        for lab in self.labels:
            out[lab] = self.predictors[lab].predict(df, **kwargs)
            if self.consider_labels_correlation:
                # feed this prediction forward as a feature for the later heads
                df[lab] = out[lab]
        return out

    def evaluate(self, data, **kwargs):
        """Evaluate every head on ``data`` and return ``{label: scores}``.

        When labels are chained, each head is scored with the earlier label
        columns replaced by the chain's predictions (not the ground truth).
        ``kwargs`` are forwarded to ``TabularPredictor.evaluate`` and, in
        chained mode, also to ``TabularPredictor.predict``.
        """
        df = data.copy()
        scores = {}
        final_label = self.labels[-1]
        for lab in self.labels:
            print(f"Evaluating {lab} ...")
            head = self.predictors[lab]
            scores[lab] = head.evaluate(df, **kwargs)
            # Score against the true column first, then overwrite it once with the
            # prediction for the later heads; the last label has no later head.
            if self.consider_labels_correlation and lab != final_label:
                df[lab] = head.predict(df, **kwargs)
        return scores

# --- Tiny synthetic multi-label dataset (numeric + categorical + text) ---
# Seed both generators up front so every run rebuilds the same table.
rng = np.random.default_rng(42)
random.seed(42)

N = 1200
breeds = ["mix","labrador","siamese","bulldog","persian"]
colors = ["black","white","brown","gold","tabby","tri"]
pos_tokens = ["friendly","playful","gentle","trained","housebroken","social"]
neg_tokens = ["timid","special_needs","anxious","senior","requires_experience","shy"]

# The order of the draws below is part of the recipe: all columns share `rng`,
# so reordering any of them changes every value drawn afterwards.
age = rng.integers(1, 180, size=N)
fee = np.round(rng.uniform(0, 300, size=N), 2)
breed = rng.choice(breeds, size=N)
color = rng.choice(colors, size=N)
photo_amt = rng.integers(0, 6, size=N)


def make_desc():
    """Build one description: 1-2 distinct positive tokens, plus a negative one 40% of the time."""
    k_pos = rng.integers(1, 3)
    words = rng.choice(pos_tokens, size=k_pos, replace=False).tolist()
    add_negative = rng.random() < 0.4
    if add_negative:
        words.append(rng.choice(neg_tokens))
    return " ".join(words)


desc = []
for _ in range(N):
    desc.append(make_desc())


def _mentions(token):
    """Boolean array: True where `token` occurs as a substring of the row's description."""
    return np.array([token in text for text in desc], dtype=bool)


# Latent score shared by two labels; the terms are summed left to right.
_has_many_photos = (photo_amt >= 3).astype(float)
_is_trained = _mentions("trained").astype(float)
_is_demanding = _mentions("requires_experience").astype(float)
score_base = -0.02*age + 0.004*fee + 0.15*_has_many_photos + 0.25*_is_trained - 0.20*_is_demanding

_playful_score = score_base + 0.10*_mentions("playful").astype(float)
is_playful = (_playful_score > 0.15).astype(int)
is_senior = ((age > 96) | _mentions("senior")).astype(int)
needs_experience = ((score_base < -0.15) | _mentions("requires_experience")).astype(int)

df = pd.DataFrame({
    "Age": age,
    "Fee": fee,
    "Breed": breed,
    "Color": color,
    "Description": desc,
    "PhotoAmt": photo_amt,
    "is_playful": is_playful,
    "is_senior": is_senior,
    "needs_experience": needs_experience,
})

labels = ["is_playful","is_senior","needs_experience"]

# Stratify on the joint label combination (e.g. "1|0|0") rather than on a single
# label, so the split is stratified on whole label combinations.
_label_combo = df[labels].astype(str).agg('|'.join, axis=1)
_split_options = dict(test_size=0.2, random_state=123, stratify=_label_combo)
train_df, test_df = train_test_split(df, **_split_options)

# --- Train multilabel (tiny budget; fast on CPU) ---
_predictor_options = dict(
    labels=labels,
    consider_labels_correlation=True,   # set False to train independent heads
    problem_type="binary",              # each head is binary
    eval_metric="f1",                   # per-label F1
)
_fit_options = dict(
    train_data=train_df,
    presets="medium_quality_faster_train",
    time_limit=120,                     # passed through to each head's fit call
)
ml = MultilabelPredictor(**_predictor_options)
ml.fit(**_fit_options)                  # fit() returns the same object it was called on

# --- Evaluate & save ---
scores = ml.evaluate(test_df)
print("\nPer-label scores:", scores)

# One prediction column per label; the row index is not written to the CSV.
pred = ml.predict(test_df)
_output_csv = "multilabel_predictions.csv"
pred.to_csv(_output_csv, index=False)
print("\nSaved: multilabel_predictions.csv")


[0m

No path specified. Models will be saved in: "AutogluonModels/ag-20251027_062612"
No path specified. Models will be saved in: "AutogluonModels/ag-20251027_062612-001"
No path specified. Models will be saved in: "AutogluonModels/ag-20251027_062612-002"
Preset alias specified: 'medium_quality_faster_train' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          2
Memory Avail:       11.43 GB / 12.67 GB (90.2%)
Disk Space Avail:   61.50 GB / 107.72 GB (57.1%)
Presets specified: ['medium_quality_faster_train']
Using hyperparameters preset: hyperparameters='default'


Fitting predictor for label: is_playful


Beginning AutoGluon training ... Time limit = 120s
AutoGluon will save models to "/content/AutogluonModels/ag-20251027_062612"
Train Data Rows:    960
Train Data Columns: 6
Label Column:       is_playful
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11706.16 MB
	Train Data (Original)  Memory Usage: 0.18 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage

Fitting predictor for label: is_senior


Beginning AutoGluon training ... Time limit = 120s
AutoGluon will save models to "/content/AutogluonModels/ag-20251027_062612-001"
Train Data Rows:    960
Train Data Columns: 7
Label Column:       is_senior
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11293.01 MB
	Train Data (Original)  Memory Usage: 0.19 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 1 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...

Fitting predictor for label: needs_experience


Beginning AutoGluon training ... Time limit = 120s
AutoGluon will save models to "/content/AutogluonModels/ag-20251027_062612-002"
Train Data Rows:    960
Train Data Columns: 8
Label Column:       needs_experience
Problem Type:       binary
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11275.06 MB
	Train Data (Original)  Memory Usage: 0.20 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 2 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...

Evaluating is_playful ...
Evaluating is_senior ...
Evaluating needs_experience ...

Per-label scores: {'is_playful': {'f1': 0.9285714285714286, 'accuracy': 0.975, 'balanced_accuracy': np.float64(0.940630797773655), 'mcc': np.float64(0.9149855276286857), 'roc_auc': np.float64(0.9980287569573283), 'precision': 0.975, 'recall': 0.8863636363636364}, 'is_senior': {'f1': 0.9789029535864979, 'accuracy': 0.9791666666666666, 'balanced_accuracy': np.float64(0.9793388429752066), 'mcc': np.float64(0.9591774224021128), 'roc_auc': np.float64(0.9947913049517328), 'precision': 1.0, 'recall': 0.9586776859504132}, 'needs_experience': {'f1': 0.9583333333333334, 'accuracy': 0.9333333333333333, 'balanced_accuracy': np.float64(0.8693371001917063), 'mcc': np.float64(0.7989954900119854), 'roc_auc': np.float64(0.9850670971647665), 'precision': 0.934010152284264, 'recall': 0.983957219251337}}

Saved: multilabel_predictions.csv
