In [None]:
# Install tools
!pip install -q kaggle autogluon.tabular scikit-learn

# Upload your Kaggle key (kaggle.json) from your computer
from google.colab import files
uploaded = files.upload()  # pick kaggle.json

# Put the key where Kaggle CLI expects it
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Quick check
!kaggle --version


Saving kaggle.json to kaggle (2).json
Kaggle API 1.7.4.5


In [None]:
import os, pandas as pd

COMP = "california-homelessness-prediction-challenge"
DATA_DIR = "/content/data"
DATASET_DIR = os.path.join(DATA_DIR, COMP)
MODELS_DIR = os.path.join(DATA_DIR, "AutoGluonModels")

# Make folders and pull files
!mkdir -p "{DATA_DIR}" "{DATASET_DIR}" "{MODELS_DIR}"

# (A) See what files exist in the competition (helps sanity-check)
!kaggle competitions files -c "{COMP}"

# (B) Download + unzip
!kaggle competitions download -c "{COMP}" -p "{DATA_DIR}" --force
!unzip -o -q "{DATA_DIR}/{COMP}.zip" -d "{DATASET_DIR}"
!rm -f "{DATA_DIR}/{COMP}.zip"

# Peek at the files we got
print("Files:", os.listdir(DATASET_DIR))

# Load sample_submission to detect the target column(s)
sub_path = os.path.join(DATASET_DIR, "sample_submission.csv")
sub = pd.read_csv(sub_path)
print("sample_submission columns:", list(sub.columns))

# Heuristics to auto-detect id + target(s)
# - If there are exactly 2 columns, it's usually [id_col, target_col]
# - If more, we assume the first column is an ID-like column
id_col = sub.columns[0]
target_cols = [c for c in sub.columns if c != id_col]
print("ID column guess:", id_col)
print("Target column(s) guess:", target_cols)

name                          size  creationDate                
----------------------  ----------  --------------------------  
sample_submission.csv          634  2025-08-02 01:31:53.475000  
sandbox_submission.csv         634  2025-08-02 01:31:53.475000  
test.csv                     33559  2025-08-02 01:31:53.475000  
train.csv                    79563  2025-08-02 01:31:53.475000  
Downloading california-homelessness-prediction-challenge.zip to /content/data
  0% 0.00/50.7k [00:00<?, ?B/s]
100% 50.7k/50.7k [00:00<00:00, 158MB/s]
Files: ['sample_submission.csv', 'test.csv', 'sandbox_submission.csv', 'train.csv']
sample_submission columns: ['ID', 'HOMELESS_RATE']
ID column guess: ID
Target column(s) guess: ['HOMELESS_RATE']


In [None]:
import random
import time

import numpy as np
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split

# Train a quick AutoGluon baseline, score it on a held-out dev split, and write
# a submission CSV. Relies on names defined by the previous cell:
# os, pd, DATASET_DIR, MODELS_DIR, sub, id_col, target_cols.

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

train_path = os.path.join(DATASET_DIR, "train.csv")
test_path = os.path.join(DATASET_DIR, "test.csv")

train_df = pd.read_csv(train_path, low_memory=False)
test_df = pd.read_csv(test_path, low_memory=False)

print("train shape:", train_df.shape)
print("test shape :", test_df.shape)

# Features = columns shared by train & test (train order), excluding ID and target columns
non_feature_cols = {id_col, *target_cols}
shared_cols = [
    c for c in train_df.columns
    if c in test_df.columns and c not in non_feature_cols
]

print("Feature columns (first 10):", shared_cols[:10])

# We'll handle only the FIRST target for a simple baseline
TARGET = target_cols[0]
print("Using target:", TARGET)
if len(target_cols) > 1:
    print(
        "WARNING: only", TARGET, "is predicted; these columns keep their sample_submission values:",
        target_cols[1:],
    )
if TARGET not in train_df.columns:
    raise KeyError(f"Target column {TARGET!r} not found in train.csv columns")

# Drop rows with missing target
train_df = train_df.dropna(subset=[TARGET]).reset_index(drop=True)

# Simple split: dev_part is a hold-out that the predictor never sees during fit.
train_part, dev_part = train_test_split(train_df, test_size=0.2, random_state=SEED)

# Train a quick baseline (good_quality for speed).
# dev_part is deliberately NOT passed as tuning_data: AutoGluon uses tuning data
# for early stopping / model selection, so scoring on it afterwards would give
# optimistically biased dev metrics. AutoGluon makes its own internal validation
# split from train_data instead.
save_dir = os.path.join(MODELS_DIR, f"baseline_{time.strftime('%Y%m%d_%H%M%S')}")
predictor = TabularPredictor(label=TARGET, path=save_dir, eval_metric="rmse", verbosity=2)
predictor.fit(
    train_data=train_part[[*shared_cols, TARGET]],
    presets="good_quality",
    time_limit=300,              # ~5 minutes cap
    num_bag_folds=0,
    num_stack_levels=0,
    keep_only_best=True,
)

# Evaluate on the untouched dev set.
# Note: AutoGluon reports error metrics sign-flipped (higher is better), so RMSE/MSE/MAE
# print as negative numbers; take the absolute value to read them as errors.
dev_metrics = predictor.evaluate(dev_part[[*shared_cols, TARGET]])
print("Dev metrics:", dev_metrics)

# Build submission using sample_submission's exact columns/order
submission = sub.copy()
preds = predictor.predict(test_df[shared_cols])

# Align predictions to sample_submission by ID rather than by row position, so a
# different row order between test.csv and sample_submission.csv cannot scramble
# the predictions. IDs are compared as strings to tolerate dtype differences.
# Predictions are kept as floats.
test_ids = test_df[id_col].astype(str) if id_col in test_df.columns else None
if test_ids is not None and test_ids.is_unique:
    sub_ids = submission[id_col].astype(str)
    unknown_ids = sub_ids[~sub_ids.isin(test_ids)]
    if not unknown_ids.empty:
        raise ValueError(
            f"{len(unknown_ids)} ID(s) in sample_submission are missing from test.csv, "
            f"e.g. {unknown_ids.head(5).tolist()}"
        )
    preds_by_id = pd.Series(preds.to_numpy(), index=test_ids.to_numpy())
    submission[TARGET] = sub_ids.map(preds_by_id).to_numpy()
else:
    # No usable ID column in test.csv: fall back to positional assignment, but
    # at least make sure the row counts agree.
    if len(preds) != len(submission):
        raise ValueError(
            f"Cannot align predictions: {len(preds)} test rows vs {len(submission)} submission rows"
        )
    print("WARNING: no unique ID column in test.csv; assigning predictions by row position.")
    submission[TARGET] = preds.to_numpy()

out_csv = os.path.join(save_dir, "submission.csv")
submission.to_csv(out_csv, index=False)
print("Saved submission to:", out_csv)

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct  2 10:42:05 UTC 2025
CPU Count:          2
Memory Avail:       11.26 GB / 12.67 GB (88.8%)
Disk Space Avail:   186.08 GB / 225.83 GB (82.4%)
Presets specified: ['good_quality']
Using hyperparameters preset: hyperparameters='light'
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=0, num_bag_sets=1


train shape: (130, 33)
test shape : (56, 32)
Feature columns (first 10): ['AGE_U18_PCT', 'AGE_18_24_PCT', 'AGE_25_34_PCT', 'AGE_35_44_PCT', 'AGE_45_54_PCT', 'AGE_55_59_PCT', 'AGE_60_61_PCT', 'AGE_62_64_PCT', 'AGE_65_69_PCT', 'AGE_70_79_PCT']
Using target: HOMELESS_RATE


Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "/content/data/AutoGluonModels/baseline_20251025_184346"
Train Data Rows:    104
Train Data Columns: 31
Tuning Data Rows:    26
Tuning Data Columns: 31
Label Column:       HOMELESS_RATE
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (0.0587983619583155, 0.0, 0.00408, 0.0073)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    11476.11 MB
	Train Data (Original)  Memory Usage: 0.03 MB (0.0% of available memory)
	Inferring data type of eac

Dev metrics: {'root_mean_squared_error': np.float64(-0.0017371703069817617), 'mean_squared_error': -3.0177606754591083e-06, 'mean_absolute_error': -0.0012172881570828155, 'r2': 0.6174983158683554, 'pearsonr': 0.7907787274275869, 'median_absolute_error': np.float64(-0.0008615442493178672)}
Saved submission to: /content/data/AutoGluonModels/baseline_20251025_184346/submission.csv


In [None]:
# Submit the CSV produced by the training cell to the competition via the Kaggle CLI.
# NOTE: every execution of this cell (including "Run All") issues a new submission
# with the message "baseline auto".
# NOTE(review): Kaggle competitions usually cap submissions per day - confirm the limit
# for this competition before re-running.
SUBMIT_FILE = out_csv  # path assigned by the training cell (printed as "Saved submission to: ...")
!kaggle competitions submit -c "{COMP}" -f "{SUBMIT_FILE}" -m "baseline auto"
# Show the first 20 lines of the submissions listing to check the new entry's status.
!kaggle competitions submissions -c "{COMP}" | head -n 20

100% 1.05k/1.05k [00:00<00:00, 3.03kB/s]
Successfully submitted to California Homelessness Prediction ChallengefileName        date                        description    status                    publicScore  privateScore  
--------------  --------------------------  -------------  ------------------------  -----------  ------------  
submission.csv  2025-10-25 18:44:47.563000  baseline auto  SubmissionStatus.PENDING                             
