In [2]:
import cnvrgv2
import pandas as pd
import numpy as np
import os

from data_science_tools.core.utils.io import within_directory
from data_science_tools.datasets import AttrGeomSet

# Data

In [3]:
# Fetch embeddings - they were produced by a Cnvrg experiment, so we pull
# that experiment's artifacts instead of recomputing them here.
cnvrg = cnvrgv2.Cnvrg()
my_proj = cnvrg.projects.get("dinov2_living_area")
experiment = my_proj.experiments.get("lgtt3jv7hbdv8oz1gpe4")

# NOTE(review): within_directory presumably switches the working directory to
# ./data/ for the duration of the block, so the relative path checked below
# would be ./data/output/inference - confirm against the helper.
with within_directory("./data/"):
    artifacts_already_pulled = os.path.exists("./output/inference")
    if not artifacts_already_pulled:
        # Only download when nothing has been pulled yet.
        experiment.pull_artifacts(wait_until_success=True, poll_interval=5)

In [4]:
# Load the dataframe holding the embeddings for all living area datasets.
embeddings_h5_path = "./data/output/inference/output_tensors.h5"
inference_df = pd.read_hdf(embeddings_h5_path)

In [5]:
# Non-embedding columns we keep: the join keys plus the split label.
metadata_columns = ["geometry", "imagery_source", "imagery_date", "split"]

# Embedding columns are recognised by the substring "emb" in their name.
embedding_columns = [name for name in inference_df.columns if "emb" in name]

# Metadata first, embeddings after - the same ordering as before.
columns_to_use = metadata_columns + embedding_columns

df = inference_df[columns_to_use]

In [6]:
# Save to GCP (one-off export) - uncomment and run only once
# df.to_parquet("gs://cape-ml-projects-data/pj_living_area_dev_v5/dinov2_living_area_embeddings.parquet")

In [7]:
# Slice the embeddings table by the value of its "split" column.
is_ta_mls_train = df["split"] == "ta_mls_train"
is_ta_mls_test = df["split"] == "ta_mls_test"
is_cubicasa_test = df["split"] == "cubicasa_test"

df_train = df[is_ta_mls_train]
df_test = df[is_ta_mls_test]

df_eval = df[is_cubicasa_test]

In [8]:
# To get the targets, we need to fetch the AttrGeomSets
ta_mls_train_mnemonic = "living_area_v3/20220331_cnn_train_all"
ta_mls_test_mnemonic = "living_area_v3/20220331_cnn_test_all"
cubicasa_test_mnemonic = "pj_living_area_dev_v5/20231115_cubicasa_ortho_survey_id_test"
# Misspelled name kept as an alias in case another cell still refers to it.
cubicasa_test_mneomonic = cubicasa_test_mnemonic


def _select_target_columns(attr_df, target_source_column):
    """Return the join keys plus a ``gla_target`` column as a new dataframe.

    Parameters
    ----------
    attr_df : pandas.DataFrame
        Dataframe obtained from ``AttrGeomSet.to_pandas()``.
    target_source_column : str
        Name of the column whose values are exposed as ``gla_target``. It is
        kept in the result as well; when it already is ``"gla_target"`` the
        column is simply carried over.

    Returns
    -------
    pandas.DataFrame
        ``geometry``, ``imagery_source``, ``imagery_date``, the source column
        and ``gla_target``.
    """
    selected = ["geometry", "imagery_source", "imagery_date", target_source_column]
    # .assign builds a new frame rather than writing a column into a
    # column-subset slice, which can raise SettingWithCopyWarning on pandas
    # versions without copy-on-write.
    return attr_df[selected].assign(
        gla_target=lambda frame: frame[target_source_column]
    )


ta_mls_train_ds = AttrGeomSet.from_mnemonic(ta_mls_train_mnemonic)
ta_mls_test_ds = AttrGeomSet.from_mnemonic(ta_mls_test_mnemonic)
cubicasa_test_ds = AttrGeomSet.from_mnemonic(cubicasa_test_mnemonic)

# The TA/MLS sets carry the target as "gla_sqft"; expose it as "gla_target"
# so all three frames share the same target column name.
ta_mls_train_df = _select_target_columns(ta_mls_train_ds.to_pandas(), "gla_sqft")
ta_mls_test_df = _select_target_columns(ta_mls_test_ds.to_pandas(), "gla_sqft")

# The Cubicasa set already provides a "gla_target" column.
cubicasa_test_df = _select_target_columns(cubicasa_test_ds.to_pandas(), "gla_target")


In [9]:
# Attach the targets to the embeddings by matching on the shared key columns.
# NOTE: each frame is rebound to its merged result, so re-running this cell
# without first re-running the split cell would merge the targets a second time.
merge_keys = ["geometry", "imagery_source", "imagery_date"]

df_train = df_train.merge(ta_mls_train_df, how="inner", on=merge_keys)
df_test = df_test.merge(ta_mls_test_df, how="inner", on=merge_keys)

df_eval = df_eval.merge(cubicasa_test_df, how="inner", on=merge_keys)

# Modeling

In [23]:
# Work on a fixed-seed 10% random sample of the merged training rows.
subsample_kwargs = {"frac": 0.1, "random_state": 42}
df_train_use = df_train.sample(**subsample_kwargs)
print(df_train_use.shape)

(23705, 2054)


In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the subsample as a validation set (fixed seed).
df_use_train, df_use_val = train_test_split(
    df_train_use,
    test_size=0.2,
    random_state=42,
)

In [25]:
def _to_xy(frame):
    """Return ``(features, target)`` numpy arrays taken from ``frame``.

    Features are the ``embedding_columns``; the target is ``gla_target``.
    """
    return frame[embedding_columns].to_numpy(), frame["gla_target"].to_numpy()


# Training data.
x_train, y_train = _to_xy(df_use_train)

# Validation data: for choosing the best model during a single training run
# and for preventing overfitting.
x_val, y_val = _to_xy(df_use_val)

# Test data: for choosing the best hyperparameter configuration.
x_test, y_test = _to_xy(df_test)

# Evaluation data: only for final evaluation.
x_eval, y_eval = _to_xy(df_eval)

In [26]:
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

In [28]:
# CatBoost configuration, gathered in one place so it is easy to tweak.
catboost_params = {
    "iterations": 1000,            # upper bound on boosting iterations
    "loss_function": "MAE",        # objective optimised during training
    "eval_metric": "MAE",          # metric reported on the eval set
    "use_best_model": True,        # keep the iteration that scored best on the eval set
    "early_stopping_rounds": 100,  # stop after 100 iterations without eval improvement
    "random_state": 42,
    "verbose": 100,                # log a progress line every 100 iterations
}

reg = CatBoostRegressor(**catboost_params)

In [29]:
# Fit on the training arrays; the validation pair is passed as the eval set
# that the regressor's MAE eval metric is reported on.
validation_set = (x_val, y_val)
reg.fit(x_train, y_train, eval_set=validation_set)

0:	learn: 644.8835828	test: 639.6064451	best: 639.6064451 (0)	total: 246ms	remaining: 4m 5s
100:	learn: 417.3160551	test: 424.4096615	best: 424.4096615 (100)	total: 18.5s	remaining: 2m 45s
200:	learn: 381.5169575	test: 396.8859329	best: 396.8859329 (200)	total: 37.5s	remaining: 2m 28s
300:	learn: 364.0827707	test: 386.1006790	best: 386.1006790 (300)	total: 55.9s	remaining: 2m 9s
400:	learn: 349.8105586	test: 379.5393160	best: 379.5393160 (400)	total: 1m 13s	remaining: 1m 49s
500:	learn: 337.4981752	test: 374.8832350	best: 374.8832350 (500)	total: 1m 30s	remaining: 1m 29s
600:	learn: 327.3635953	test: 371.7029401	best: 371.7029401 (600)	total: 1m 47s	remaining: 1m 11s
700:	learn: 319.0723615	test: 369.5948568	best: 369.5948568 (700)	total: 2m 4s	remaining: 52.9s
800:	learn: 311.9754143	test: 368.0950492	best: 368.0950492 (800)	total: 2m 21s	remaining: 35s
900:	learn: 305.3810969	test: 366.9314345	best: 366.9314345 (900)	total: 2m 37s	remaining: 17.3s
999:	learn: 299.6257890	test: 365.77

<catboost.core.CatBoostRegressor at 0x7fd6e7f8d340>

In [30]:
from pj_living_area_dev_v5.transmo_development.utils.catboost_metrics import metrics_factory

ModuleNotFoundError: No module named 'transmo'