# Execute the Graph-Based Spatial Cross-Validation experiments from ICMLA21 Paper

## 1 - Initialize libraries

In [1]:
import os
import pandas as pd
import geopandas as gpd
from weka.core import jvm
from pathlib import Path
from src import utils
from src.pipeline import Pipeline
from src.visualization.performance import VizMetrics
from src.visualization.dependence import VizDependence

You can install them with  `pip install urbanaccess pandana` or `conda install -c udst pandana urbanaccess`
  warn(


## 2 - Initialize loggers

In [2]:
# Run the project's logging/traceback initialisers, in this exact order
for _initializer in (
    utils.initialize_coloredlog,
    utils.initialize_rich_tracerback,
    utils.initialize_logging,
):
    _initializer()

## 3 - Initialize working path and environmental variables

In [3]:
# Name of the experiment; it also names the JSON parameter file loaded below
dataset = "US_Wheat_Yield_2014"
# Project path: two directory levels above the current working directory
project_dir = str(Path().resolve().parents[1])
# Load environmental variables through the project helper
env_var = utils.load_env_variables(project_dir)
# Load parameters from <project_dir>/parameters/validation/<dataset>.json
parameters_file = Path(project_dir, "parameters", "validation", f"{dataset}.json")
parameters = utils.load_json(str(parameters_file))

## 4 - Set pipeline switchers, the default is to set True to all processes

In [4]:
# Flags handed to Pipeline(switchers=...); presumably one on/off switch per
# pipeline stage (True = run, False = skip) -- confirm against src.pipeline.
SWITCHERS = dict(
    scv=True,
    fs=True,
    train=True,
    predict=False,
    evaluate=False,
)

## 5 - List all datasets

In [5]:
# Folder of the selected experiment inside the configured data root
dataset_path = os.path.join(env_var["root_path"], dataset)
# Names of all sub-folders except "Original", sorted alphabetically.
# "Original" is filtered out while building the list (instead of calling
# list.remove afterwards) so a data directory without that folder does not
# abort the notebook with a ValueError.
dataset_list = sorted(
    folder
    for folder in os.listdir(dataset_path)
    if folder != "Original" and os.path.isdir(os.path.join(dataset_path, folder))
)

## 6 - Runs the pipeline for each method
OBS: The results and files generated from the pipeline execution will be in the created folder Results in the data directory

### 6.1 Traditional SCV
OBS: We set the parameter `fast` to True so that the semivariogram calculation step, which can take 24h, is skipped. We calculate the removing buffer by considering the 27 n-degree neighborhood, as stated in the paper.

In [6]:
# Feature-selection method and list of ML methods come from the parameter file
fs_method = parameters["fs_method"]
ml_methods = parameters["ml_methods"]
# Set paths
meshblocks_filename = parameters["meshblock"]
meshblocks_idx = parameters["meshblock_id"]
dataset_path = os.path.join(env_var["root_path"], dataset)

# The JVM is only started when CFS feature selection is going to run.
# The condition is evaluated once so that start and stop are always paired.
needs_jvm = fs_method == "CFS" and SWITCHERS["fs"]
if needs_jvm:
    jvm.start()
try:
    # NOTE: the loop variable is deliberately NOT called `dataset`. That name
    # holds the experiment name used to build `dataset_path` above; reusing it
    # here would leave it pointing at the last sub-folder and break any re-run
    # of this cell or of the dataset-listing cell.
    for dataset_name in dataset_list:
        dataset_root = os.path.join(dataset_path, dataset_name)
        data_path = os.path.join(dataset_root, "data.csv")
        meshblocks_path = os.path.join(dataset_root, "meshblocks", meshblocks_filename)
        # Load data
        data = pd.read_csv(data_path, index_col=parameters["index_col"], low_memory=False)
        meshblocks = gpd.read_file(meshblocks_path)
        # Remove unnecessary cols
        if parameters["cols_remove"]:
            data.drop(columns=parameters["cols_remove"], inplace=True)
        # Set meshblocks index
        if meshblocks_idx:
            meshblocks.set_index(meshblocks_idx, inplace=True)
        # The same loaded data/meshblocks are reused for every ML method
        for ml_method in ml_methods:
            # Run pipeline
            TraditionalSCV = Pipeline(
                root_path=dataset_root,
                data=data,
                meshblocks=meshblocks,
                index_col=parameters["index_col"],
                fold_col=parameters["fold_col"],
                target_col=parameters["target_col"],
                scv_method="TraditionalSCV",
                fs_method=fs_method,
                ml_method=ml_method,
                switchers=SWITCHERS,
            )

            print(f"Running the Traditional SCV approach for dataset: {dataset_name} ML Method = {ml_method}")
            TraditionalSCV.run()
finally:
    # Shut the JVM down even when a pipeline run raises, mirroring the
    # behaviour of a successful run.
    if needs_jvm:
        jvm.stop()

Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Kansas ML Method = KNN


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 60.05it/s]


Execution time: 2.508533000946045 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 593.65it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 104.97it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Kansas ML Method = OLS


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 62.13it/s]


Execution time: 0.8855555057525635 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 603.21it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 54.08it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Kansas ML Method = Lasso


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 61.38it/s]


Execution time: 0.8406765460968018 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 581.15it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 158.47it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Kansas ML Method = Ridge


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 62.43it/s]


Execution time: 0.741438627243042 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 614.01it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 126.60it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Kansas ML Method = ElasticNet


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 62.63it/s]


Execution time: 0.8247361183166504 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 612.08it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 166.89it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Kansas ML Method = DT


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 62.94it/s]


Execution time: 0.7371425628662109 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 614.87it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 57.40it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Kansas ML Method = LGBM


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 62.64it/s]


Execution time: 0.7334127426147461 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 616.56it/s]
Training model: 100%|██████████| 4/4 [00:01<00:00,  2.92it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Kansas ML Method = RF


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 62.87it/s]


Execution time: 0.8020420074462891 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 616.74it/s]
Training model: 100%|██████████| 4/4 [00:07<00:00,  1.98s/it]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Kansas ML Method = MLP


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 61.85it/s]


Execution time: 0.7335798740386963 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 569.03it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 25.46it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Kansas ML Method = SVM


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 62.21it/s]


Execution time: 0.7438068389892578 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 612.91it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 55.23it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Montana ML Method = KNN


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 47.95it/s]


Execution time: 0.8669068813323975 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 613.74it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 134.20it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Montana ML Method = OLS


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 47.46it/s]


Execution time: 0.86155104637146 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 621.17it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 141.47it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Montana ML Method = Lasso


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 47.55it/s]


Execution time: 0.8807542324066162 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 580.19it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 150.11it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Montana ML Method = Ridge


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 46.67it/s]


Execution time: 0.8701663017272949 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 580.99it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 168.37it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Montana ML Method = ElasticNet


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 47.35it/s]


Execution time: 0.8759937286376953 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 614.89it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 166.30it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Montana ML Method = DT


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 47.01it/s]


Execution time: 0.9031124114990234 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 618.58it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 69.62it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Montana ML Method = LGBM


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 47.62it/s]


Execution time: 0.8890023231506348 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 624.50it/s]
Training model: 100%|██████████| 4/4 [00:01<00:00,  3.63it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Montana ML Method = RF


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 48.01it/s]


Execution time: 0.9485106468200684 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 624.36it/s]
Training model: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Montana ML Method = MLP


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 47.64it/s]


Execution time: 0.8656635284423828 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 623.53it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 31.28it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Montana ML Method = SVM


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 46.66it/s]


Execution time: 0.8689858913421631 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 624.66it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 61.01it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma ML Method = KNN


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 53.91it/s]


Execution time: 0.7943837642669678 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 584.55it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 68.91it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma ML Method = OLS


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 54.45it/s]


Execution time: 0.7788825035095215 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 581.11it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 163.65it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma ML Method = Lasso


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 52.98it/s]


Execution time: 0.783928632736206 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 574.80it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 153.15it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma ML Method = Ridge


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 54.65it/s]


Execution time: 0.7903592586517334 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 606.75it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 173.31it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma ML Method = ElasticNet


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 54.77it/s]


Execution time: 0.7815029621124268 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 604.76it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 159.67it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma ML Method = DT


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 53.98it/s]


Execution time: 0.7885010242462158 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 599.59it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 42.61it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma ML Method = LGBM


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 53.92it/s]


Execution time: 0.7891597747802734 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 605.24it/s]
Training model: 100%|██████████| 4/4 [00:01<00:00,  2.73it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma ML Method = RF


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 54.31it/s]


Execution time: 0.8602325916290283 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 598.99it/s]
Training model: 100%|██████████| 4/4 [00:10<00:00,  2.74s/it]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma ML Method = MLP


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 54.81it/s]


Execution time: 0.795365571975708 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 591.58it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 22.90it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma ML Method = SVM


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 53.94it/s]


Execution time: 0.774226188659668 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 595.80it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 37.96it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Texas ML Method = KNN


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 74.40it/s]


Execution time: 0.6618592739105225 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 615.05it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 126.55it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Texas ML Method = OLS


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 75.02it/s]


Execution time: 0.6586129665374756 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 623.97it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 127.22it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Texas ML Method = Lasso


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 74.50it/s]


Execution time: 0.6622052192687988 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 633.99it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 162.93it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Texas ML Method = Ridge


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 74.77it/s]


Execution time: 0.6596360206604004 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 630.20it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 172.87it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Texas ML Method = ElasticNet


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 74.38it/s]


Execution time: 0.6769058704376221 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 629.35it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 164.90it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Texas ML Method = DT


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 74.63it/s]


Execution time: 0.6752634048461914 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 631.86it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 64.23it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Texas ML Method = LGBM


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 75.39it/s]


Execution time: 0.6665060520172119 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 634.85it/s]
Training model: 100%|██████████| 4/4 [00:01<00:00,  3.02it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Texas ML Method = RF


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 74.83it/s]


Execution time: 0.747455358505249 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 635.19it/s]
Training model: 100%|██████████| 4/4 [00:07<00:00,  1.76s/it]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Texas ML Method = MLP


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 72.85it/s]


Execution time: 0.6585066318511963 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 634.54it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 27.14it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Texas ML Method = SVM


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 75.19it/s]


Execution time: 0.6492893695831299 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 636.22it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 85.63it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Washington ML Method = KNN


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 44.77it/s]


Execution time: 1.0491876602172852 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 623.67it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 143.36it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Washington ML Method = OLS


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 44.81it/s]


Execution time: 0.9121520519256592 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 618.70it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 128.50it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Washington ML Method = Lasso


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 44.69it/s]


Execution time: 0.9041943550109863 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 619.47it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 164.83it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Washington ML Method = Ridge


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 43.94it/s]


Execution time: 0.9240236282348633 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 615.66it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 151.00it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Washington ML Method = ElasticNet


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 44.65it/s]


Execution time: 0.9263880252838135 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 597.63it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 167.94it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Washington ML Method = DT


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 45.15it/s]


Execution time: 0.9196758270263672 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 610.06it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 83.62it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Washington ML Method = LGBM


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 45.06it/s]


Execution time: 0.905123233795166 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 614.21it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00,  5.25it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Washington ML Method = RF


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 45.01it/s]


Execution time: 0.9963335990905762 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 611.70it/s]
Training model: 100%|██████████| 4/4 [00:04<00:00,  1.19s/it]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Washington ML Method = MLP


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 45.14it/s]


Execution time: 0.918790340423584 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 608.51it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 32.93it/s]


Running the Traditional SCV approach for dataset: US_Wheat_2014_Removed_Washington ML Method = SVM


Creating folds: 100%|██████████| 4/4 [00:00<00:00, 45.24it/s]


Execution time: 0.9074578285217285 seconds


Selecting Features: 100%|██████████| 4/4 [00:00<00:00, 601.27it/s]
Training model: 100%|██████████| 4/4 [00:00<00:00, 85.26it/s]
