# Execute the Graph-Based Spatial Cross-Validation experiments from ICMLA21 Paper

## 1 - Initialize libraries

In [13]:
import contextlib
import os
import warnings

import numpy as np
import pandas as pd
from weka.core import jvm

from src import utils
from src.pipeline import Pipeline
from src.visualization.performance import VizMetrics
from src.visualization.dependence import VizDependence

warnings.filterwarnings("ignore")

## 2 - Initialize loggers

In [14]:
# Set up logging for the notebook session through the project's `utils` helpers.
# NOTE(review): the helpers are defined in src/utils and are not shown here;
# judging by their names they configure colored log output, rich tracebacks and
# the base logging setup — confirm before changing the call order.
utils.initialize_coloredlog()
utils.initialize_rich_tracerback()
utils.initialize_logging()

## 3 - Initialize working path and environment variables

In [15]:
# Project path: the current working directory with its last five characters
# removed.
# NOTE(review): presumably this strips a trailing "<separator><4-letter folder>"
# to reach the repository root — confirm; the slice silently yields a wrong path
# when the notebook is started from a folder whose name has a different length.
working_dir = os.path.abspath('')
project_dir = working_dir[:-5]
# Load environment variables for the project rooted at `project_dir`
env_var = utils.load_env_variables(project_dir)

## 4 - Set pipeline switchers (the default is True for every process; this run enables only `scv`)

In [16]:
# Pipeline switchers: one boolean flag per entry, handed to `Pipeline` through
# its `switchers` argument. Only `scv` is enabled here.
SWITCHERS = dict(
    scv=True,
    fs=False,
    train=False,
    predict=False,
    evaluate=False,
)

## 5 - List all datasets

In [17]:
# Every list below holds dataset folder names; the names are generated from the
# part that varies so the shared prefix is written only once.

# Brazil 2018 election sample (dec0.3, prob0.1), one variant per "removed_*" suffix.
brazil_removed_datasets = [
    f"Brazil_Election_2018_Sampled_dec0.3_prob0.1_removed_{region}"
    for region in ("north", "northeast", "south", "southeast", "centerwest")
]

# Brazil 2018 election samples for prob0.4 .. prob0.9.
# The prob0.1 - prob0.3 variants are deliberately excluded from this list.
brazil_datasets = [
    f"Brazil_Election_2018_Sampled_dec0.3_prob0.{tenth}"
    for tenth in range(4, 10)
]

# Australia 2019 election samples for prob0.1 .. prob0.9.
australia_datasets = [
    f"Australia_Election_2019_Sampled_dec0.05_prob0.{tenth}"
    for tenth in range(1, 10)
]

# Brazil 2018 election data, one variant per "removed_*" suffix.
brazil_geoeconomical_regions = [
    f"Brazil_Election_2018_removed_{region}"
    for region in ("AMAZONIA", "NORDESTE", "CENTRO_SUL")
]

# US corn yield 2016, one variant per "Removed_*" suffix.
us_corn_datasets = [
    f"US_Corn_Yield_2016_Removed_{region}"
    for region in ("Northeast", "Southeast", "Midwest", "Southwest", "West")
]

# US wheat 2014, one variant per "Removed_*" suffix.
us_wheat_datasets = [
    f"US_Wheat_2014_Removed_{state}"
    for state in ("Kansas", "Montana", "Oklahoma", "Texas", "Washington")
]

# Single-dataset list, handy for a quick run.
single = ["Brazil_Election_2018_Sampled_dec0.3_prob0.5"]

## 6 - Run the pipeline for each method
Note: the results and files generated by the pipeline execution are placed in a `Results` folder created inside the data directory.

### 6.2 RegGBSCV

In [18]:
# Run the RegGBSCV pipeline for every (dataset, ML method, kappa) combination.
dataset_list = us_wheat_datasets
fs_method = "All"
# Full set of supported learners, for reference:
# ["KNN", "OLS", "Lasso", "Ridge", "ElasticNet", "DT", "LGBM", "RF", "MLP", "SVM"]
ml_methods = ["KNN"]
# Kappa values swept for each dataset / ML method pair (written out literally so
# the printed values stay exact).
kappas = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# Geographic helper columns removed from the data before it reaches the pipeline.
geo_columns = ["[GEO]_LATITUDE", "[GEO]_LONGITUDE", "[GEO]_DIVISIONNM"]

# The JVM is started only when CFS feature selection is both selected and
# switched on; the same flag decides whether it has to be stopped afterwards.
needs_jvm = fs_method == "CFS" and SWITCHERS["fs"]
if needs_jvm:
    jvm.start()

try:
    # NOTE(review): hard-coded, machine-specific absolute path that overrides the
    # value loaded from the environment; kept for compatibility — consider
    # supplying it through the environment configuration instead.
    env_var["root_path"] = "/home/tpinho/IJGIS/Datasets/US_Wheat_Yield_2014"
    for dataset in dataset_list:
        dataset_dir = os.path.join(env_var["root_path"], dataset)

        # Load data
        path = os.path.join(dataset_dir, "data.csv")
        data = pd.read_csv(path, index_col="INDEX", low_memory=False)
        # errors="ignore" drops whichever of the geo columns are present; a
        # dataset missing just one of them still gets the others removed.
        data = data.drop(columns=geo_columns, errors="ignore")

        # Load the matrix once; the first column of the file becomes the index.
        # NOTE(review): the adjacency and the weight matrix are both taken from
        # "normd_matrix.csv" — confirm the adjacency matrix is not meant to come
        # from a different file.
        adj_matrix = pd.read_csv(
            os.path.join(dataset_dir, "normd_matrix.csv"), low_memory=False
        )
        adj_matrix = adj_matrix.set_index(adj_matrix.columns[0])
        # Independent copy, so changes made to one frame never reach the other.
        w_matrix = adj_matrix.copy()

        for ml_method in ml_methods:
            for kappa in kappas:
                # Instantiate the pipeline
                pipeline = Pipeline(
                    root_path=dataset_dir,
                    data=data,
                    adj_matrix=adj_matrix,
                    w_matrix=w_matrix,
                    index_col="INDEX",
                    fold_col="INDEX_FOLDS",
                    target_col="TARGET",
                    scv_method="RegGBSCV",
                    type_graph="Weighted",
                    run_selection=False,
                    kappa=kappa,
                    fs_method=fs_method,
                    ml_method=ml_method,
                    paper=False,
                    switchers=SWITCHERS
                )
                print(f"Running the RegGBSCV SCV approach for dataset: {dataset} kappa {kappa} ML Method {ml_method}")
                pipeline.run()
finally:
    # Stop the JVM even when a run fails, so a started JVM is never left behind.
    if needs_jvm:
        jvm.stop()




Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 0.0 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.84s/it]


Execution time: 7.448563814163208 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 0.1 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.84s/it]


Execution time: 7.386594533920288 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 0.2 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.85s/it]


Execution time: 7.398498296737671 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 0.3 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.82s/it]


Execution time: 7.3119988441467285 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 0.4 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.82s/it]


Execution time: 7.302840232849121 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 0.5 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.85s/it]


Execution time: 7.415990352630615 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 0.6 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.82s/it]


Execution time: 7.2981157302856445 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 0.7 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.82s/it]


Execution time: 7.294617176055908 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 0.8 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.85s/it]


Execution time: 7.42519998550415 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 0.9 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.83s/it]


Execution time: 7.323801755905151 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Kansas kappa 1.0 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.82s/it]


Execution time: 7.304224967956543 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 0.0 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:09<00:00,  2.26s/it]


Execution time: 9.06947922706604 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 0.1 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:08<00:00,  2.24s/it]


Execution time: 8.968487977981567 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 0.2 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:09<00:00,  2.26s/it]


Execution time: 9.071960687637329 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 0.3 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:08<00:00,  2.23s/it]


Execution time: 8.940437078475952 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 0.4 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:09<00:00,  2.26s/it]


Execution time: 9.053532838821411 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 0.5 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:08<00:00,  2.23s/it]


Execution time: 8.944868087768555 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 0.6 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:09<00:00,  2.26s/it]


Execution time: 9.056759119033813 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 0.7 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:08<00:00,  2.23s/it]


Execution time: 8.932307481765747 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 0.8 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:08<00:00,  2.23s/it]


Execution time: 8.932498693466187 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 0.9 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:09<00:00,  2.26s/it]


Execution time: 9.050633668899536 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Montana kappa 1.0 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:08<00:00,  2.23s/it]


Execution time: 8.935766458511353 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 0.0 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:08<00:00,  2.02s/it]


Execution time: 8.11112117767334 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 0.1 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  2.00s/it]


Execution time: 8.077019214630127 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 0.2 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.99s/it]


Execution time: 8.014124393463135 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 0.3 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:08<00:00,  2.02s/it]


Execution time: 8.098705530166626 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 0.4 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.99s/it]


Execution time: 7.961683750152588 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 0.5 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:08<00:00,  2.02s/it]


Execution time: 8.07626485824585 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 0.6 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.98s/it]


Execution time: 7.9392242431640625 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 0.7 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.99s/it]


Execution time: 7.9763336181640625 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 0.8 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:08<00:00,  2.02s/it]


Execution time: 8.108788251876831 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 0.9 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.99s/it]


Execution time: 7.9819536209106445 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Oklahoma kappa 1.0 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:07<00:00,  1.99s/it]


Execution time: 7.9576640129089355 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 0.0 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.57s/it]


Execution time: 6.349148511886597 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 0.1 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]


Execution time: 6.112866640090942 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 0.2 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]


Execution time: 6.113943576812744 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 0.3 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]


Execution time: 6.113433599472046 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 0.4 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.55s/it]


Execution time: 6.2083420753479 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 0.5 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]


Execution time: 6.112289905548096 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 0.6 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]


Execution time: 6.083211898803711 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 0.7 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]


Execution time: 6.094363212585449 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 0.8 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.55s/it]


Execution time: 6.216880559921265 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 0.9 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:05<00:00,  1.33s/it]


Execution time: 5.33000636100769 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Texas kappa 1.0 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:04<00:00,  1.05s/it]


Execution time: 4.1966753005981445 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 0.0 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.64s/it]


Execution time: 6.576699256896973 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 0.1 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.65s/it]


Execution time: 6.602611541748047 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 0.2 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.61s/it]


Execution time: 6.4676172733306885 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 0.3 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.64s/it]


Execution time: 6.558104753494263 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 0.4 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.62s/it]


Execution time: 6.492345094680786 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 0.5 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.63s/it]


Execution time: 6.541157245635986 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 0.6 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.63s/it]


Execution time: 6.54681658744812 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 0.7 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.61s/it]


Execution time: 6.450157165527344 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 0.8 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.62s/it]


Execution time: 6.501342535018921 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 0.9 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.64s/it]


Execution time: 6.560135126113892 seconds
Running the RegGBSCV SCV approach for dataset: US_Wheat_2014_Removed_Washington kappa 1.0 ML Method KNN


Creating folds: 100%|██████████| 4/4 [00:06<00:00,  1.62s/it]

Execution time: 6.491204023361206 seconds



