# Execute the Graph-Based Spatial Cross-Validation experiments from ICMLA21 Paper

## 1 - Initialize libraries

In [1]:
import os
import pandas as pd
import numpy as np
from weka.core import jvm
from src import utils
from src.pipeline import Pipeline
from src.visualization.performance import VizMetrics
from src.visualization.dependence import VizDependence
import warnings
# Ignore every warning raised from here on, from any library and of any category
# (presumably to keep cell outputs readable — note this also hides deprecation notices).
warnings.filterwarnings("ignore")



## 2 - Initialize loggers

In [2]:
# Run the project's three logging initialisers one after another, in a fixed order
# (judging by their names: coloured logs, rich tracebacks, base logging setup).
for initializer in (
    utils.initialize_coloredlog,
    utils.initialize_rich_tracerback,
    utils.initialize_logging,
):
    initializer()

## 3 - Initialize working path and environment variables

In [3]:
# Project root: the current working directory with its last five characters removed.
# NOTE(review): the fixed 5-character slice presumably strips the name of the folder
# this notebook lives in — confirm, since it breaks silently if that folder is renamed.
notebook_dir = os.path.abspath('')
project_dir = notebook_dir[:-5]
# Read the project's environment variables, looked up from the project root
env_var = utils.load_env_variables(project_dir)

## 4 - Set pipeline switchers (the default is to set every process to True)

In [4]:
# Flags handed to the pipeline as `switchers`; presumably each key turns one
# pipeline stage on (True) or off (False).
SWITCHERS = dict(
    scv=False,
    fs=False,
    train=True,
    predict=True,
    evaluate=False,
)

## 5 - List all datasets

In [5]:
# Sampled Brazil election datasets (prob0.1), one per removed region.
brazil_removed_datasets = [
    f"Brazil_Election_2018_Sampled_dec0.3_prob0.1_removed_{region}"
    for region in ("north", "northeast", "south", "southeast", "centerwest")
]

# Sampled Brazil election datasets by sampling probability.
# Only 0.2, 0.4, 0.6 and 0.8 are enabled; 0.1, 0.3, 0.5, 0.7 and 0.9 are
# disabled for this run (add them to the tuple below to re-enable them).
brazil_datasets = [
    f"Brazil_Election_2018_Sampled_dec0.3_prob{prob}"
    for prob in ("0.2", "0.4", "0.6", "0.8")
]

# US yield datasets.
us_datasets = [
    "US_Corn_Yield_2016",
    "US_Wheat_Yield_2014",
]

# Brazil election datasets, one per removed geo-economical region.
brazil_geoeconomical_regions = [
    f"Brazil_Election_2018_removed_{region}"
    for region in ("AMAZONIA", "NORDESTE", "CENTRO_SUL")
]

# Single-dataset list, handy for a quick run.
single = ["Brazil_Election_2018_Sampled_dec0.3_prob0.4"]

## 6 - Run the pipeline for each method
Note: the results and files generated by the pipeline are written to a `Results` folder that is created inside the data directory.

### 6.2 RegGBSCV

In [6]:
# Run the RegGBSCV pipeline once for every (dataset, ML method, kappa) combination.
dataset_list = brazil_datasets
fs_method = "CFS"
ml_methods = ["KNN", "OLS", "Lasso", "Ridge", "ElasticNet", "DT", "LGBM", "RF", "MLP", "SVM"]
kappas = [0.0, 0.2, 0.4, 0.6, 0.8]

# The Weka JVM is started only when CFS feature selection is actually switched on.
use_jvm = fs_method == "CFS" and SWITCHERS["fs"]
if use_jvm:
    jvm.start()

try:
    for dataset in dataset_list:
        dataset_dir = os.path.join(env_var["root_path"], dataset)

        # Load data
        path = os.path.join(dataset_dir, "data.csv")
        data = pd.read_csv(path, index_col="INDEX", low_memory=False)
        # Drop whichever coordinate columns are present. errors="ignore" replaces the
        # old try/except, which dropped nothing at all when only one column was missing.
        data = data.drop(columns=["[GEO]_LATITUDE", "[GEO]_LONGITUDE"], errors="ignore")

        # Load the two matrices; the first CSV column of each becomes its index
        adj_matrix = pd.read_csv(os.path.join(dataset_dir, "queen_matrix.csv"), low_memory=False)
        w_matrix = pd.read_csv(os.path.join(dataset_dir, "normd_matrix.csv"), low_memory=False)
        adj_matrix = adj_matrix.set_index(adj_matrix.columns[0])
        w_matrix = w_matrix.set_index(w_matrix.columns[0])

        for ml_method in ml_methods:
            for kappa in kappas:
                # Instantiate the pipeline for this configuration
                pipeline = Pipeline(
                    root_path=dataset_dir,
                    data=data,
                    adj_matrix=adj_matrix,
                    w_matrix=w_matrix,
                    index_col="INDEX",
                    fold_col="INDEX_FOLDS",
                    target_col="TARGET",
                    scv_method="RegGBSCV",
                    type_graph="Weighted",
                    run_selection=False,
                    kappa=kappa,
                    fs_method=fs_method,
                    ml_method=ml_method,
                    paper=False,
                    switchers=SWITCHERS
                )
                print(f"Running the RegGBSCV SCV approach for dataset: {dataset} kappa {kappa} ML Method {ml_method}")
                pipeline.run()
finally:
    # Stop the JVM even when a run raises, so a failed experiment does not leave it running.
    # NOTE(review): the Weka wrapper reportedly cannot restart a stopped JVM in the same
    # process — confirm before re-running this cell with "fs" enabled without a kernel restart.
    if use_jvm:
        jvm.stop()




Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.2 kappa 0.0 complex_param 1


Training model: 100%|██████████| 27/27 [00:03<00:00,  8.17it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 50.28it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.2 kappa 0.2 complex_param 1


Training model: 100%|██████████| 27/27 [00:03<00:00,  7.25it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 50.99it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.2 kappa 0.4 complex_param 1


Training model: 100%|██████████| 27/27 [00:03<00:00,  7.85it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 50.64it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.2 kappa 0.6 complex_param 1


Training model: 100%|██████████| 27/27 [00:03<00:00,  7.46it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 38.66it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.2 kappa 0.8 complex_param 1


Training model: 100%|██████████| 27/27 [00:02<00:00,  9.85it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 52.00it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.4 kappa 0.0 complex_param 1


Training model: 100%|██████████| 27/27 [00:02<00:00, 10.73it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 69.87it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.4 kappa 0.2 complex_param 1


Training model: 100%|██████████| 27/27 [00:02<00:00, 11.43it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 68.79it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.4 kappa 0.4 complex_param 1


Training model: 100%|██████████| 27/27 [00:01<00:00, 15.28it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 38.83it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.4 kappa 0.6 complex_param 1


Training model: 100%|██████████| 27/27 [00:01<00:00, 17.46it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 50.68it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.4 kappa 0.8 complex_param 1


Training model: 100%|██████████| 27/27 [00:01<00:00, 14.87it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 59.02it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.6 kappa 0.0 complex_param 1


Training model: 100%|██████████| 27/27 [00:01<00:00, 19.86it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 77.71it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.6 kappa 0.2 complex_param 1


Training model: 100%|██████████| 27/27 [00:01<00:00, 22.10it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 85.62it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.6 kappa 0.4 complex_param 1


Training model: 100%|██████████| 27/27 [00:01<00:00, 18.86it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 89.20it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.6 kappa 0.6 complex_param 1


Training model: 100%|██████████| 27/27 [00:04<00:00,  6.47it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 35.09it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.6 kappa 0.8 complex_param 1


Training model: 100%|██████████| 27/27 [00:03<00:00,  8.61it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 56.62it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.8 kappa 0.0 complex_param 1


Training model: 100%|██████████| 27/27 [00:00<00:00, 31.90it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 67.20it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.8 kappa 0.2 complex_param 1


Training model: 100%|██████████| 27/27 [00:00<00:00, 32.93it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 97.80it/s] 


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.8 kappa 0.4 complex_param 1


Training model: 100%|██████████| 27/27 [00:00<00:00, 30.66it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 102.82it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.8 kappa 0.6 complex_param 1


Training model: 100%|██████████| 27/27 [00:01<00:00, 26.56it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 42.83it/s]


Running the RegGBSCV SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.8 kappa 0.8 complex_param 1


Training model: 100%|██████████| 27/27 [00:00<00:00, 29.85it/s]
Predicting test set: 100%|██████████| 27/27 [00:00<00:00, 72.85it/s]
