# Execute the Graph-Based Spatial Cross-Validation experiments from the ICMLA21 paper

## 1 - Initialize libraries

In [13]:
import os
import pandas as pd
import geopandas as gpd
from weka.core import jvm
from src import utils
from src.pipeline import Pipeline
from src.visualization.performance import VizMetrics
from src.visualization.dependence import VizDependence

## 2 - Initialize loggers

In [14]:
# Run the logging/traceback initializers from src.utils, in this exact order.
# (Names suggest colored logs, rich tracebacks and base logging setup;
# see src/utils for what each one actually configures.)
for _initialize in (
    utils.initialize_coloredlog,
    utils.initialize_rich_tracerback,
    utils.initialize_logging,
):
    _initialize()

## 3 - Initialize working path and environmental variables

In [15]:
# Resolve the project directory from the notebook's working directory.
# NOTE(review): the slice removes the last 5 characters of the absolute path,
# presumably to strip the notebook sub-folder -- confirm this still holds if
# the notebook is moved or its folder is renamed.
notebook_dir = os.path.abspath('')
project_dir = notebook_dir[:-5]
# Load the environment variables for that project directory
env_var = utils.load_env_variables(project_dir)

## 4 - Set pipeline switchers (by default, all processes are set to True)

In [16]:
# Pipeline stage switches handed to every Pipeline instance via `switchers=`.
# Here only training and prediction are enabled; the "fs" flag additionally
# decides whether the JVM is started for CFS feature selection.
SWITCHERS = dict(
    scv=False,
    fs=False,
    train=True,
    predict=True,
    evaluate=False,
)

## 5 - List all datasets

In [17]:
# Common prefix of the sampled Brazil election dataset folder names
_SAMPLED_PREFIX = "Brazil_Election_2018_Sampled_dec0.3"

# Sampled datasets (prob0.1) with one macro-region removed
brazil_removed_datasets = [
    f"{_SAMPLED_PREFIX}_prob0.1_removed_{region}"
    for region in ("north", "northeast", "south", "southeast", "centerwest")
]

# Sampled datasets for prob0.5 through prob0.9
brazil_datasets = [
    f"{_SAMPLED_PREFIX}_prob0.{tenth}"
    for tenth in range(5, 10)
]

# Full datasets with one geoeconomical region removed
brazil_geoeconomical_regions = [
    f"Brazil_Election_2018_removed_{region}"
    for region in ("AMAZONIA", "NORDESTE", "CENTRO_SUL")
]

# US crop-yield datasets
us_datasets = [
    "US_Corn_Yield_2016",
    "US_Wheat_Yield_2014",
]

# Single dataset for quick smoke runs
just_for_test = [
    "Brazil_Election_2018_removed_CENTRO_SUL",
]

## 6 - Runs the pipeline for each method
OBS: The results and files generated by the pipeline execution are written to the `Results` folder, which is created inside the data directory.

### 6.1 Traditional SCV
OBS: We set the parameter `fast` to True so that the semivariogram calculation step, which can take 24 h, is skipped. We calculate the removing buffer by considering the 27 n-degree neighborhood, as stated in the paper.

In [18]:
# Experiment configuration: datasets, feature-selection method and learners
dataset_list = brazil_datasets
fs_method = "CFS"
ml_methods = ["KNN", "OLS", "Lasso", "Ridge", "ElasticNet", "DT", "LGBM", "RF", "MLP", "SVM"]

# File names and index columns used inside every dataset folder
meshblocks_filename = "meshblocks.shp"
meshblocks_idx = "code_muni"
data_idx = "INDEX"
# NOTE(review): hard-coded, machine-specific path that overrides the value
# returned by utils.load_env_variables -- adjust before running elsewhere.
env_var["root_path"] = "/home/tpinho/IJGIS/Datasets/"

# The Weka JVM is only needed when CFS feature selection is switched on
use_jvm = fs_method == "CFS" and SWITCHERS["fs"]
if use_jvm:
    jvm.start()
try:
    for dataset in dataset_list:
        dataset_dir = os.path.join(env_var["root_path"], dataset)
        data_path = os.path.join(dataset_dir, "data.csv")
        meshblocks_path = os.path.join(dataset_dir, "meshblocks", meshblocks_filename)

        # Load data
        data = pd.read_csv(data_path, index_col=data_idx, low_memory=False)
        # Drop whichever coordinate columns are present; errors="ignore" also
        # covers the case where only one of the two exists in the CSV.
        data = data.drop(columns=["[GEO]_LATITUDE", "[GEO]_LONGITUDE"], errors="ignore")

        # Load the meshblocks and index them by their code column
        meshblocks = gpd.read_file(meshblocks_path)
        meshblocks.set_index(meshblocks_idx, inplace=True)

        for ml_method in ml_methods:
            # One pipeline per (dataset, learner) pair; which stages actually
            # run is controlled by SWITCHERS.
            TraditionalSCV = Pipeline(
                root_path=dataset_dir,
                data=data,
                meshblocks=meshblocks,
                index_col=data_idx,
                fold_col="INDEX_FOLDS",
                target_col="TARGET",
                scv_method="TraditionalSCV",
                fs_method=fs_method,
                ml_method=ml_method,
                switchers=SWITCHERS,
            )

            print(f"Running the Traditional SCV approach for dataset: {dataset} ML Method = {ml_method}")
            TraditionalSCV.run()
finally:
    # Always shut the JVM down, even if a pipeline run raised
    if use_jvm:
        jvm.stop()

Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.5 ML Method = KNN


Training model: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 72.28it/s]
Predicting test set: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 69.14it/s]


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.5 ML Method = OLS


Training model: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 46.93it/s]
Predicting test set: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 76.82it/s]


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.5 ML Method = Lasso


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
Training model: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 28.92it/s]
Predicting test set: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 109.57it/s]


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.5 ML Method = Ridge


Training model: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 67.17it/s]
Predicting test set: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 109.73it/s]


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.5 ML Method = ElasticNet


Training model: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 69.71it/s]
Predicting test set: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 126.52it/s]


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.5 ML Method = DT


Training model: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 28.26it/s]
Predicting test set: 100%|███████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 124.35it/s]


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.5 ML Method = LGBM


Training model: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:17<00:00,  1.50it/s]
Predicting test set: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 40.52it/s]


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.5 ML Method = RF


Training model: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [01:24<00:00,  3.15s/it]
Predicting test set: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:01<00:00, 21.38it/s]


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.5 ML Method = MLP


Training model: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [03:15<00:00,  7.25s/it]
Predicting test set: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 67.23it/s]


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.5 ML Method = SVM


Training model: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 36.62it/s]
Predicting test set: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 27/27 [00:00<00:00, 90.43it/s]


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.6 ML Method = KNN


Training model:  48%|██████████████████████████████████████████████▋                                                  | 13/27 [00:00<00:00, 50.77it/s]
