# Execute the Graph-Based Spatial Cross-Validation experiments from the ICMLA21 paper

## 1 - Initialize libraries

In [13]:
import os
import pandas as pd
import geopandas as gpd
from weka.core import jvm
from pathlib import Path
from src import utils
from src.pipeline import Pipeline
from src.visualization.performance import VizMetrics
from src.visualization.dependence import VizDependence

## 2 - Initialize loggers

In [14]:
# Set up log and traceback output through the project's `utils` helpers.
# NOTE(review): what each helper installs is defined in src/utils, not here;
# judging by the names: colored log output, rich tracebacks, and the base
# logging configuration — confirm. The call order is kept as written.
utils.initialize_coloredlog()
utils.initialize_rich_tracerback()
utils.initialize_logging()

## 3 - Initialize working path and environment variables

In [15]:
# Project root: two levels above the current working directory
working_dir = Path().resolve()
project_dir = str(working_dir.parents[1])
# Load environment variables for the project
env_var = utils.load_env_variables(project_dir)
# Load the validation parameters of the selected dataset
dataset = "Brazil_Election_2018"
parameters_file = os.path.join(project_dir, "parameters", "validation", f"{dataset}.json")
parameters = utils.load_json(parameters_file)

## 4 - Set pipeline switchers (the default is to set all processes to True)

In [16]:
# Per-stage flags handed to the pipeline through its `switchers` argument.
# NOTE(review): presumably True means "run this stage" — confirm in src.pipeline.
SWITCHERS = dict(
    scv=True,
    fs=True,
    train=True,
    predict=False,
    evaluate=False,
)

## 5 - List all datasets

In [17]:
# Collect the dataset variants: every sub-folder of the dataset directory
# except "Original", sorted by name.
dataset_path = os.path.join(env_var["root_path"], dataset)
# "Original" is excluded in the filter instead of via list.remove(), which
# would raise ValueError when that folder is not present.
dataset_list = sorted(
    folder
    for folder in os.listdir(dataset_path)
    if folder != "Original" and os.path.isdir(os.path.join(dataset_path, folder))
)

## 6 - Run the pipeline for each method
OBS: The results and files generated by the pipeline execution are written to the `Results` folder, which is created in the data directory.

### 6.1 Traditional SCV
OBS: We set the parameter `fast` to True so that the semivariogram calculation step, which can take 24h, is skipped. We calculate the removing buffer by considering the 27 n-degree neighborhood, as stated in the paper.

In [18]:
# Run the Traditional SCV pipeline once per dataset variant and ML method.
fs_method = parameters["fs_method"]
ml_methods = parameters["ml_methods"]
# Set paths
meshblocks_filename = parameters["meshblock"]
meshblocks_idx = parameters["meshblock_id"]
dataset_path = os.path.join(env_var["root_path"], dataset)

# The JVM is only needed when CFS feature selection is going to run. The
# condition is evaluated once so that start() and stop() are always paired.
use_jvm = fs_method == "CFS" and SWITCHERS["fs"]
if use_jvm:
    jvm.start()
# The loop variable is deliberately not named `dataset`: reusing that name
# would overwrite the base dataset name, so re-running this cell (or any cell
# that rebuilds `dataset_path` from `dataset`) would point at the wrong folder.
for subset in dataset_list:
    subset_root = os.path.join(dataset_path, subset)
    data_path = os.path.join(subset_root, "data.csv")
    meshblocks_path = os.path.join(subset_root, "meshblocks", meshblocks_filename)
    # Load data
    data = pd.read_csv(data_path, index_col=parameters["index_col"], low_memory=False)
    meshblocks = gpd.read_file(meshblocks_path)
    # Remove unnecessary columns (skipped when the list is empty)
    if parameters["cols_remove"]:
        data = data.drop(columns=parameters["cols_remove"])
    # Index the meshblocks by their identifier column
    meshblocks = meshblocks.set_index(meshblocks_idx)
    for ml_method in ml_methods:
        # Build and run one pipeline per (dataset variant, ML method) pair
        TraditionalSCV = Pipeline(
            root_path=subset_root,
            data=data,
            meshblocks=meshblocks,
            index_col=parameters["index_col"],
            fold_col=parameters["fold_col"],
            target_col=parameters["target_col"],
            scv_method="TraditionalSCV",
            fs_method=fs_method,
            ml_method=ml_method,
            switchers=SWITCHERS,
        )

        print(f"Running the Traditional SCV approach for dataset: {subset} ML Method = {ml_method}")
        TraditionalSCV.run()
if use_jvm:
    jvm.stop()

2022-05-02 14:32:32 labics01 weka.core.jvm[6102] INFO JVM already running, call jvm.stop() first


Running the Traditional SCV approach for dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.1 ML Method = KNN


Creating folds: 100%|██████████| 27/27 [00:07<00:00,  3.46it/s]


Execution time: 19.85424304008484 seconds


Selecting Features:   4%|▎         | 1/27 [05:50<2:31:46, 350.25s/it]