# Execute the Graph-Based Spatial Cross-Validation experiments from the ICMLA21 paper

## 1 - Initialize libraries

In [7]:
import os
import pandas as pd
from weka.core import jvm
from pathlib import Path
from src import utils
from src.pipeline import Pipeline
from src.visualization.performance import VizMetrics
from src.visualization.dependence import VizDependence

## 2 - Initialize loggers

In [8]:
# Set up logging/traceback handling before any pipeline code runs.
# The three helpers are defined in src.utils (not visible in this notebook);
# judging by their names they configure colored log output, rich tracebacks
# and the base logging setup -- TODO confirm against src/utils.
utils.initialize_coloredlog()
utils.initialize_rich_tracerback()
utils.initialize_logging()

## 3 - Initialize working path and environmental variables

In [9]:
# Project root: the directory two levels above the current working directory
working_dir = Path().resolve()
project_dir = str(working_dir.parents[1])

# Environmental variables, loaded by the project helper from the project root
env_var = utils.load_env_variables(project_dir)

# Dataset name and its validation parameters,
# read from <project_dir>/parameters/validation/<dataset>.json
dataset = "Brazil_Election_2018"
parameters_file = os.path.join(project_dir, "parameters", "validation", f"{dataset}.json")
parameters = utils.load_json(parameters_file)

## 4 - Set pipeline switchers (by default, all processes are set to True)

In [10]:
# Pipeline switchers: one boolean flag per process, passed to Pipeline below.
# In this run only the "fs" flag is enabled; every other flag is off.
switchers = dict(
    scv=False,
    fs=True,
    train=False,
    predict=False,
    evaluate=False,
)

## 5 - List all datasets

In [11]:
# Directory that holds every variant of the selected dataset
dataset_path = os.path.join(env_var["root_path"], dataset)

# Each sub-folder is one dataset variant. The "Original" folder is excluded by
# filtering instead of list.remove(), so a missing "Original" folder no longer
# raises ValueError. The result is sorted by folder name.
dataset_list = sorted(
    folder
    for folder in os.listdir(dataset_path)
    if folder != "Original" and os.path.isdir(os.path.join(dataset_path, folder))
)

## 6 - Run the pipeline for the Optimistic approach
Note: The results and files generated by the pipeline execution will be stored in the `Results` folder created in the data directory.

In [12]:

# Feature-selection method and ML methods come from the dataset's parameter file
fs_method = parameters["fs_method"]
ml_methods = parameters["ml_methods"]
dataset_path = os.path.join(env_var["root_path"], dataset)

# The JVM is only started when CFS feature selection is actually going to run
use_jvm = fs_method == "CFS" and switchers["fs"]
if use_jvm:
    jvm.start()

try:
    # The loop variable is deliberately NOT named `dataset`: reusing that name
    # would overwrite the parent dataset name defined earlier and make
    # `dataset_path` wrong whenever a cell is re-run.
    for dataset_name in dataset_list:
        # Load the data
        data_path = os.path.join(dataset_path, dataset_name, "data.csv")
        data = pd.read_csv(data_path, index_col="INDEX", low_memory=False)
        # Drop the columns listed in the parameters, if any
        if parameters["cols_remove"]:
            data = data.drop(columns=parameters["cols_remove"])
        # Run one pipeline per ML method on this dataset variant
        for ml_method in ml_methods:
            CrossValidation = Pipeline(
                root_path=os.path.join(dataset_path, dataset_name),
                data=data,
                index_col=parameters["index_col"],
                fold_col=parameters["fold_col"],
                target_col=parameters["target_col"],
                scv_method="CrossValidation",
                fs_method=fs_method,
                ml_method=ml_method,
                switchers=switchers,
            )
            print(f"Running the Cross-Valdiation approach for dataset: {dataset_name} ML Method = {ml_method}")
            CrossValidation.run()
finally:
    # Always stop the JVM that was started above, even if a pipeline run fails.
    # NOTE(review): the weka wrapper reportedly cannot restart a stopped JVM in
    # the same process, so re-running this cell with CFS may need a kernel
    # restart -- confirm against the python-weka-wrapper documentation.
    if use_jvm:
        jvm.stop()

Running the Cross-Valdiation approach for dataset: US_Corn_Yield_2016_Removed_Northeast ML Method = KNN


Selecting Features: 100%|██████████| 10/10 [00:00<00:00, 190.90it/s]


Running the Cross-Valdiation approach for dataset: US_Corn_Yield_2016_Removed_Southeast ML Method = KNN


Selecting Features: 100%|██████████| 10/10 [00:00<00:00, 201.41it/s]


Running the Cross-Valdiation approach for dataset: US_Corn_Yield_2016_Removed_Midwest ML Method = KNN


Selecting Features: 100%|██████████| 10/10 [00:00<00:00, 158.19it/s]


Running the Cross-Valdiation approach for dataset: US_Corn_Yield_2016_Removed_Southwest ML Method = KNN


Selecting Features: 100%|██████████| 10/10 [00:00<00:00, 158.57it/s]


Running the Cross-Valdiation approach for dataset: US_Corn_Yield_2016_Removed_West ML Method = KNN


Selecting Features: 100%|██████████| 10/10 [00:00<00:00, 167.80it/s]
