# Execute the Graph-Based Spatial Cross-Validation experiments from ICMLA21 Paper

## 1 - Initialize libraries

In [1]:
import os
import pandas as pd
from src import utils
from src.pipeline import Pipeline
from src.visualization.performance import VizMetrics
from src.visualization.dependence import VizDependence

## 2 - Initialize loggers

In [2]:
# Run the logger initializers from `src.utils`, one after another,
# in the order listed below.
for _init_logger in (
    utils.initialize_coloredlog,
    utils.initialize_rich_tracerback,
    utils.initialize_logging,
):
    _init_logger()

## 3 - Initialize working path and environmental variables

In [3]:
# Project path: the parent directory of the current working directory.
# os.path.dirname replaces the former `[:-5]` slice, which only produced the
# parent directory when the working folder's name was exactly four characters.
project_dir = os.path.dirname(os.path.abspath(''))
# Load environmental variables
env_var = utils.load_env_variables(project_dir)

## 4 - Load the data

In [4]:
# Locations of the dataset and of the adjacency matrix under the root path
data_path, adj_path = (
    os.path.join(env_var["root_path"], file_name)
    for file_name in ("data.csv", "queen_matrix.csv")
)
# Read the dataset, using its INDEX column as the row index
# NOTE: the [GEO]_LATITUDE / [GEO]_LONGITUDE columns are not dropped here.
data = pd.read_csv(data_path, low_memory=False, index_col="INDEX")
# Read the adjacency matrix, then promote its first column to the row index
adj_matrix = pd.read_csv(adj_path, low_memory=False)
adj_matrix = adj_matrix.set_index(adj_matrix.columns[0])

Index(['INDEX_FOLDS', 'TARGET', '[CENSUS]_DOMICILIO01_V002',
       '[CENSUS]_DOMICILIO01_V003', '[CENSUS]_DOMICILIO01_V004',
       '[CENSUS]_DOMICILIO01_V005', '[CENSUS]_DOMICILIO01_V006',
       '[CENSUS]_DOMICILIO01_V007', '[CENSUS]_DOMICILIO01_V008',
       '[CENSUS]_DOMICILIO01_V009',
       ...
       '[CENSUS]_RESPONSAVELRENDA_V123', '[CENSUS]_RESPONSAVELRENDA_V124',
       '[CENSUS]_RESPONSAVELRENDA_V125', '[CENSUS]_RESPONSAVELRENDA_V126',
       '[CENSUS]_RESPONSAVELRENDA_V127', '[CENSUS]_RESPONSAVELRENDA_V128',
       '[CENSUS]_RESPONSAVELRENDA_V129', '[CENSUS]_RESPONSAVELRENDA_V130',
       '[CENSUS]_RESPONSAVELRENDA_V131', '[CENSUS]_RESPONSAVELRENDA_V132'],
      dtype='object', length=4000)

## 5 - Set pipeline switchers (by default, every process is set to True)

In [5]:
# Pipeline switchers: one flag per pipeline process, all switched on
SWITCHERS = dict.fromkeys(
    ("scv", "fs", "train", "predict", "evaluate"),
    True,
)

## 6 - Run the pipeline for each method
OBS: The results and files generated by the pipeline execution are stored in the Results folder, which is created inside the data directory.

### 6.1 Ultra-Conservative
OBS: We set the parameter `fast` to True so that the semivariogram calculation step, which can take 24h, is skipped. Instead, we calculate the removal buffer by considering the n-degree neighborhood with n = 27, as stated in the paper.

In [None]:
# Configure the pipeline for the UltraConservative spatial cross-validation
# method, with CFS feature selection and an LGBM model.
UltraConservative = Pipeline(
    root_path=env_var["root_path"],
    data=data,
    adj_matrix=adj_matrix,
    index_col="INDEX",
    fold_col="INDEX_FOLDS",
    target_col="TARGET",
    scv_method="UltraConservative",
    fs_method="CFS",
    ml_method="LGBM",
    # fast=True skips the semivariogram calculation (which can take ~24h), as
    # described in the note for this section; it was previously False, which
    # contradicted that note.
    fast=True,
    switchers=SWITCHERS,
)

print("Running the UltraConservative approach...")
UltraConservative.run()