In [1]:
import contextlib
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
from tqdm import tqdm
from src import utils

In [2]:
def _mkdir(root_path, folder_name):
    """Creates a folder at current path"""
    # logger = logging.getLogger(self.logger_name)
    cur_dir = os.path.join(root_path, folder_name)
    with contextlib.suppress(FileExistsError):
        os.mkdir(cur_dir)
        # logger.info(f"Entering folder: /{folder_name}")
def _make_folders(root_path, folders):
        """Make the initial folders"""
        for folder in folders:
            _mkdir(root_path, folder)
            root_path = os.path.join(root_path, folder)
        return root_path

In [3]:
# Project path
project_dir = os.path.abspath('')[:-9]
# Load enviromental variables

env_var = utils.load_env_variables(project_dir)
env_var["root_path"] = "/exp/tpinho/Datasets"

In [4]:
datasets_single = ["Brazil_Election_2018"]
few_methods = ["CrossValidation", "Optimistic", "TraditionalSCV"]

datasets = ["Brazil_Election_2018_Sampled_dec0.3_prob0.1",      
            "Brazil_Election_2018_Sampled_dec0.3_prob0.2",
            "Brazil_Election_2018_Sampled_dec0.3_prob0.3",
            "Brazil_Election_2018_Sampled_dec0.3_prob0.4",
            "Brazil_Election_2018_Sampled_dec0.3_prob0.5",
            "Brazil_Election_2018_Sampled_dec0.3_prob0.6",
            "Brazil_Election_2018_Sampled_dec0.3_prob0.7",
            "Brazil_Election_2018_Sampled_dec0.3_prob0.8",
            "Brazil_Election_2018_Sampled_dec0.3_prob0.9",
            ]

scv_methods = ["CrossValidation",
               "Optimistic",
               "RegGBSCV_R_Kappa_0.0",
               "RegGBSCV_R_Kappa_0.1",
               "RegGBSCV_R_Kappa_0.2",
               "RegGBSCV_R_Kappa_0.3",  
               "RegGBSCV_R_Kappa_0.4",
               "RegGBSCV_R_Kappa_0.5",
               "RegGBSCV_R_Kappa_0.6",
               "RegGBSCV_R_Kappa_0.7", 
               "RegGBSCV_R_Kappa_0.8",
               "RegGBSCV_R_Kappa_0.9",
               "RegGBSCV_R_Kappa_1.0",
               "TraditionalSCV"]

In [5]:
fs_method = "CFS"
ml_methods = ["KNN", "OLS", "Lasso", "Ridge", "ElasticNet", "DT", "LGBM", "RF", "MLP", "SVM"]
data_id = "INDEX"

for dataset_folder in datasets:
    rmse_mean = {}
    print(f"Dataset: {dataset_folder}")
    dataset_path = os.path.join(env_var["root_path"], dataset_folder)
    #scv_methods = list(os.listdir(os.path.join(dataset_path, "results")))
    #rmse_mean["Dataset"] = rmse_mean["Dataset"] + [dataset_folder] if rmse_mean.get("Dataset") else [dataset_folder]
    #scv_methods = ["RegGBSCV_R_Kappa_2.0"]
    for ml_method in tqdm(ml_methods):
        rmse_mean["Method"] = rmse_mean["Method"] + [ml_method] if rmse_mean.get("Method") else [ml_method]
        for scv_method in scv_methods:
            predictions_path = os.path.join(dataset_path, "results", scv_method, "predictions", fs_method, f"{ml_method}")
            predi_files = [os.path.join(predictions_path,c) for c in os.listdir(predictions_path)]
            predictions = pd.DataFrame()
            for file in predi_files:
                pred_fold = pd.read_csv(file)
                predictions = pd.concat([predictions, pred_fold])
            predictions["fold"] = predictions[data_id].apply(lambda cod: str(cod)[:2])
            predictions["error"] = (predictions["PREDICTIONS"] - predictions["GROUND_TRUTH"])**2
            predictions = predictions.groupby(by="fold").agg("mean")
            rmse_mean[f"{scv_method}"] = rmse_mean[f"{scv_method}"] +[predictions["error"].mean()] if rmse_mean.get(f"{scv_method}") else [predictions["error"].mean()]
            #rmse_mean[f"{scv_method}_std"] = rmse_mean[f"{scv_method}_std"] +[predictions["error"].std()] if rmse_mean.get(f"{scv_method}_std") else [predictions["error"].std()]
    _make_folders(os.path.join(env_var["root_path"], dataset_folder), ["comparison"])
    df_rmse = pd.DataFrame(rmse_mean)
    df_rmse.set_index("Method", inplace=True)

    df_rmse.to_csv(os.path.join(env_var["root_path"], dataset_folder, "comparison", "RMSE.csv"))
    df_rmse.rank().to_csv(os.path.join(env_var["root_path"], dataset_folder, "comparison", "Rank.csv"))

pd.DataFrame(rmse_mean).rank()



Dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.1


100%|██████████| 10/10 [00:08<00:00,  1.13it/s]


Dataset: Brazil_Election_2018_Sampled_dec0.3_prob0.2


  0%|          | 0/10 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: '/exp/tpinho/Datasets/Brazil_Election_2018_Sampled_dec0.3_prob0.2/results/RegGBSCV_R_Kappa_0.7/predictions/CFS/KNN'