In [1]:
import json
import pathlib
import re
import os

import pandas as pd
import numpy as np


In [2]:
def to_latex_table(df: pd.DataFrame):
  mean = []
  std = []

  for _, row in df.iterrows():
    mean.append(f"& ${row['mean_precision']}$ & ${row['mean_recall']}$ & ${row['mean_f1_score']}$")
    std.append(f"& (${row['std_precision']}$) & (${row['std_recall']}$) & (${row['std_f1_score']}$)")

  mean = [mean[1], mean[0], mean[2]]
  std = [std[1], std[0], std[2]]

  return '\n'.join(mean) + '\\\\', '\n'.join(std) + '\\\\'

In [3]:
def open_result_ipynb(path: pathlib.Path):
    with open(path, 'r') as f:
        return json.loads(''.join(f.readlines()))

In [4]:
def parse_result_dataframe(mean_list,
                           std_list,
                           alg,
                           exp,
                           dataset) -> pd.DataFrame:
    mean_precision = float(re.sub('\s+', ' ', mean_list[0].strip()).split()[1])
    mean_recall = float(re.sub('\s+', ' ', mean_list[1].strip()).split()[1])
    mean_f1_score = float(re.sub('\s+', ' ', mean_list[2].strip()).split()[1])
    std_precision = float(re.sub('\s+', ' ', std_list[0].strip()).split()[1])
    std_recall = float(re.sub('\s+', ' ', std_list[1].strip()).split()[1])
    std_f1_score = float(re.sub('\s+', ' ', std_list[2].strip()).split()[1])

    return pd.DataFrame({
        'dataset': [dataset],
        'algorithm': [alg],
        'experiment': [exp],
        'mean_precision': [mean_precision],
        'std_precision': [std_precision],
        'mean_recall': [mean_recall],
        'std_recall': [std_recall],
        'mean_f1_score': [mean_f1_score],
        'std_f1_score': [std_f1_score]
    })


In [5]:
def results_for(df, dataset, alg):

    # Já que o resultado é para um par <dataaset, algoritmo>
    #   podemos remover os demais.
    df_ = df[df['dataset'] == dataset].drop(columns=['dataset'])
    df_ = df_[df_['algorithm'] == alg].drop(columns=['algorithm'])

    # DataFrame auxiliar para calcular os grupos
    df__ = df_.copy()
    df__['experiment'] = df__['experiment'].map({f'EXP{i}-{j}': f'{j}'
                                                 for i in range(1, 6)
                                                 for j in ['ALL', '75', '100']})
    # Calcular variância
    df__['std_precision'] = df__['std_precision'] ** 2
    df__['std_recall'] = df__['std_recall'] ** 2
    df__['std_f1_score'] = df__['std_f1_score'] ** 2

    # Agregar os resultados
    agg = df__.groupby('experiment').aggregate('mean')

    # Recalcular std
    df__['std_precision'] = np.sqrt(df__['std_precision'])
    df__['std_recall'] = np.sqrt(df__['std_recall'])
    df__['std_f1_score'] = np.sqrt(df__['std_f1_score'])

    return df_.sort_values(by='experiment'), agg


In [6]:
def cell_data(cell_dict):
    return cell_dict['outputs'][0]['data']['text/plain']


In [7]:
INDICES_RESULTS = {
    'ann': [10, 11],
    'svm': [14, 15],
    'knn': [18, 19],
    'dtree': [22, 23]
}

dataframe = None

In [8]:
base_dir = pathlib.Path('results').absolute()
dfs = []

for p in base_dir.iterdir():
    if not p.is_file():
        # Caso não seja um arquivo, ir para o próximo
        continue
    """
    name = p.name
    if name.startswith('standardized_'):
        ids = name.replace('.ipynb', '').split('_')
        exp = f'exp{ids[2]}'.lower()
        n = ids[3].lower()

        os.rename(str(p), str(p.parent.joinpath(f'standardized_{exp}_{n}.ipynb')))
    """

    ids = p.name.replace('.ipynb', '').split('_')
    dataset = ids[0]
    exp = f'{ids[1]}-{ids[2]}'.upper()

    if ids[0].startswith('min'):
        dataset = f'{ids[0]}_{ids[1]}'
        exp = f'{ids[2]}-{ids[3]}'.upper()

    data = open_result_ipynb(p)

    for alg in INDICES_RESULTS:
        cells = INDICES_RESULTS[alg]

        mean_list = cell_data(data['cells'][cells[0]])
        std_list = cell_data(data['cells'][cells[1]])
        dfs.append(parse_result_dataframe(mean_list,
                                          std_list,
                                          alg,
                                          exp,
                                          dataset))


df = pd.concat(dfs)

In [9]:
df

Unnamed: 0,dataset,algorithm,experiment,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
0,standardized,ann,EXP1-75,0.742632,0.049927,0.724444,0.055134,0.723809,0.053440
0,standardized,svm,EXP1-75,0.732159,0.048747,0.706667,0.046319,0.705974,0.048554
0,standardized,knn,EXP1-75,0.663942,0.070368,0.630000,0.072776,0.626473,0.073021
0,standardized,dtree,EXP1-75,0.481375,0.038089,0.445556,0.039701,0.443465,0.034980
0,standardized,ann,EXP3-100,0.769035,0.047619,0.744444,0.059720,0.744761,0.058463
...,...,...,...,...,...,...,...,...,...
0,raw,dtree,EXP2-100,0.422849,0.066612,0.403333,0.067087,0.400734,0.063700
0,standardized,ann,EXP1-ALL,0.727357,0.043790,0.708889,0.041837,0.705178,0.043721
0,standardized,svm,EXP1-ALL,0.739651,0.053644,0.716667,0.063559,0.714783,0.063346
0,standardized,knn,EXP1-ALL,0.687903,0.043830,0.645556,0.036458,0.641992,0.039810


# Resultados Baseline

In [10]:
_, agg_ann = results_for(df, 'raw', 'ann')
_, agg_svm = results_for(df, 'raw', 'svm')
_, agg_dtree = results_for(df, 'raw', 'dtree')
_, agg_knn = results_for(df, 'raw', 'knn')

In [11]:
agg_ann.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.4077,0.0246,0.3513,0.0142,0.3152,0.0163
75,0.3971,0.02,0.3473,0.0128,0.3115,0.0137
ALL,0.4593,0.0201,0.3822,0.0101,0.3498,0.0106


In [12]:
m, s = to_latex_table(agg_ann.round(decimals=4))
print(m)
print(s)

& $0.3971$ & $0.3473$ & $0.3115$
& $0.4077$ & $0.3513$ & $0.3152$
& $0.4593$ & $0.3822$ & $0.3498$\\
& ($0.02$) & ($0.0128$) & ($0.0137$)
& ($0.0246$) & ($0.0142$) & ($0.0163$)
& ($0.0201$) & ($0.0101$) & ($0.0106$)\\


In [13]:
agg_svm.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.3634,0.0047,0.3842,0.0027,0.3462,0.0028
75,0.3634,0.0048,0.3847,0.0026,0.3468,0.0027
ALL,0.3641,0.0042,0.3847,0.0024,0.3466,0.0025


In [14]:
m, s = to_latex_table(agg_svm.round(decimals=4))
print(m)
print(s)

& $0.3634$ & $0.3847$ & $0.3468$
& $0.3634$ & $0.3842$ & $0.3462$
& $0.3641$ & $0.3847$ & $0.3466$\\
& ($0.0048$) & ($0.0026$) & ($0.0027$)
& ($0.0047$) & ($0.0027$) & ($0.0028$)
& ($0.0042$) & ($0.0024$) & ($0.0025$)\\


In [15]:
agg_dtree.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.4716,0.003,0.4453,0.0029,0.4412,0.0029
75,0.47,0.0031,0.4424,0.0031,0.4422,0.003
ALL,0.4422,0.0035,0.4178,0.0027,0.4149,0.0029


In [16]:
m, s = to_latex_table(agg_dtree.round(decimals=4))
print(m)
print(s)

& $0.47$ & $0.4424$ & $0.4422$
& $0.4716$ & $0.4453$ & $0.4412$
& $0.4422$ & $0.4178$ & $0.4149$\\
& ($0.0031$) & ($0.0031$) & ($0.003$)
& ($0.003$) & ($0.0029$) & ($0.0029$)
& ($0.0035$) & ($0.0027$) & ($0.0029$)\\


In [17]:
agg_knn.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.4563,0.0046,0.4398,0.0027,0.4272,0.0028
75,0.4475,0.0041,0.4347,0.0022,0.4203,0.0024
ALL,0.4494,0.004,0.4351,0.0026,0.4208,0.0027


In [18]:
m, s = to_latex_table(agg_knn.round(decimals=4))
print(m)
print(s)

& $0.4475$ & $0.4347$ & $0.4203$
& $0.4563$ & $0.4398$ & $0.4272$
& $0.4494$ & $0.4351$ & $0.4208$\\
& ($0.0041$) & ($0.0022$) & ($0.0024$)
& ($0.0046$) & ($0.0027$) & ($0.0028$)
& ($0.004$) & ($0.0026$) & ($0.0027$)\\


# Resultados MinMax

In [19]:
_, agg_ann = results_for(df, 'min_max', 'ann')
_, agg_svm = results_for(df, 'min_max', 'svm')
_, agg_dtree = results_for(df, 'min_max', 'dtree')
_, agg_knn = results_for(df, 'min_max', 'knn')

In [20]:
agg_ann.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.7207,0.0025,0.6956,0.0021,0.6925,0.0022
75,0.7071,0.0024,0.6776,0.0023,0.6747,0.0024
ALL,0.7278,0.0026,0.6967,0.0025,0.6943,0.0028


In [21]:
m, s = to_latex_table(agg_ann.round(decimals=4))
print(m)
print(s)

& $0.7071$ & $0.6776$ & $0.6747$
& $0.7207$ & $0.6956$ & $0.6925$
& $0.7278$ & $0.6967$ & $0.6943$\\
& ($0.0024$) & ($0.0023$) & ($0.0024$)
& ($0.0025$) & ($0.0021$) & ($0.0022$)
& ($0.0026$) & ($0.0025$) & ($0.0028$)\\


In [22]:
agg_svm.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.7505,0.0022,0.7333,0.002,0.7319,0.0021
75,0.7301,0.002,0.7122,0.0017,0.7098,0.0018
ALL,0.7438,0.0024,0.7242,0.0025,0.7229,0.0026


In [23]:
m, s = to_latex_table(agg_svm.round(decimals=4))
print(m)
print(s)

& $0.7301$ & $0.7122$ & $0.7098$
& $0.7505$ & $0.7333$ & $0.7319$
& $0.7438$ & $0.7242$ & $0.7229$\\
& ($0.002$) & ($0.0017$) & ($0.0018$)
& ($0.0022$) & ($0.002$) & ($0.0021$)
& ($0.0024$) & ($0.0025$) & ($0.0026$)\\


In [24]:
agg_dtree.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.4715,0.0039,0.4489,0.0032,0.4469,0.0032
75,0.4719,0.0018,0.4451,0.0015,0.4425,0.0015
ALL,0.4547,0.0032,0.4267,0.0033,0.4246,0.0032


In [25]:
m, s = to_latex_table(agg_dtree.round(decimals=4))
print(m)
print(s)

& $0.4719$ & $0.4451$ & $0.4425$
& $0.4715$ & $0.4489$ & $0.4469$
& $0.4547$ & $0.4267$ & $0.4246$\\
& ($0.0018$) & ($0.0015$) & ($0.0015$)
& ($0.0039$) & ($0.0032$) & ($0.0032$)
& ($0.0032$) & ($0.0033$) & ($0.0032$)\\


In [26]:
agg_knn.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.6841,0.0032,0.6493,0.0026,0.6465,0.0028
75,0.6708,0.0037,0.636,0.003,0.63,0.0032
ALL,0.7126,0.0017,0.6611,0.0014,0.6578,0.0014


In [27]:
m, s = to_latex_table(agg_knn.round(decimals=4))
print(m)
print(s)

& $0.6708$ & $0.636$ & $0.63$
& $0.6841$ & $0.6493$ & $0.6465$
& $0.7126$ & $0.6611$ & $0.6578$\\
& ($0.0037$) & ($0.003$) & ($0.0032$)
& ($0.0032$) & ($0.0026$) & ($0.0028$)
& ($0.0017$) & ($0.0014$) & ($0.0014$)\\


# Resultados standardized

In [28]:
_, agg_ann = results_for(df, 'standardized', 'ann')
_, agg_svm = results_for(df, 'standardized', 'svm')
_, agg_dtree = results_for(df, 'standardized', 'dtree')
_, agg_knn = results_for(df, 'standardized', 'knn')

In [29]:
agg_ann.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.7533,0.0026,0.7362,0.0025,0.7347,0.0025
75,0.7447,0.0017,0.7233,0.0016,0.7223,0.0016
ALL,0.7346,0.002,0.7164,0.0018,0.7141,0.0019


In [30]:
m, s = to_latex_table(agg_ann.round(decimals=4))
print(m)
print(s)

& $0.7447$ & $0.7233$ & $0.7223$
& $0.7533$ & $0.7362$ & $0.7347$
& $0.7346$ & $0.7164$ & $0.7141$\\
& ($0.0017$) & ($0.0016$) & ($0.0016$)
& ($0.0026$) & ($0.0025$) & ($0.0025$)
& ($0.002$) & ($0.0018$) & ($0.0019$)\\


In [31]:
agg_svm.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.7547,0.002,0.7349,0.002,0.734,0.002
75,0.7297,0.002,0.7076,0.0019,0.7067,0.0019
ALL,0.7456,0.0021,0.7231,0.0024,0.7214,0.0025


In [32]:
m, s = to_latex_table(agg_svm.round(decimals=4))
print(m)
print(s)

& $0.7297$ & $0.7076$ & $0.7067$
& $0.7547$ & $0.7349$ & $0.734$
& $0.7456$ & $0.7231$ & $0.7214$\\
& ($0.002$) & ($0.0019$) & ($0.0019$)
& ($0.002$) & ($0.002$) & ($0.002$)
& ($0.0021$) & ($0.0024$) & ($0.0025$)\\


In [33]:
agg_dtree.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.4845,0.0033,0.4538,0.0022,0.4503,0.0024
75,0.4617,0.0019,0.4373,0.0019,0.4341,0.002
ALL,0.4509,0.0033,0.4309,0.0033,0.4263,0.0035


In [34]:
m, s = to_latex_table(agg_dtree.round(decimals=4))
print(m)
print(s)

& $0.4617$ & $0.4373$ & $0.4341$
& $0.4845$ & $0.4538$ & $0.4503$
& $0.4509$ & $0.4309$ & $0.4263$\\
& ($0.0019$) & ($0.0019$) & ($0.002$)
& ($0.0033$) & ($0.0022$) & ($0.0024$)
& ($0.0033$) & ($0.0033$) & ($0.0035$)\\


In [35]:
agg_knn.round(decimals=4)

Unnamed: 0_level_0,mean_precision,std_precision,mean_recall,std_recall,mean_f1_score,std_f1_score
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.6919,0.0032,0.6493,0.0027,0.647,0.0027
75,0.6755,0.0031,0.642,0.0032,0.6387,0.0033
ALL,0.7001,0.0019,0.6542,0.0015,0.6507,0.0015


In [36]:
m, s = to_latex_table(agg_knn.round(decimals=4))
print(m)
print(s)

& $0.6755$ & $0.642$ & $0.6387$
& $0.6919$ & $0.6493$ & $0.647$
& $0.7001$ & $0.6542$ & $0.6507$\\
& ($0.0031$) & ($0.0032$) & ($0.0033$)
& ($0.0032$) & ($0.0027$) & ($0.0027$)
& ($0.0019$) & ($0.0015$) & ($0.0015$)\\
