# Extract Performances

In [1]:
import glob
import os
import re
import pandas as pd
import numpy as np
from collections import defaultdict

## Define DefaultOrderedDict

In [2]:
from collections import OrderedDict
# ``Callable`` lives in ``collections.abc``; the alias in ``collections``
# was removed in Python 3.10.
from collections.abc import Callable


class DefaultOrderedDict(OrderedDict):
    """An ``OrderedDict`` that creates missing values on first access.

    Combines the insertion-order guarantees of ``OrderedDict`` with the
    ``default_factory`` behaviour of ``collections.defaultdict``.

    Source: http://stackoverflow.com/a/6190500/562769
    """

    def __init__(self, default_factory=None, *a, **kw):
        """Create the mapping.

        Args:
            default_factory: Zero-argument callable used to build the value
                for a missing key, or ``None`` to raise ``KeyError`` instead.
            *a, **kw: Forwarded unchanged to ``OrderedDict.__init__``.

        Raises:
            TypeError: If ``default_factory`` is neither ``None`` nor callable.
        """
        if (default_factory is not None and
                not isinstance(default_factory, Callable)):
            raise TypeError('first argument must be callable')
        OrderedDict.__init__(self, *a, **kw)
        self.default_factory = default_factory

    def __getitem__(self, key):
        """Return ``self[key]``, creating it via ``__missing__`` if absent."""
        try:
            return OrderedDict.__getitem__(self, key)
        except KeyError:
            return self.__missing__(key)

    def __missing__(self, key):
        """Store and return a fresh default for ``key``.

        Raises:
            KeyError: If no ``default_factory`` was configured.
        """
        if self.default_factory is None:
            raise KeyError(key)
        self[key] = value = self.default_factory()
        return value

    def __reduce__(self):
        """Support pickling: rebuild from the factory, then replay the items."""
        if self.default_factory is None:
            args = tuple()
        else:
            args = self.default_factory,
        # The fifth element must be an *iterator* of (key, value) pairs; a
        # bare ``items()`` view is rejected by the pickler.
        return type(self), args, None, None, iter(self.items())

    def copy(self):
        """Return a shallow copy that keeps the same ``default_factory``."""
        return self.__copy__()

    def __copy__(self):
        """Shallow copy: same factory, same value objects, same order."""
        return type(self)(self.default_factory, self)

    def __deepcopy__(self, memo):
        """Deep copy: same factory, recursively copied keys and values."""
        import copy
        # Materialise the items first: an ``items()`` view cannot itself be
        # deep-copied. ``memo`` is passed on so shared references stay shared.
        return type(self)(self.default_factory,
                          copy.deepcopy(list(self.items()), memo))

    def __repr__(self):
        """Return a debugging representation that names the actual class."""
        return '%s(%s, %s)' % (type(self).__name__,
                               self.default_factory,
                               OrderedDict.__repr__(self))

  """Entry point for launching an IPython kernel.


## Extract mean best scores among all random seeds

In [139]:
# Directory whose result logs are aggregated by the cells below.
input_dir = './results/baseline/nl50'
# Alternative experiment directories (uncomment one to switch):
# input_dir = './results/fixmatch/nl50'
# input_dir = './results/fixmatch&focal/nl50/0.5'

# Output location and file name for the aggregated table.
out_dir = './results'
out_fname = 'test.csv'

# exist_ok=True creates the directory only when needed, without the
# check-then-create race of a separate isdir() test.
os.makedirs(out_dir, exist_ok=True)

In [144]:

# Group the result-file paths by experimental condition; each condition
# becomes one row of the final table.
# glob returns paths in arbitrary, filesystem-dependent order, so sort them
# to make the grouping (and the preview below) reproducible.
ori_data_paths = sorted(glob.glob(f'{input_dir}/*.txt'))
data_paths_dict, fm_cols = DefaultOrderedDict(list), []
for ori_data_path in ori_data_paths:
    # NOTE: the test is on the full path, so a 'baseline' directory name
    # also routes every file in it to the baseline group.
    if 'baseline' in ori_data_path:
        cond = 'baseline'
    else:  # FixMatch
        # basename() copes with whichever path separator glob produced.
        cond = os.path.basename(ori_data_path)
        # Drop the random-seed suffix for seeds 0-3 so runs that differ only
        # by seed share one key. A seed-4 suffix is deliberately left in
        # place (original note: "exclude the case of the random seed 4
        # because of performance"), so such runs are not merged with these.
        for r in range(4):
            cond = cond.replace(f'-r{r}', '')

    data_paths_dict[cond].append(ori_data_path)
    # fm_cols records the conditions in first-seen order.
    if cond not in fm_cols:
        fm_cols.append(cond)

# Notebook preview of the first baseline path; raises IndexError when no
# baseline logs were found.
data_paths_dict['baseline'][0]

'./results/baseline/nl50/baseline-b16-nl50-m1-lb1.0-th1.0-sharpFalse-T1.0-focalFalse-fg1.0-optSGD-lr0.001-mom0.9-sc:step-r1.txt'

In [145]:
# Matches a value written as 0.dddd or 1.dddd (exactly four decimals kept).
# Raw string so that ``\d`` is not treated as a string escape.
p = re.compile(r'1[.]\d{4}|0[.]\d{4}')

cols = ('acc-all', 'ppv-all', 'ppv-covid', 'ppv-pneumonia', 'ppv-normal',
        'recall-all', 'recall-covid', 'recall-pneumonia', 'recall-normal',
        'f1-all', 'f1-covid', 'f1-pneumonia', 'f1-normal')

# One DataFrame per condition; each row holds the metrics parsed from one
# line window of one result file.
metrics_dfs = {}
for col, data_paths in data_paths_dict.items():  # load the txt files one by one
    rows = []
    for ori_data_path in data_paths:
        with open(ori_data_path, 'r') as f:
            # Read the whole file and split it into lines.
            all_data = f.read().split('\n')

        # Hard-coded (start, end) line windows; per the original note these
        # are the "best result" sections of the log. Further windows were
        # disabled in the original:
        #   baseline: (1332, 1337), (1667, 1672)
        #   FixMatch: (1412, 1417), (1767, 1772)
        if 'baseline' in ori_data_path:
            candidates = ((327, 332), (662, 667), (997, 1002),)
        else:
            candidates = ((347, 352), (702, 707), (1057, 1062),)

        for s_idx, e_idx in candidates:
            metric_line = all_data[s_idx:e_idx]
            # Keep only the tokens that match the regular expression.
            values = list(map(float, p.findall(str(metric_line))))
            if len(values) != len(cols):
                # Fail loudly instead of building a misaligned/padded row.
                raise ValueError(
                    f'{ori_data_path}: expected {len(cols)} metric values in '
                    f'lines {s_idx}-{e_idx}, found {len(values)}')
            rows.append(values)

    # Build the frame once with a float dtype rather than repeatedly
    # concatenating onto an empty object-dtype frame.
    metrics_dfs[col] = pd.DataFrame(rows, columns=cols, dtype=float)

# Summarise every condition as "mean (±std)" per metric. ``ndarray.std``
# uses NumPy's default ddof=0, i.e. the population standard deviation.
summary_rows = []
for fm_col in fm_cols:
    final_metrics = []
    for col in cols:
        best_metrics = metrics_dfs[fm_col][col].to_numpy()
        mean, std = best_metrics.mean(), best_metrics.std()
        final_metrics.append(f'{mean:.4f} (±{std:.4f})')
    summary_rows.append(final_metrics)

# ``DataFrame.append`` was removed in pandas 2.0; construct the table in one
# step, indexed by condition name.
final_metrics_df = pd.DataFrame(summary_rows, columns=cols, index=fm_cols)

final_metrics_df

Unnamed: 0,acc-all,ppv-all,ppv-covid,ppv-pneumonia,ppv-normal,recall-all,recall-covid,recall-pneumonia,recall-normal,f1-all,f1-covid,f1-pneumonia,f1-normal
baseline,0.8525 (±0.0197),0.8590 (±0.0146),0.9138 (±0.0248),0.8274 (±0.0637),0.8357 (±0.0349),0.8525 (±0.0197),0.8367 (±0.0578),0.8750 (±0.0328),0.8458 (±0.0528),0.8528 (±0.0196),0.8718 (±0.0251),0.8479 (±0.0227),0.8388 (±0.0187)


In [74]:
# final_metrics_df.to_csv(f'{out_dir}/{out_fname}')