# Предобработка данных

In [4]:
import os
import re
import pandas as pd
from tqdm.notebook import tqdm
import subprocess
from PIL import Image
import numpy as np

Открываем `OpenPart.csv` файл, в котором находятся оценки эксперта, преобразовываем его так, чтобы оценки были одним столбцов. Также добавляем колонки с адресом к файлам экспертной разметки и разметки алгоритма.

In [5]:
df = pd.read_csv('../../dataset/SecretPart.csv')
df.head()

Unnamed: 0,Case,Sample 1,Sample 2,Sample 3
0,00011827_003.png,3,3,3
1,00011925_072.png,5,5,4
2,00012045_019.png,1,5,1
3,00012094_040.png,5,3,1
4,00012174_000.png,4,5,2


In [6]:
def rename_file(num):
    def fun(row):
        row[0] = f'dataset/sample_{num}/{row[0]}'
        row[0] = row[0].replace('.png', f'_s{num}.png')
        return row
    return fun

def rename_expert_file(file_name):
    file_name = 'dataset/Expert/' + file_name[:-4] + '_expert.png'
    return file_name

In [7]:
df_s1 = df.iloc[:,[0,1]]
df_s1 = df_s1.rename(columns={"Case": "pred_mask_path", "Sample 1": "review"})
df_s1['pred_mask_path'].apply(rename_expert_file)
df_s1['true_mask_path'] = df_s1['pred_mask_path'].apply(rename_expert_file)
df_s1 = df_s1.apply(rename_file(1), axis=1)
df_s1 = df_s1[['true_mask_path', 'pred_mask_path', 'review']]

df_s2 = df.iloc[:,[0,1]]
df_s2 = df_s2.rename(columns={"Case": "pred_mask_path", "Sample 1": "review"})
df_s2['true_mask_path'] = df_s2['pred_mask_path'].apply(rename_expert_file)
df_s2 = df_s2.apply(rename_file(2), axis=1)
df_s2 = df_s2[['true_mask_path', 'pred_mask_path', 'review']]

df_s3 = df.iloc[:,[0,1]]
df_s3 = df_s3.rename(columns={"Case": "pred_mask_path", "Sample 1": "review"})
df_s3['true_mask_path'] = df_s3['pred_mask_path'].apply(rename_expert_file)
df_s3 = df_s3.apply(rename_file(3), axis=1)
df_s3 = df_s3[['true_mask_path', 'pred_mask_path', 'review']]

Для каждой пары разметок добавляем пустые колонки с названием [метрик](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4533825/), которые мы будем оценивать

In [8]:
_metrics = [
    'DICE', 'JACRD', 'AUC', 'KAPPA', 'RNDIND', 'ADJRIND', 'ICCORR', 'VOLSMTY', 'MUTINF',
    'HDRFDST', 'MAHLNBS', 'VARINFO', 'GCOERR', 'PROBDST', 'SNSVTY', 'SPCFTY', 'PRCISON',
    'FMEASR', 'ACURCY', 'FALLOUT', 'TP', 'FP', 'TN', 'FN', 'REFVOL', 'SEGVOL'
]

In [9]:
new_df = pd.concat([df_s1, df_s2, df_s3]).reset_index(drop=True)

new_df['true_mask_pixels'] = 0
new_df['pred_mask_pixels'] = 0

for t in _metrics:
    new_df[t] = 0

Функция, которая запускает файл, который высчитывает эти метрики

In [10]:
def calculate_metrics(true_path, pred_path, debug=False):    
    cmd_metrics = ','.join(_metrics)
    
    metrics = subprocess.run(['./scripts/evaluate',
                              true_path,
                              pred_path,
                             '-use', cmd_metrics],
                              cwd=os.path.realpath(os.path.join(os.getcwd(), '..', '..')),
                              capture_output=True)
    
    metrics = metrics.stdout.decode("utf-8").strip()
    if debug:
        print(metrics)
    metrics = re.findall(r"([A-Z]+)\s+=\s([\.\d]+)\s+[\w\(\)\-,\s]+\s?$",
                         metrics, re.MULTILINE)
    
    return metrics

В процессе расчитывания метрик и анализа данных, выяснилось, что бывают такие случаи, когда либо эксперт, либо алгоритм ничего не разметили, соответственно невозможно посчитать метрики, а нам очень важно наличие индикатора пустоты изображения. Поэтому в таблицу вносятся данные о количестве белых пикселей.

In [11]:
def calc_pixels(path):
    img = Image.open(f'../../{path}').convert('L')
    np_img = np.array(img)
    np_img[np_img > 0] = 1
    return np.count_nonzero(np_img)

In [12]:
for i in tqdm(new_df.index, ncols='100%'):
    true_mask_path = new_df.loc[i, ['true_mask_path']][0]
    pred_mask_path = new_df.loc[i, ['pred_mask_path']][0]
    metrics = calculate_metrics(true_mask_path, pred_mask_path)
    
    for metric_name, value in metrics:
        new_df.loc[i,[metric_name]] = value
        
    new_df.loc[i,['true_mask_pixels']] = calc_pixels(true_mask_path)
    new_df.loc[i,['pred_mask_pixels']] = calc_pixels(pred_mask_path)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, layout=Layout(flex='2'), max=120.0), HTML(value='')), …




In [13]:
new_df

Unnamed: 0,true_mask_path,pred_mask_path,review,true_mask_pixels,pred_mask_pixels,DICE,JACRD,AUC,KAPPA,RNDIND,...,PRCISON,FMEASR,ACURCY,FALLOUT,TP,FP,TN,FN,REFVOL,SEGVOL
0,dataset/Expert/00011827_003_expert.png,dataset/sample_1/00011827_003_s1.png,3,0,21047,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,dataset/Expert/00011925_072_expert.png,dataset/sample_1/00011925_072_s1.png,5,162894,154111,0.654488,0.486423,0.789984,0.593016,0.812912,...,0.673138,0.654488,0.895545,0.056875,103738,50373,835309,59156,162894,154111
2,dataset/Expert/00012045_019_expert.png,dataset/sample_1/00012045_019_s1.png,1,0,2539,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,dataset/Expert/00012094_040_expert.png,dataset/sample_1/00012094_040_s1.png,5,122966,214132,0.588304,0.416735,0.841086,0.516229,0.770329,...,0.463070,0.588304,0.867647,0.124214,99158,114974,810636,23808,122966,214132
4,dataset/Expert/00012174_000_expert.png,dataset/sample_1/00012174_000_s1.png,4,103946,67154,0.514693,0.346523,0.699564,0.473743,0.854163,...,0.655687,0.514693,0.920811,0.024477,44032,23122,921508,59914,103946,67154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,dataset/Expert/00019124_090_expert.png,dataset/sample_3/00019124_090_s3.png,5,55609,7372,0.156904,0.085131,0.543202,0.146306,0.903850,...,0.670239,0.156904,0.949361,0.002448,4941,2431,990536,50668,55609,7372
116,dataset/Expert/00019495_004_expert.png,dataset/sample_3/00019495_004_s3.png,2,13523,6150,0.253342,0.145044,0.590372,0.247273,0.972375,...,0.405203,0.253342,0.985991,0.003534,2492,3658,1031395,11031,13523,6150
117,dataset/Expert/00019767_016_expert.png,dataset/sample_3/00019767_016_s3.png,4,30283,27753,0.000000,0.000000,0.486373,0,0.895432,...,0.000000,0.000000,0.944653,0.027254,0,27753,990540,30283,30283,27753
118,dataset/Expert/00020000_000_expert.png,dataset/sample_3/00020000_000_s3.png,2,82502,67028,0.646104,0.477219,0.783067,0.619247,0.904161,...,0.720684,0.646104,0.949533,0.019379,48306,18722,947352,34196,82502,67028


In [14]:
new_df.to_csv('../../corpus/answer_metrics.csv')