In [1]:
# Name of the CSV file (under data/) that the label table is written to and later read back from.
file_name = 'nlp_aortic_calcification.csv'

In [2]:
import os
import shutil
from collections import Counter

import pandas as pd

from config import DATASTORE

# Load the '/subset' table from the HDF store and blank out missing values in one
# step, so the string operations used later never see NaN.
subset = pd.read_hdf(os.path.join(DATASTORE, 'store.h5'), '/subset').fillna('')

## Review 100 reports randomly selected from sample10k

In [3]:
# This pattern was designed by observing the reports above.
# - Non-capturing groups (?:...) are used so that `Series.str.contains` does not emit
#   the "This pattern has match groups" UserWarning.
# - `[^.]*?` keeps the "calcifi..." / "aort..." pair inside a single sentence.
# - The last alternative matches the dotted abbreviation "A.S.H.D" literally; the
#   previous `\D` matched *any* non-digit character after "A.S.H.".
pattern = (
    r'(?i)'
    r'calcifi(?:cation|ed)[^.]*?aort(?:a|ic)'
    r'|aort(?:a|ic)[^.]*?calcification'
    r'|arteriosclerotic'
    r'|atherosclero(?:tic|sis)'
    r'|ASHD'
    r'|A\.S\.H\.D'
)

In [5]:
def doctor_profile(dataset, pattern):
    """Profile how often each confirming doctor reports a finding matching ``pattern``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Reports; must provide a 'FINDINGS' text column and a 'CONFIRMDR' column.
    pattern : str
        Regular expression handed to ``Series.str.contains``.

    Returns
    -------
    pandas.DataFrame
        Indexed by doctor ('name') with the columns 'workload' (reports confirmed
        by the doctor in ``dataset``), 'count' (those whose findings match
        ``pattern``) and 'ratio' (count / workload), sorted by 'ratio' descending.
    """
    # na=False: a missing FINDINGS value counts as "no match" instead of putting
    # NaN into the boolean mask.
    pos = dataset[dataset['FINDINGS'].str.contains(pattern, na=False)]
    dr_who_report = pos['CONFIRMDR']
    # The workload must come from the dataset that was passed in, not from the
    # notebook-level `subset`, otherwise the ratio mixes two different populations.
    workload = Counter(dataset['CONFIRMDR']).most_common()
    profile = pd.DataFrame(workload, columns=['name', 'workload']).set_index('name', drop=True)
    report_count = Counter(dr_who_report).most_common()
    report_count = pd.DataFrame(report_count, columns=['name', 'count']).set_index('name', drop=True)
    # Assign the Series (aligned on the doctor index); doctors without any match get 0.
    profile = profile.assign(count=report_count['count']).fillna(0)
    profile = profile.assign(ratio=profile['count'] / profile['workload'])
    return profile.sort_values('ratio', ascending=False)

In [7]:
threshold = 0.4
profile = doctor_profile(subset, pattern)
# Keep doctors whose match ratio exceeds the threshold and who confirmed more than
# 50 reports in `subset`.
sensitive_doctors = profile.loc[(profile['workload'] > 50) & (profile['ratio'] > threshold)]
sensitive_doctors

  """


Unnamed: 0_level_0,workload,count,ratio
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
陳郁惟,52,33.0,0.634615
歐陽翊潔,171,81.0,0.473684
劉武翹,1072,462.0,0.43097


In [8]:
# Keep only the doctor names (the profile's index) for the `isin` filter used later.
# NOTE(review): this rebinds `sensitive_doctors` from a DataFrame to its Index, so
# re-running this cell on its own will presumably fail; re-run the previous cell first.
sensitive_doctors = sensitive_doctors.index

In [9]:
# Load the '/femh' table and blank out missing values so `.str.contains` sees only strings.
full = pd.read_hdf(os.path.join(DATASTORE, 'store.h5'), '/femh')
full = full.fillna('')
# Reports confirmed by one of the doctors selected above.
relaible_reports = full.loc[full['CONFIRMDR'].isin(sensitive_doctors)]
# Reports whose findings match the regex, regardless of who confirmed them.
pos_full = full.loc[full['FINDINGS'].str.contains(pattern)]
len(pos_full)

  This is separate from the ipykernel package so we can avoid doing imports until


84971

In [10]:
# Every regex-positive report gets label 1; `assign` returns a new frame, so
# `pos_full` itself is left untouched.
positive_reports = pos_full.assign(label=1)

In [11]:
# Outer-join the reports confirmed by the selected doctors with the regex-positive
# reports. Only `positive_reports` carries a `label` column, so rows that come
# solely from `relaible_reports` have a missing label after the join.
# NOTE(review): the recorded positive count (85023) is larger than len(pos_full)
# (84971); presumably the join key is not unique on one side - confirm.
pos_neg = pd.merge(relaible_reports, positive_reports, how='outer', left_on='ACCNO', right_index=True)
# Unlabelled rows are the negatives. Assign the result back instead of calling
# `fillna(..., inplace=True)` on the column selection: that form acts on a temporary
# object, is deprecated, and leaves `pos_neg` unchanged under pandas Copy-on-Write.
pos_neg['label'] = pos_neg['label'].fillna(0)
stats = Counter(pos_neg['label'])
# Printed text reads "positive samples: N, negative samples: M".
print('正樣本:{}, 負樣本:{}'.format(stats[1.0], stats[0.0]))

正樣本:85023, 負樣本:30971


In [12]:
# Persist only the `label` column; the frame's index is written as the first CSV column.
pos_neg.loc[:, ['label']].to_csv(os.path.join('data', file_name))

# QC of Image

In [13]:
# Reload the label table saved earlier; the first CSV column is used as the index.
system_output = pd.read_csv(os.path.join('data', file_name), index_col=0)

In [14]:
# Start from empty QC folders: remove whatever a previous run left behind
# (a missing folder is not an error), then recreate both folders.
for qc_dir in ('data/QC/pos', 'data/QC/neg'):
    shutil.rmtree(qc_dir, ignore_errors=True)
for qc_dir in ('data/QC/pos', 'data/QC/neg'):
    os.makedirs(qc_dir)

In [15]:
# Labelled reports whose index also appears in `subset`; the PNGs copied below are
# read from the 'subset' folder under DATASTORE.
has_file = pd.merge(subset, system_output, left_index=True, right_index=True)[['label']]

# Fixed seed so that Restart & Run All copies the same QC images every time.
qc_sample_size = 50
qc_random_state = 0

# One pass per label value instead of two copy-pasted loops.
for label_value, folder in ((1, 'pos'), (0, 'neg')):
    candidates = has_file[has_file['label'] == label_value]
    # `sample` raises when asked for more rows than exist, so cap the sample size.
    picked = candidates.sample(
        n=min(qc_sample_size, len(candidates)),
        random_state=qc_random_state,
    )
    for accno in picked.index:
        src = os.path.join(DATASTORE, 'subset/{}.png'.format(accno))
        dst = 'data/QC/{}/{}.png'.format(folder, accno)
        shutil.copy(src, dst)


# Verify results

In [18]:
# Read the test-set output; the first CSV column is used as the index.
result = pd.read_csv('data/output testset.csv', index_col=0)
# Keep the rows whose index also appears in `subset`, with only the 'cal' and 'prob'
# columns, ordered from highest to lowest 'prob'.
has_file = (
    pd.merge(subset, result, left_index=True, right_index=True)[['cal', 'prob']]
    .sort_values('prob', ascending=False)
)
has_file

Unnamed: 0_level_0,cal,prob
ACCNO,Unnamed: 1_level_1,Unnamed: 2_level_1
RA03430916620002,1,9.997503e-01
RA01C13744050018,1,9.995596e-01
RA02531403660013,1,9.994934e-01
RA04A09116270001,1,9.994019e-01
RA04406724510011,1,9.989448e-01
RA04630656260016,1,9.988459e-01
RA05A07903720015,1,9.984035e-01
RA07101575020014,1,9.983060e-01
RA03416368270004,1,9.979587e-01
RA06309267150623,1,9.978195e-01


In [22]:
# Fixed seed: the copies are named 001.png, 002.png, ... so the only way to trace a
# numbered image back to its report is to be able to reproduce the same sample.
qc_sample_size = 50
qc_random_state = 0

# One pass per value of 'cal' instead of two copy-pasted loops.
for cal_value, folder in ((1, 'data/QC/result_pos'), (0, 'data/QC/result_neg')):
    # Existing folders are reused, not cleared: files from an earlier run are only
    # overwritten when the same number is produced again.
    os.makedirs(folder, exist_ok=True)
    candidates = has_file[has_file['cal'] == cal_value]
    # `sample` raises when asked for more rows than exist, so cap the sample size.
    picked = candidates.sample(
        n=min(qc_sample_size, len(candidates)),
        random_state=qc_random_state,
    )
    for i, accno in enumerate(picked.index, start=1):
        src = os.path.join(DATASTORE, 'subset/{}.png'.format(accno))
        dst = '%s/%03d.png' % (folder, i)
        shutil.copy(src, dst)