In [1]:
file_name = 'nlp_aortic_aneurysm_ct.csv'

In [2]:
import os
import shutil
from collections import Counter

import pandas as pd

from config import DATASTORE

subset = pd.read_hdf(os.path.join(DATASTORE, 'store.h5'), '/femh_ct')
subset = subset.fillna('')

## Review 100 reports randomly selected from the subset

In [3]:
including_pattern = r'(?i)(aort(ic|a)[^\.]*?aneurysm|aneurysm[^\.]*?aort(ic|a))'
excluding_pattern = r'(?i)no[^\.]*(dissection|aneurysm)'
excluding_pattern2 = r'(?i)(abdominal[^\.]*aneurysm|dissect)'

In [4]:
pos = subset[subset['IMPRESSIONS'].str.contains(including_pattern) & 
                  ~subset['IMPRESSIONS'].str.contains(excluding_pattern) &
                  ~subset['IMPRESSIONS'].str.contains(excluding_pattern2)]

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
neg = subset[subset['IMPRESSIONS'].str.contains(excluding_pattern)]

  """Entry point for launching an IPython kernel.


In [6]:
label = pd.concat([pos.assign(label=1)[['label']], neg.assign(label=0)[['label']]])
stats = Counter(label['label'])
print('正樣本:{}, 負樣本:{}'.format(stats[1.0], stats[0.0]))

正樣本:546, 負樣本:2083


In [7]:
samples = pos.sample(50)
for row in samples['IMPRESSIONS']:
    print('='*50)
    print(row)

Eccentrical aneurysma of the aortic arch.$Aneurysm of ascending aorta.$Marked atherosclerosis.$Coronary heart disease.
Post-operative change, Fusiform aortic aneurysm in thoracic descending aorta, without significant interval change.
1. A tiny perifissural nodule adjacent to minor fissure in RUL of lung.$2. Mild aneurysmal dilatation of ascending aorta. Atherosclerosis and coronary artery disease.$3. Gallbladder stones.$4. Degenerative change of the spine with marginal spur formation. $$Measurement of pulmonary solid nodule is based on RECIST 1.1, the largest diameter in an axial plane. RECIST: Response Evaluation Criteria in Solid Tumors$$If solid nodules or ground-glass nodules and semisolid nodules equal to or more than 6 mm, suggest further evaluation in chest outpatient clinic.$If infiltrated lesion or any mass lesions suspicious for malignancy, please reevaluate the lesion at the chest or chest surgical outpatient clinic.$$Remarks: A majority of the nodules less than 8 mm, detect

In [8]:
label.to_csv(os.path.join('data', file_name))

# Concat the aortic_dissection and thoracic aortic_aneurysm

In [9]:
dissection = pd.read_csv('data/nlp_dissection_ct.csv', index_col=0)

In [10]:
Counter(label.index.duplicated())
Counter(dissection.index.duplicated())

Counter({False: 3352})

In [11]:
taa_dissection = pd.concat([label, dissection])

In [12]:
taa_dissection = taa_dissection.reset_index().drop_duplicates('ACCNO').set_index('ACCNO')

In [13]:
Counter(taa_dissection['label'])

Counter({1: 1809, 0: 2083})

In [14]:
taa_dissection.to_csv('data/nlp_joined_dissection_aneurysm.csv')

In [15]:
stats = Counter(taa_dissection['label'])
print('正樣本:{}, 負樣本:{}'.format(stats[1.0], stats[0.0]))

正樣本:1809, 負樣本:2083
