In [1]:
import xmltodict, json
import os
import pandas as pd

In [2]:
pd.options.display.max_columns = None

In [3]:
path = '/home/wt/Downloads/n2c2 2012/'
training_data_path = path + '2012-07-15.original-annotation.release/'
test_data_path = path + '2012-08-23.test-data.groundtruth/ground_truth/merged_xml/'

In [4]:
def data_loader(data_path):
    data = {}
    for filename in os.listdir(data_path):
        if filename.endswith(".xml"): 
            f = (os.path.join(data_path, filename))
#             print(f)
            fb = open(f, "rb").read().decode(encoding="utf-8")
#     invalid character '&' https://github.com/martinblech/xmltodict/issues/277
            fb = fb.replace('&', '&amp;')
            dic = xmltodict.parse(fb, attr_prefix='')
#     restore orginal character "&"
            dic['ClinicalNarrativeTemporalAnnotation']['TEXT'] = dic['ClinicalNarrativeTemporalAnnotation']['TEXT'].replace('&amp;', '&')
            data[filename] = (dic)
    return data

In [5]:
train_data = data_loader(training_data_path)
test_data = data_loader(test_data_path)

In [6]:
len(train_data)

190

In [7]:
len(test_data)

120

In [8]:
len(train_data) + len(test_data)

310

In [36]:
def pre_process(data):
    text_data = {}
    meta_data = []
    for doc_id in data.keys():
#         print(doc_id)
        text = data[doc_id]['ClinicalNarrativeTemporalAnnotation']['TEXT']
        text_data[doc_id] = text
        events = pd.DataFrame(data[doc_id]['ClinicalNarrativeTemporalAnnotation']['TAGS']['EVENT'])
        # adm_dis = pd.DataFrame(data[doc_id]['ClinicalNarrativeTemporalAnnotation']['TAGS']['SECTIME'])
        times = pd.DataFrame(data[doc_id]['ClinicalNarrativeTemporalAnnotation']['TAGS']['TIMEX3'])
        links = pd.DataFrame(data[doc_id]['ClinicalNarrativeTemporalAnnotation']['TAGS']['TLINK'])

#         Focus on "sec_time_rel" relations
        st_links = links[links['id'].str.lower().str.contains('sectime')]
#         print(st_links.shape)
        events.columns = [c+"_EVENT" for c in events.columns]
#         adm_dis.columns = [c+"_SECTIME" for c in adm_dis.columns]
        st_links.columns = [c+"_TLINK" for c in st_links.columns]
        times.columns = [c+"_TIMEX3" for c in times.columns]

#         Join sections together
        link_event = st_links.merge(events, left_on='fromID_TLINK', right_on='id_EVENT', how='left')
        link_event_sem = link_event.merge(times, left_on='toID_TLINK', right_on='id_TIMEX3', how='left')
        # link_event_sem['label'] = link_event_sem['type_TLINK']+'_'+link_event_sem['type_SECTIME']
        link_event_sem['doc_id'] = doc_id 
        # print(link_event_sem.shape)
        try:
            discharge_date = link_event_sem['val_TIMEX3'].max()
        except Exception as e:
#             Missing discharge data 
            print('Unexpected formmat ', doc_id)
            link_event_sem['adm_dis_TIMEX3'] = 'ADMISSION'
            link_event_sem.loc[link_event_sem['toID_TLINK']=='Discharge', 'adm_dis_TIMEX3'] = 'DISCHARGE'
        else:
            link_event_sem['adm_dis_TIMEX3'] = 'ADMISSION'
            link_event_sem.loc[link_event_sem['val_TIMEX3']==discharge_date, 'adm_dis_TIMEX3'] = 'DISCHARGE'
            
        meta_data.append(link_event_sem)
    labels = pd.concat(meta_data)     
    return text_data, labels

In [14]:
train_data_text, train_data_labels = pre_process(train_data)

Unexpected formmat  492.xml
Unexpected formmat  666.xml
Unexpected formmat  47.xml


In [15]:
test_data_text, test_data_labels = pre_process(test_data)

Unexpected formmat  31.xml
Unexpected formmat  276.xml
Unexpected formmat  781.xml


In [17]:
def extract_sentence(doc_id, text_data, start_EVENT, end_EVENT):
    text = text_data[doc_id]
    sent_start = text[0: int(start_EVENT)].rfind('\n')
    sent_end = text[int(end_EVENT)-1::].find('\n')
    sentence = text[sent_start+1:int(end_EVENT)-1 + sent_end]
    return sentence

In [18]:
# train_data_text, 

In [19]:
train_data_labels['sentence'] = train_data_labels.apply(lambda x: extract_sentence(x.doc_id, train_data_text,
                                                                                x.start_EVENT, x.end_EVENT), axis=1)

In [20]:
train_data_labels.to_csv('data/train_sec_time_rel_sentence.csv', index=None)

In [21]:
test_data_labels['sentence'] = test_data_labels.apply(lambda x: extract_sentence(x.doc_id, test_data_text,
                                                                                x.start_EVENT, x.end_EVENT), axis=1)

In [22]:
test_data_labels.to_csv('data/test_sec_time_rel_sentence.csv', index=None)

In [38]:
train_data_labels.head()

Unnamed: 0,id_TLINK,fromID_TLINK,fromText_TLINK,toID_TLINK,toText_TLINK,type_TLINK,id_EVENT,start_EVENT,end_EVENT,text_EVENT,modality_EVENT,polarity_EVENT,type_EVENT,id_TIMEX3,start_TIMEX3,end_TIMEX3,text_TIMEX3,type_TIMEX3,val_TIMEX3,mod_TIMEX3,doc_id,adm_dis_TIMEX3,sentence
0,Sectime0,E14,cardiac catheterization,T2,2017-07-02,BEFORE,E14,1004,1027,cardiac catheterization,FACTUAL,POS,TEST,T2,46,56,2017-07-02,DATE,2017-07-02,,212.xml,DISCHARGE,The patient was found at cardiac catheterizati...
1,Sectime1,E15,mild diffuse instent restenosis in the mid stent,T2,2017-07-02,BEFORE,E15,1036,1084,mild diffuse instent restenosis in the mid stent,FACTUAL,POS,PROBLEM,T2,46,56,2017-07-02,DATE,2017-07-02,,212.xml,DISCHARGE,The patient was found at cardiac catheterizati...
2,Sectime2,E46,hemodynamically normal,T2,2017-07-02,BEFORE,E46,1097,1119,hemodynamically normal,FACTUAL,POS,OCCURRENCE,T2,46,56,2017-07-02,DATE,2017-07-02,,212.xml,DISCHARGE,The patient was found at cardiac catheterizati...
3,Sectime3,E16,flow-limiting stenoses,T2,2017-07-02,BEFORE,E16,1169,1191,flow-limiting stenoses,FACTUAL,NEG,PROBLEM,T2,46,56,2017-07-02,DATE,2017-07-02,,212.xml,DISCHARGE,The patient was found at cardiac catheterizati...
4,Sectime4,E17,her cardiac medications,T2,2017-07-02,BEFORE_OVERLAP,E17,1228,1251,her cardiac medications,FACTUAL,POS,TREATMENT,T2,46,56,2017-07-02,DATE,2017-07-02,,212.xml,DISCHARGE,The patient was then continued on her cardiac ...


In [None]:
type_TLINK, adm_dis_TIMEX3, sentence

In [41]:
train_data_labels['adm_dis_TIMEX3'].value_counts()

DISCHARGE    10350
ADMISSION     5569
Name: adm_dis_TIMEX3, dtype: int64

In [34]:
pd.DataFrame(train_data_text.items(), columns=['doc_id', 'text']).to_csv('data/train_text.csv', index=None)

In [35]:
pd.DataFrame(test_data_text.items(), columns=['doc_id', 'text']).to_csv('data/test_text.csv', index=None)