# Stroke Key-word extraction

In [149]:
import os
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
raw_dir = '../../data/raw'
readings = pd.read_csv(os.path.join(raw_dir, 'brain_mr_ct_result.csv'), header=None)

In [7]:
readings[0]

0        \n\n** Final Report by Neuro Section **\n\nThe...
1        \n\n** Final Report by Neuro Section **\n\n\n\...
2        \n\n1. No evidence of intracranial hemorrhage....
3        \n\ns/p reop for recurred mass (2016-09-07)\n\...
4        \n\n1. No evidence of definite recanalization ...
                               ...                        
60849    \n\nMRI>\n\nNo diffusion restrictive lesions\n...
60850    \n\nDiffuse infiltrative T2 high SI mass like ...
60851    \n\nDiffuse infiltrative T2 high SI mass like ...
60852    \n\nr/o Moyamoya disease\n\n[Finding]\n\n2016-...
60853    \n\nMRA shows stenosis of the both distal ICA ...
Name: 0, Length: 60854, dtype: object

In [8]:
keywords = [
    'acute infarction',
    'hypoxic ischemic brain injury',
    'embolic infarction',
]

In [17]:
keyword2key = {
    'acute infarction': 'AI',
    'hypoxic ischemic brain injury': 'HIBI',
    'embolic infarction': 'EI',
}

In [9]:
readings.loc[:, 0] = readings.loc[:, 0].apply(lambda x: x.lower())

In [10]:
def present_keywords(doc):
    contains = []
    for kw in keywords:
        if kw in doc:
            contains.append(kw)
    return contains

In [11]:
readings['present_kws'] = readings.loc[:, 0].apply(present_keywords)

In [13]:
readings['has_kws'] = readings['present_kws'].apply(lambda x: True if len(x) > 0 else False)

In [16]:
readings['AI'] = readings['present_kws'].apply(lambda x: True if keywords[0] in x else False)
readings['HIBI'] = readings['present_kws'].apply(lambda x: True if keywords[1] in x else False)
readings['EI'] = readings['present_kws'].apply(lambda x: True if keywords[2] in x else False)

In [15]:
readings['has_kws'].sum()

2613

In [18]:
for kw, k in keyword2key.items():
    print(k, readings[k].sum())

AI 2497
HIBI 1
EI 178


In [21]:
test = readings.loc[:10, 0].copy()

In [24]:
sample = test.apply(lambda x: x.split('\n'))[4]

In [125]:
sample[2]

' '

In [92]:
string = 'no evidence of embolized'
kw = 'embolized'
out_middle = re.search(r'(?<= no) .*{}'.format(kw), string)
out_start = re.search(r'(?<=^no) .*{}'.format(kw), string)

In [93]:
if out_middle is not None:
    print("middle:", out_middle.group(0))
elif out_start is not None:
    print("start:", out_strat.group(0))

start:  evidence of embolized


In [103]:
readings.iloc[86, 0]

'\n\nbrain ct\n\n \n\nci: acute infarction.\n\n\n\n1. s/p evd insertion, rt.\n\n      - no significant post op unusual finding.\n\n      - post op changes: pneumocephalus.\n\n\n\n2. low attenuations at rt. occipital lobe and bilateral cerebellum.\n\n      : acute infarction, suggested.\n\n\n\n3. no demonstrable intracranial hemorrhage.\n\n\n\n[finding]\n\n2016-11-21   \n\n'

In [130]:
def contains_keyword_at_line_number(kw):
    def func(doc):
        doc = doc.split('\n')
        kw = keywords[0]
        for i, line in enumerate(doc):
            out_start = re.search(r'(?<!^no) .*{}'.format(kw), line)
            not_negated_at_start = out_start is not None

            out_middle = re.search(r'(?<!no) .*{}'.format(kw), line)
            not_negated_mid_sentence = out_middle is not None
            
            if not_negated_at_start and not_negated_mid_sentence:
                return True, i

        return False, None
    return func

In [131]:
kw = keywords[0]
readings[keyword2key[kw]+"_at_i"] = readings[0].apply(contains_keyword_at_line_number(kw))

In [135]:
readings['AI_processed'] = readings['AI_at_i'].apply(lambda x: x[0])
readings['AI_processed_line'] = readings['AI_at_i'].apply(lambda x: x[1])

In [142]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

In [144]:
for r, row in readings[readings['AI'] & ~readings['AI_processed']].head().iterrows():
    print(row[0])
    print("="*100)



***final report from the "neuro" section***

final report is grossly in agreement with the preliminary report. 



acute infarction, lt mca inferior division territory

 - flair signal change (+)

 - intraarterial enhancement in the lt. mca inferior division(+)

 - no definite evidence of hemorrhagic transformation

 - severe stenosis of the lt proximal ica, just above bifurcation level

 - additional multiple diffusion high si foci in the lt cerebral hemisphere

 --> r/o embolic infarction



old infarct in the cerebellum



no change of tiny calcification in ant falx

 --> r/o calcified meningioma > r/o small granuloma



mra>

severe stenosis of the lt proximal ica, just above bifurcation level

moderate stenosis of the rt proximal ica 

hypoplastic rt va

occlusion state of the lt. mca inferior division

[finding]

2016-11-25   * 응급판독입니다. 정식 판독 시 판독내용이 바뀔 수 있습니다 *



검사이유: dysarthria

compared to 2002-09-17 ct



acute infarction, lt mca inferior division territory

 - flair sign

In [123]:
readings.iloc[86, 0].split('\n')

['',
 '',
 'brain ct',
 '',
 ' ',
 '',
 'ci: acute infarction.',
 '',
 '',
 '',
 '1. s/p evd insertion, rt.',
 '',
 '      - no significant post op unusual finding.',
 '',
 '      - post op changes: pneumocephalus.',
 '',
 '',
 '',
 '2. low attenuations at rt. occipital lobe and bilateral cerebellum.',
 '',
 '      : acute infarction, suggested.',
 '',
 '',
 '',
 '3. no demonstrable intracranial hemorrhage.',
 '',
 '',
 '',
 '[finding]',
 '',
 '2016-11-21   ',
 '',
 '']

In [124]:
has_acute_infarction(readings.iloc[1003, 0])

False

In [113]:
readings.iloc[1003, 0].split('\n')

['',
 '',
 '***final report from the "neuro" section***',
 '',
 'final report is partly in agreement with the preliminary report. ',
 '',
 '',
 '',
 'subdural fluid collection in the rt. f-t-p convexity',
 '',
 ' - with hyperacute subtle hemorrhage ',
 '',
 '--> r/o postop chage but inflammatory condition cannot be excluded',
 '',
 'rec) clinical correlation',
 '',
 'no acute infarction on dwi/adc map.',
 '',
 '',
 '',
 'thin bulging contour loculated small fluid collection around the craniotomy site',
 '',
 '--> r/o small subtle pseudomeningocele',
 '',
 '',
 '',
 'localized t2 high si in rt. parietal lobe ',
 '',
 '--> r/o reactive change d/t inflammatory condition',
 '',
 'diffuse meningeal enhancement in the rt. f-t-p convexity',
 '',
 '--> probable postop reactive change',
 '',
 '[finding]',
 '',
 '2016-12-12   *응급판독입니다. 정식 판독 시 판독내용이 바뀔 수 있습니다*',
 '',
 '',
 '',
 '검사이유: wound discharge',
 '',
 'c.i: s/p op. for the rt. parietal convexity mass.',
 '',
 '',
 '',
 '2016-12-12 ct와 mr을

In [268]:
sample = 'mild degree svd with a few old lacunar infarct in cerebral wm'

In [269]:
kw = 'infarc'
out_start = re.search(r'(?<!no ).*{}'.format(kw), sample)
out_old = re.search(r'(?<!old ).*{}'.format(kw), sample)
out_middle = re.search(r'(?<! no ).*{}'.format(kw), sample)

In [270]:
if out_old is not None:
    print("old:", out_middle.group(0))
if out_middle is not None:
    print("middle:", out_middle.group(0))
if out_start is not None:
    print("start:", out_start.group(0))

old: mild degree svd with a few old lacunar infarc
middle: mild degree svd with a few old lacunar infarc
start: mild degree svd with a few old lacunar infarc


In [101]:
readings.index[readings['AI']]

Int64Index([    6,    86,    87,   113,   146,   148,   265,   297,   449,
              455,
            ...
            58118, 58484, 59263, 59718, 60214, 60610, 60758, 60759, 60803,
            60852],
           dtype='int64', length=2497)

In [97]:
readings.iloc[:100, 0].apply(has_acute_infarction).sum()

0

In [111]:
readings.iloc[:100, 0][readings.iloc[:100, 0].apply(has_acute_infarction)]

6     \n\nped section report)\n\n\n\nbilateral dista...
86    \n\nbrain ct\n\n \n\nci: acute infarction.\n\n...
87    \n\n1. compared to 2016-11-18 outside dwi,\n\n...
Name: 0, dtype: object

# With real data

In [392]:
df = pd.read_csv(os.path.join(raw_dir, "lab.csv"), parse_dates=['event_time'])

In [364]:
df.head()

Unnamed: 0,id,case_id,event_type,event_name,event_time,event_result
0,244097,1,LAB,간이혈당검사[POCT],2099-12-30 06:23:00,334.0
1,244104,1,LAB,간이혈당검사[POCT],2099-12-30 06:23:00,110.0
2,244107,1,LAB,간이혈당검사[POCT],2099-12-30 06:23:00,180.0
3,244108,1,LAB,Sodium (serum)(검사24시간가능),2099-12-30 06:23:00,138.0
4,244109,1,LAB,Potassium (serum)(검사24시간가능),2099-12-30 06:23:00,3.1


In [407]:
cutoff_start = pd.datetime(year=2100, month=1, day=1, hour=0, minute=0,)
cutoff_end = pd.datetime(year=2100, month=4, day=1, hour=0, minute=0,)

is_postop = (df.event_time > cutoff_start) & (df.event_time <= cutoff_end) 
is_img = df_postop.event_type == 'IMG'
is_stroke_related = df.event_name.apply(lambda x: 'CT' in x or 'MR' in x)

df_valid = df[is_postop & is_img & is_stroke_related]
del df

In [205]:
df_valid.head()

Unnamed: 0,id,case_id,event_type,event_name,event_time,event_result
4123,945,17,IMG,MRI Brain + Brain MRA + Carotid MRA (contrast),2100-01-09 14:24:00,Mild diffuse brain atrophy\nIncreased both lateral and 3rd ventricle size without visible obstructive lesion\nMild degree SVD with a few old lacunar infarct in cerebral WM\nOld ischemic change in Rt prefrontal gyrus\nMRA> \nRt proximal ICA stenosis\nLt proximal
14805,888,90,IMG,MRI Brain + MRA Acute Stroke (contrast),2100-01-04 04:29:00,"S/P Craniotomy and SDH removal, Lt. cerebral convexity.\n \nNo deifnite evidence of diffusion restrictive lesion in the brain.\nMinimal amount of small late subacute extraaxial hemorrhage (T1 and T2 high SI) at Lt occipital convexity.\nA few microbleeds in th"
25097,494,146,IMG,CT Angio + 3D Neck + Brain(contrast),2100-01-01 00:12:00,BRAIN CT AND CTA\n1. No significant focal parenchymal lesion in the brain.\n No significant abnormal finding at CSF space. \n2. No significant focal stenosis or aneurysmal dilatation. \n3. Emphysema and bullae at bilateral upper lungs.
33585,864,216,IMG,MRA+MRI(Brain),2100-01-03 16:17:00,"Outside hospital MR on .\nA small acute infarction in Rt caudate body, and adjacent CR.\n- mild T2 high SI.\n- no hemorrhagic changes.\nSeveral old lacunar infarctions in both BG and Rt thalamus.\nNo abnormal contrast enhancement in brain. \nLt maxillary sinusi"
88683,519,535,IMG,MRI Brain MRA (Aneurysm) (noncontrast),2100-01-07 02:14:00,BRAIN MRA\n1. Approximately 12.5mm sized thrombosed aneurysm at Rt. V4.\n2. Approximately 4.64mm sized fusiform aneurysm at Lt. V4.\n3. Approximately 5.10mm sized extradural unruptured aneurysm at Rt. cavernous ICA.\n4. Approximately 4.24mm sized extradural u


In [408]:
df_valid['event_result'] = df_valid['event_result'].apply(lambda x: x.lower())

# Parse results

In [248]:
s = """ischemi
infarc
high-intensity lesions on DWI-low intensity lesion on ADC
diffusion restriction lesion 
stroke
T2 high signal intensity 
low attenuation-GRE
hemorrhagic transformation
ICH
hemorrhage"""

s = s.lower().split('\n')

In [249]:
s

['ischemi',
 'hemorrhage',
 'infarc',
 'high-intensity lesions on dwi-low intensity lesion on adc',
 'diffusion restriction lesion ',
 'stroke',
 't2 high signal intensity ',
 'low attenuation-gre',
 'hemorrhagic transformation',
 'ich',
 'hemorrhage']

In [304]:
negating_words = ['no', 'old', 'known', 'previous']

In [329]:
keywords = s

def contains_which_keyword(doc):
    has = []
    for kw in keywords:
        if kw in doc:
            if kw == 'stroke':
                if 'brain mri acute stroke' not in doc:
                    has.append(kw)
            else:
                has.append(kw)
    if len(has) == 0:
        return None
    else:
        return has
    
def contains_which_keyword_not_negated(doc):
    doc = doc.split('\n')
    for i, line in enumerate(doc):
        for kw in keywords:
            if kw in line:
                has_any_negated_kw = False
                for nw in negating_words:
                    match = re.search(
                        r'(?<={negation} ).*{keyword}'.format(negation=nw, keyword=kw), 
                        line
                    )
                    has_any_negated_kw |= match is not None

                if not has_any_negated_kw and 'brain mri acute stroke' not in line:
                    return True
    return False
        

def contains_keyword_at_line_number(kw):
    def func(doc):
        doc = doc.split('\n')
        kw = keywords[0]
        for i, line in enumerate(doc):
            if kw in line:
                out_start = re.search(r'(?<!^no) .*{}'.format(kw), line)
                not_negated_at_start = out_start is not None

                out_middle = re.search(r'(?<! no) .*{}'.format(kw), line)
                not_negated_mid_sentence = out_middle is not None

                if not_negated_at_start and not_negated_mid_sentence:
                    return True, i

        return False, None
    return func

In [323]:
nw='no'
kw='infarc'
print(re.search(r'(?<={negation} ).*{keyword}'.format(negation=nw, keyword=kw), 'no evidence of large territorial infarction or hemorrhage').group())
# print(re.search(r'(?<='' ).*{keyword}'.format(negation=nw, keyword=kw), 'no evidence of large territorial infarction or hemorrhage').group())

evidence of large territorial infarc


In [327]:
line = 'no evidence of large territorial infarction or hemorrhage'
for kw in keywords:
    has_any_negated_kw = False
    for nw in negating_words:
        match = re.search(
            r'(?<={negation} ).*{keyword}'.format(negation=nw, keyword=kw), 
            line
        )
        has_any_negated_kw |= match is not None

In [328]:
has_any_negated_kw

True

In [313]:
print(df_valid.event_result.iloc[0])
contains_which_keyword_not_negated(df_valid.event_result.iloc[0])

mild diffuse brain atrophy
increased both lateral and 3rd ventricle size without visible obstructive lesion
mild degree svd with a few old lacunar infarct in cerebral wm
old ischemic change in rt prefrontal gyrus
mra> 
rt proximal ica stenosis
lt proximal


True

In [236]:
del df

In [409]:
df_valid['contains_which_kw'] = df_valid['event_result'].apply(contains_which_keyword)
df_valid['contains_kw_not_negated'] = df_valid['event_result'].apply(contains_which_keyword_not_negated)

In [368]:
df_valid[df_valid.case_id == 1515]

Unnamed: 0,id,case_id,event_type,event_name,event_time,event_result,contains_which_kw,contains_kw_not_negated
265850,669,1515,IMG,MRI Brain - Hyperacute Stroke (noncontrast),2100-01-07 10:46:00,minimal ivhs (unknown stage) in bilateral lateral ventricles and lt sided 4th ventricle.\nsuspicious minimal sah in rt medial frontal convexity and posterior fossa (along cerebellar follia).\nno visible acute infarction in brain.\nmild t2 hyperintensities in,[infarc],False
265852,674,1515,IMG,CT Routine Brain (Pre contrast),2100-01-08 10:46:00,"no evidence of large territorial infarction or hemorrhage\ndiffuse mild brain atrophy\nmild periventricular hypodensities in bilateral cerebral wm\n(ct에서는 resolution이 제한되어 mri에서 보이는 minimal ivh in bilateral lateral ventricle, 4th ventricle, suspicious minim","[hemorrhage, infarc, hemorrhage]",False
265857,702,1515,IMG,MRI Brain FU Hyperacute Stroke (contrast),2100-01-16 10:46:00,no diffusion restriction lesion\ndisappeared small amount ivh in both lateral ventricle\nno significant change of minimal sah in rt medial frontal convexity and posterior fossa\nmra>\nimproved spastic appearance of intracranial vessels\nno steno-occlusive lesi,,False


In [410]:
df_valid['contains_kw'] = df_valid['contains_which_kw'].apply(lambda x: x is not None)

In [411]:
df_valid.event_time.max()

Timestamp('2100-03-30 16:19:00')

In [417]:
df_valid.to_csv("../../data/interim/filtered_readings.csv", index=False)

In [413]:
df_valid['contains_which_kw'].apply(lambda x: x is not None).sum()

2153

In [414]:
df_valid['contains_kw'].sum(), len(df_valid[df_valid['contains_kw']].case_id.unique())

(2153, 1058)

In [415]:
df_valid['contains_kw_not_negated'].sum()

1175

In [416]:
len(df_valid[df_valid['contains_kw_not_negated']].case_id.unique())

523

In [375]:
d = df_valid[['case_id', 'contains_kw_not_negated']].groupby('case_id').sum().reset_index().rename(columns={'case_id': 'CaseID'})

In [353]:
clinical = pd.read_csv("../../data/raw/clinical.csv")

In [359]:
case_ids = clinical.CaseID.dropna()

In [377]:
case_ids = case_ids.astype(int)
labels = case_ids.to_frame()

In [379]:
labels = labels.merge(d, how='outer', on='CaseID').rename(columns={'contains_kw_not_negated': 'label'})

In [381]:
labels = labels.fillna(0)

In [386]:
labels.label = labels.label > 0

In [389]:
labels.label = labels.label.astype(int)

In [390]:
labels.head()

Unnamed: 0,CaseID,label
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [391]:
labels.to_csv("../../data/processed/labels.csv", index=False)

In [355]:
clinical.merge(d, how='outer', on='CaseID').head(20)

Unnamed: 0,CaseID,Case Start,Case End,Height,Weight,BMI,AdmDate,DisDate,LOS,PostOpLOS,ICULOS,InHospDeath,ASA,Emergency,Department,Diagnosis,Operation,Surgical position,Anesthesia type,Surgery duration,Anesthesia duration,Hypertension,Diabetes,CVA,Asthma,COPD,LiverDis,KidneyDis,Tb,PreopECG,PreopPFT,PreopChest,PreopHb,PreopPlt,PreopAlb,PreopBUN,PreopCr,PreopNa,PreopK,PreopPT,PreopPTT,PreopGlu,PreopPH,PreopHCO3,PreopBE,PreopPaO2,PreopPaCO2,PreopSpO2,PreopGPT,PreopGOT,Estimated blood loss (mL),Urine output (mL),RBC (unit),FFP (unit),Crystalloid (mL),Colloid (mL),Propofol bolus (mg),Midazolam bolus (mg),Fentanyl bolus (mcg),Rocuronium (mg),Vecuronium (mg),Ephedrine (mg),Phenylephrine (mg),Epinephrine (mcg),Calcium chloride (mg),contains_kw_not_negated
0,1.0,2100-01-01 00:00,2100-01-01 12:12,160.2,65.5,25.522092,2099-12-29 06:23:00,2100-01-08 06:23:00,10.0,8.0,,,2.0,N,GS,Rectal cancer,Low anterior resection,Lithotomy,General,3.033333,3.16667,1.0,1.0,,,,,,,,Normal,No significant interval change since the last study,14.1,189.0,4.3,10.0,0.82,141.0,3.1,94.0,33.2,134.0,,,,,,,16.0,18.0,,300.0,,,350.0,,120.0,,100.0,70.0,,10.0,,,,
1,2.0,2100-01-01 00:00,2100-01-01 13:22,167.3,56.0,20.007653,2099-12-29 10:35:00,2100-01-18 10:35:00,20.0,18.0,,,2.0,N,GS,Advanced gastric cancer,"Subtotal gastrectomy (Billroth II) ( with standard lymph node dissection ) : Diagnostic laparoscopy,, [ONO trial]",Supine,General,3.583333,4.43333,,,,,,,,,,Normal,"Fibrotic lesion in RULF \n\nOtherwise, negative",10.2,251.0,3.8,14.0,0.86,143.0,4.7,110.0,31.9,88.0,,,,,,,15.0,18.0,50.0,700.0,,,800.0,,150.0,,,100.0,,20.0,,,,
2,3.0,2100-01-01 00:00,2100-01-01 10:14,169.1,69.7,24.375053,2099-12-29 11:17:00,2100-01-01 11:17:00,3.0,1.0,,,1.0,N,GS,Symptomatic gallbladder stone without obstruction,Cholecystectomy,Reverse Trendelenburg,General,1.25,1.33333,,,,,,,,,,Normal,"Grossly, no active lung lesion.Normal cardiovascular configuration",14.2,373.0,4.2,14.0,1.18,144.0,4.9,103.0,30.3,87.0,,,,,,,34.0,17.0,,,,,200.0,,,,,50.0,,,,,,
3,4.0,2100-01-01 00:00,2100-01-01 14:50,160.6,53.7,20.820119,2099-12-29 16:08:00,2100-01-07 16:08:00,9.0,7.0,,,2.0,N,GS,Advanced gastric cancer,TLDG (Totally laparoscopic distal gastrectomy) ( with standard lymph node dissection ) : LDG (open conversion °¡´É),Reverse Trendelenburg,General,4.333333,5.83333,1.0,,,,,,,,,Normal,Grossly no active lung lesion\n\n,14.4,275.0,4.1,10.0,0.96,141.0,4.2,103.0,34.5,108.0,,,,,,,18.0,23.0,,270.0,,,2700.0,,80.0,,100.0,100.0,,50.0,,,,
4,5.0,2100-01-01 00:00,2100-01-01 14:59,171.0,69.5,23.767997,2099-12-31 05:14:00,2100-02-13 05:14:00,44.0,43.0,10.0,,3.0,Y,GS,Aortic aneurysm,"Resection of aneurysm, abdominal aorta + bilateral iliac artery : Aortobiliac bypass, thrombectomyµ¿½Ã ½ÃÇà",Prone,General,5.333333,6.5,1.0,,,,,,,,"QTc:448, QRSd:98, I. Conclusion:Normal sinus rhythm Left anterior fascicular block Left ventricular hypertrophy with repolarization abnormality Anteroseptal infarct , age undetermined Abnormal ECG, T:100, PR:176, QRS:-50, P:55, RATE:82, QT:384,",Normal,,15.3,67.0,2.6,50.0,4.43,146.0,4.4,73.0,36.5,126.0,,,,,,,77.0,765.0,2600.0,1490.0,8.0,8.0,7100.0,,,,,160.0,,10.0,900.0,,2100.0,
5,6.0,2100-01-01 00:00,2100-01-01 10:26,150.0,54.6,24.266667,2099-12-28 12:31:00,2100-01-02 12:31:00,5.0,2.0,,,2.0,Y,GS,Gallbladder polyp,Cholecystectomy,Supine,General,0.416667,1.33333,,,,,,,,,,Normal,No active lung lesion,12.3,144.0,3.9,14.0,1.07,141.0,3.4,104.0,29.1,105.0,,,,,,,17.0,19.0,,,,,100.0,,70.0,,100.0,40.0,,5.0,,,,
6,7.0,2100-01-01 00:00,2100-01-01 13:21,167.5,63.8,22.740031,2099-12-30 05:08:00,2100-01-08 05:08:00,9.0,8.0,3.0,,2.0,N,TS,Nontuberculous mycobacterial infection,VATS RML lobectomy & RUL wedge resection or segmentectomy,LLD,General,3.833333,3.98333,,,,,,,,1.0,,Normal,No active lesion in the lung.,,,,,,,,,,,,,,,,,,,100.0,125.0,,,700.0,,,,,120.0,,,,,,
7,8.0,2100-01-01 00:00,2100-01-01 10:39,156.7,67.25,27.38762,2099-12-29 10:58:00,2100-01-03 10:58:00,5.0,3.0,,,2.0,N,GS,Malignant neoplasm of breast left,Breast-conserving surgery,Supine,General,1.116667,1.66667,,,,,,,,,,Normal,No active lung lesion.,12.1,186.0,3.7,11.0,0.69,142.0,4.5,92.0,31.3,101.0,,,,,,,10.0,16.0,100.0,,,,300.0,,,,,70.0,,,,,,
8,9.0,2100-01-01 00:00,2100-01-01 10:14,157.9,50.9,20.415195,2099-12-30 10:16:00,2100-01-01 10:16:00,2.0,1.0,,,1.0,N,GS,Gallbladder stone,Cholecystectomy,Supine,General,1.216667,1.38333,,,,,,,,,,Normal,,13.7,141.0,4.5,8.0,0.58,,,96.0,,91.0,,,,,,,11.0,14.0,,,,,300.0,,,,,50.0,,,,,,
9,10.0,2100-01-01 00:00,2100-01-01 14:49,162.0,66.7,25.415333,2099-12-29 10:42:00,2100-02-11 10:42:00,44.0,42.0,,,3.0,N,GS,Early gastric cancer,TLDG (Totally laparoscopic distal gastrectomy) ( with standard lymph node dissection ),Supine,General,5.0,6.33333,1.0,1.0,,,,,,,,Normal,No active lung lesion,15.8,204.0,3.9,26.0,1.2,139.0,5.4,112.0,29.8,191.0,,,,,,,15.0,18.0,200.0,200.0,,,1100.0,,90.0,,,110.0,,20.0,500.0,,600.0,


오름차순으로
case_id, 검사일, 

## 느슨한 기준
* 키워드가 하나라도 들어가있는 느슨한 기준으로 1인 사람들을 위로올린다.
* 그런데 거기 해당하는 case id는 다 보자. 0이어도. (위로 올린다.)
* 그 묶음 내에서 case id와 검사일자순으로 ascending sort


## 뒷부분: 나머지사람들
* case id 검사일로 소팅

In [433]:
df_valid = df_valid.sort_values(['case_id', 'event_time'])
head = df_valid[df_valid.contains_kw]
tail = df_valid[~df_valid.contains_kw]

In [434]:
head_case_ids = head.case_id.unique()
bring_to_head = tail.case_id.apply(lambda x: x in head_case_ids)

In [435]:
len(head.case_id.unique()), len(pd.concat([head, tail[bring_to_head]], axis=0).case_id.unique())

(1058, 1058)

In [438]:
head = pd.concat([head, tail[bring_to_head]]).sort_values(['case_id', 'event_time'])

In [439]:
tail = tail[~bring_to_head]

In [440]:
tail.shape

(1636, 9)

In [441]:
head.shape

(3242, 9)

In [442]:
df_valid.shape

(4878, 9)

In [443]:
tail.shape[0] + head.shape[0]

4878

In [445]:
pd.concat([head, tail]).to_csv("../../data/interim/mri_readings_sorted.csv", index=False)

In [446]:
head.columns

Index(['id', 'case_id', 'event_type', 'event_name', 'event_time', 'event_result', 'contains_which_kw', 'contains_kw_not_negated', 'contains_kw'], dtype='object')

# 코드 정리

In [None]:
keywords = """ischemi
infarc
high-intensity lesions on DWI-low intensity lesion on ADC
diffusion restriction lesion 
stroke
T2 high signal intensity 
low attenuation-GRE
hemorrhagic transformation
ICH
hemorrhage""".lower().split('\n')

negating_words = ['no', 'old', 'known', 'previous']

In [None]:
df = pd.read_csv(os.path.join(raw_dir, "lab.csv"), parse_dates=['event_time'])

# Time range
cutoff_start = pd.datetime(year=2100, month=1, day=1, hour=0, minute=0,)
cutoff_end = pd.datetime(year=2100, month=4, day=1, hour=0, minute=0,)

# Conditions
is_postop = (df.event_time > cutoff_start) & (df.event_time <= cutoff_end) 
is_img = df_postop.event_type == 'IMG'
is_stroke_related = df.event_name.apply(lambda x: 'CT' in x or 'MR' in x)

# Filter data
df_valid = df[is_postop & is_img & is_stroke_related]

# Delete original df (too big)
del df

In [None]:
df_valid.loc[:, 'event_result'] = df_valid['event_result'].apply(lambda x: x.lower())

In [None]:
def contains_which_keyword(doc):
    has = []
    for kw in keywords:
        if kw in doc:
            if kw == 'stroke':
                if 'brain mri acute stroke' not in doc:
                    has.append(kw)
            else:
                has.append(kw)
    if len(has) == 0:
        return None
    else:
        return has

def contains_which_keyword_not_negated(doc):
    doc = doc.split('\n')
    for i, line in enumerate(doc):
        for kw in keywords:
            if kw in line:
                has_any_negated_kw = False
                for nw in negating_words:
                    match = re.search(
                        r'(?<={negation} ).*{keyword}'.format(negation=nw, keyword=kw), 
                        line
                    )
                    has_any_negated_kw |= match is not None

                if not has_any_negated_kw and 'brain mri acute stroke' not in line:
                    return True
    return False

In [None]:
df_valid.loc[:, 'contains_which_kw'] = df_valid['event_result'].apply(contains_which_keyword)
df_valid.loc[:, 'contains_kw_not_negated'] = df_valid['event_result'].apply(contains_which_keyword_not_negated)
df_valid['contains_kw'] = df_valid['contains_which_kw'].apply(lambda x: x is not None)

## Save readings with filters

In [None]:
df_valid.to_csv("../../data/interim/filtered_readings.csv", index=False)

## Convert to binary label

Using string

In [None]:
labels = df_valid[['case_id', 'contains_kw_not_negated']].groupby('case_id').sum().reset_index()
labels = labels.rename({'case_id': 'CaseID', 'contains_kw_not_negated': 'label'})
labels = (labels['label'] > 0).astype(int)