In [1]:
import re
import numpy as np
import pandas as pd

import nltk
from nltk import *

In [2]:
#read out the CXR report dataset csv file

df_raw = pd.read_csv('data//CXR report DB.csv').sample(frac=1)


In [3]:
df_raw.shape

(65499, 17)

In [9]:
def text_clean_up(text, remove_stopwords = False, stem_words = False, lemmatization= True):
    
    ## Convert words to lower case

    text = str(text).lower()
    
    ## Remove puncuation and replace common abbreviations
    text = re.sub('s/p','status post',text)
    text = re.sub('r/o','rule out',text)
    text = re.sub('r/i','rule in',text)
    text = re.sub('f/u','follow up',text)
    text = re.sub('-','',text)
    text = re.sub(r'[^a-zA-Z0-9_]',' ',text)
    text = text.split()
    
    # Optionally, remove stop words
    if remove_stopwords:
        
        stops = set(stopwords.words("english"))
        
        text = [w for w in text if not w in stops]
        
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        
        snowball = SnowballStemmer("english", ignore_stopwords=True)
        
        stemmed_words = [snowball.stem(word) for word in text]
        
        text = " ".join(stemmed_words)
        
    # WordNet lemmatizer only removes affixes if the resulting word is in its dictionary    
    if lemmatization:
        
        lemmatizer = nltk.WordNetLemmatizer()        
        
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        
        text = " ".join(lemmatized_words)

    return text

In [10]:
acc = df_raw['ACCNO'].values
findings = df_raw['FINDINGS'].map(lambda x: text_clean_up(x)).values
PE_labels = np.zeros(findings.shape[0],dtype = int)

df_report_only = pd.DataFrame({'ACCNO':acc,'Report': findings,'PE Label': PE_labels})

In [11]:
#Analyze the whole report text corpus (just exploratory not helpful in this task I believe)

report_corpus = nltk.Text(" ".join(findings).split())

total_vocab_count = len(set(report_corpus))

lexical_diversity = len(set(report_corpus)) / len(report_corpus)

freq_rank = 50

print("\n Word count of the complete report text corpus: {}"
      .format(len(report_corpus)))

print("\n Vocabulary count of the complete report text corpus: {}"
      .format(total_vocab_count))

print("\n Lexical diversity (Lexical richness of the text) : \n {}"
      .format(lexical_diversity))

print("\n {} most common vocabs: \n\n {} "
      .format(freq_rank, FreqDist(report_corpus).most_common(freq_rank)))


 Word count of the complete report text corpus: 1781523

 Vocabulary count of the complete report text corpus: 3839

 Lexical diversity (Lexical richness of the text) : 
 0.002154897803733098

 50 most common vocabs: 

 [('no', 91124), ('lung', 66732), ('is', 64943), ('the', 62874), ('normal', 61349), ('and', 57010), ('chest', 55201), ('size', 53875), ('heart', 52352), ('show', 50917), ('of', 44921), ('definite', 44333), ('pa', 42365), ('noted', 40104), ('lesion', 39913), ('in', 38367), ('bilateral', 32443), ('abnormal', 29289), ('are', 28438), ('change', 21341), ('view', 21095), ('intact', 20367), ('angle', 18394), ('field', 18175), ('thoracic', 17462), ('cage', 16607), ('clear', 16550), ('active', 15388), ('mediastinum', 14296), ('or', 13519), ('sharp', 12564), ('shadow', 11804), ('both', 11111), ('bony', 11017), ('within', 10612), ('limit', 10560), ('right', 10540), ('generally', 10535), ('with', 10221), ('pleural', 9803), ('costophrenic', 9751), ('erect', 9598), ('contour', 9585),

In [14]:
report_corpus.collocations(freq_rank)

heart size; lung lesion; thoracic cage; abnormal change; normal heart;
lung field; definite abnormal; generally intact; soft tissue; status
post; costophrenic angle; active lung; definite lung; tissue shadow;
normal limit; within normal; erect chest; free air; bony thorax;
subphrenic free; pleural effusion; abnormal contour; pleural
thickening; nonspecific finding; bilateral lung; increased
infiltrates; mottled opacity; view show; radiographic evidence;
significant pleural; bilateral costophrenic; noted chest; midline
without; without compression; field bilateral; cxr show; chest xray;
taken nonspecific; cardiac size; mediastinal shape; intact bony; lower
lung; hilar architecture; show increased; regular chest; chest
routine; mild scoliosis; clinical manifestation; lung marking; intact
chest


In [15]:
num_samples = 10

for i in np.random.randint(df_raw.shape[0], size=num_samples):
    print('\n\n--------------Original Report Text ' + str(i) + ' -------------------\n')
    print(df_raw.iloc[[i],[15]].values)

    print('\n\n--------------Processed Report Text ' + str(i) + ' -------------------\n')
    print(df_report_only.iloc[[i],[1]].values)
    



--------------Original Report Text 59808 -------------------

[['Chest  PA  shows increased infiltrates with mottled opacities in$both lung fields.   Heart is normal in size.          ']]


--------------Processed Report Text 59808 -------------------

[['chest pa show increased infiltrates with mottled opacity in both lung field heart is normal in size']]


--------------Original Report Text 50925 -------------------

[['Erect chest PA view shows:$The heart size is normal.$No abnormal contour or soft tissue shadow is noted in the mediastinum.$No significant pleural thickening or abnormal shadows is noted in the bilateral lung fields.$Bilateral CP angles are clear and sharp.$The thoracic cage and bones are generally intact. ']]


--------------Processed Report Text 50925 -------------------

[['erect chest pa view show the heart size is normal no abnormal contour or soft tissue shadow is noted in the mediastinum no significant pleural thickening or abnormal shadow is noted in the bil

In [16]:
# String search for 'blunt' and label all of them positive for pleural effusion

df_blunt = df_report_only[df_report_only['Report'].str.contains('blunt')]

df_blunt['PE Label'] = 1

print('Sample counts after applying filter: {}'.format(df_blunt.shape[0]))

Sample counts after applying filter: 269


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [17]:
# String search for 'sharp' and label all of them negative for pleural effusion

df_sharp = df_report_only[df_report_only['Report'].str.contains('sharp')]

df_sharp['PE Label'] = 0

print('Sample counts after applying filter: {}'.format(df_sharp.shape[0]))

Sample counts after applying filter: 12558


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [18]:
# Search for most common patterns of negative description regarding pleural effusion in our current text corpus

PE_neg_match = TokenSearcher(report_corpus).findall(r'<no><.*>{,2}<pleural><effusion>')

PE_neg_phrase = [" ".join(phrase) for phrase in PE_neg_match]

FreqDist(PE_neg_phrase).most_common()

[('no evidence of pleural effusion', 2798),
 ('no apparent pleural effusion', 870),
 ('no obvious pleural effusion', 86),
 ('no significant pleural effusion', 39),
 ('no pleural effusion', 21),
 ('no cardiomegaly mild pleural effusion', 21),
 ('no prominent pleural effusion', 5),
 ('no definite pleural effusion', 3),
 ('no cardiomegaly minimal pleural effusion', 2),
 ('no cardiomegaly moderate pleural effusion', 2),
 ('no visible left pleural effusion', 2),
 ('no obvious left pleural effusion', 1),
 ('no significant of pleural effusion', 1)]

In [19]:
# Filtered text corpus for negative descriptions most commonly used, extracting and labelling negative pleural effusion cases with high confidence

PE_neg_keyword = ['no evidence of pleural effusion','no apparent pleural effusion','no obvious pleural effusion','no significant pleural effusion','no pleural effusion']

pat = "|".join(PE_neg_keyword)

df_PE_neg = df_report_only[df_report_only['Report'].str.contains(pat)]

df_PE_neg['PE Label'] = 0

print('Sample counts after applying filter: {}'.format(df_PE_neg.shape[0]))

Sample counts after applying filter: 3814


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [20]:
# Removing negative reports from all the 'pleural effusion' containing reports, and label the remaining cases positive for pleural effusion

df_PE_pos = df_report_only[df_report_only['Report'].str.contains('effusion') & ~(df_report_only['Report'].str.contains(pat))]

df_PE_pos['PE Label'] = 1

print('Sample counts after applying filter: {}'.format(df_PE_pos.shape[0]))

Sample counts after applying filter: 855


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [21]:
for i in np.random.randint(df_PE_pos.shape[0], size=num_samples):
    print('\n')
    print(df_PE_pos.iloc[[i],[1]].values)




[['chest pa view wa taken status post pigtail insertion in right hemithorax with small amount of pneumothorax right pleural effusion a soft tissue mass in the right chest wall near the pigtail tube advise clinical correlation post operative change of rul no cardiomegaly']]


[['film of portable chest ap supine bronchopneumonitis over both lung pneumonia with pleural effusion over right side post chest intubation are showed']]


[['follow up supine cxr show persistent right pleural effusion with increased transverse cardiac diameter suggestive of pericardial effusion the left lung is clear']]


[['chest pa view show normal heart size left pleural effusion and subsegmental atelectasis at lll']]


[['chest xray film show normal cardiac size infiltrate at left basal lung with mild left pleural effusion suggest correlation with clinical manifestation']]


[['supine chest ap view wa taken status post central venous line insertion via the right side with it tip in the aorta suspected increa

In [22]:
df_output = df_blunt.append([df_PE_pos, df_sharp,df_PE_neg])

df_output.to_csv('Pleural Effusion List(labels generated by reports)')