In [1]:
import re
import sys
import numpy as np
import pandas as pd

import nltk
from nltk import *

In [2]:
#read out the Chest CT report dataset

df_raw = pd.read_csv('data//ABD CT simple.csv', encoding='cp950').sample(frac=1)

df_raw.shape

(66412, 5)

In [3]:
df_raw

Unnamed: 0,ACCNO,EXAMDATE,PATID,FINDINGS,IMPRESSIONS
58157,RD01036470790026,2013/3/10,308924,(B)$Pre- contrast enhanced CT scan of abdomen ...,> A 2cm ill-defined low density at pancreatic ...
45542,RD01029026090032,2014/8/6,532623,For pre-op evaluation. $CT scan of abdomen & p...,> Small diverticula over S-colon. $> Enlarged ...
51854,RD01032860020011,2013/11/21,S70321,(A)$CT of the abdomen and pelvis without and w...,acute colonic diverticulitis at the proximal a...
48916,RD01031052180028,2014/3/26,R90781,Fever. $CT scan of abdomen & pelvis without an...,"> Enlarged LNs in paraaortic region, on the me..."
1419,RD07515241520081,2018/5/15,986740,(Report revised on 2018-07-30)$CT scan of abdo...,> A 3.6-cm mass lesion in proximal A-colon. Su...
41858,RD01026880210009,2014/12/24,G90733,CT of the Abdomen:$$The pre- and post-enhanced...,1) Suspect diverticulitis of the cecum. Diver...
27481,RD05514551020016,2016/5/14,900995,CT of the abdomen before and after IV iodinate...,Dilatation of the CBD and bilateral IHDs noted...
8896,RD06A28809600044,2017/10/28,F32754,CT of the abdomen and pelvis without and with ...,"1, No intestinal obstruction.$2. Marked atroph..."
29708,RD05303853560014,2016/3/3,W21972,CT of abdomen and pelvis before and after cont...,Fractures at the left transverse process of T1...
22891,RD05929813540117,2016/9/29,A49543,Pre- contrast enhanced CT scan of abdomen :$$>...,> Presence of faint high density over right up...


In [4]:
def text_clean_up(text, remove_stopwords = False, stem_words = False, lemmatization= True):
    
    ## Convert words to lower case

    text = str(text).lower()
    
    ## Remove puncuation and replace common abbreviations
    text = re.sub('s/p','status post',text)
    text = re.sub('r/o','rule out',text)
    text = re.sub('f/u','follow up',text)
    text = re.sub('-','',text)
    text = re.sub(r'[^a-zA-Z_]',' ',text)
    text = text.split()
    
    # Optionally, remove stop words
    if remove_stopwords:
        
        stops = set(stopwords.words("english"))
        
        text = [w for w in text if not w in stops]
        
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        
        snowball = SnowballStemmer("english", ignore_stopwords=True)
        
        stemmed_words = [snowball.stem(word) for word in text]
        
        text = " ".join(stemmed_words)
        
    # WordNet lemmatizer only removes affixes if the resulting word is in its dictionary    
    if lemmatization:
        
        lemmatizer = nltk.WordNetLemmatizer()        
        
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        
        text = " ".join(lemmatized_words)

    return text

In [5]:
acc = df_raw['ACCNO'].values
examdate = df_raw['EXAMDATE'].values
patid = df_raw['PATID'].values
find = df_raw['FINDINGS'].map(lambda x: text_clean_up(x)).values
imp = df_raw['IMPRESSIONS'].map(lambda x: text_clean_up(x)).values

df_report_only = pd.DataFrame({'ACCNO':acc,'DATE':examdate, 'PATID':patid, 'Findings': find, 'Impressions': imp, 'Pneumoperitoneum': np.zeros(find.shape[0],dtype = int)})

In [6]:
df_report_only.shape

(66412, 6)

In [7]:
#Analyze the whole report text corpus (just exploratory not helpful in this task I believe)

report_corpus = nltk.Text(" ".join(find).split())

total_vocab_count = len(set(report_corpus))

lexical_diversity = len(set(report_corpus)) / len(report_corpus)

freq_rank = 50

print("\n Word count of the complete report text corpus: {}"
      .format(len(report_corpus)))

print("\n Vocabulary count of the complete report text corpus: {}"
      .format(total_vocab_count))

print("\n Lexical diversity (Lexical richness of the text) : \n {}"
      .format(lexical_diversity))

print("\n {} most common vocabs: \n\n {} "
      .format(freq_rank, FreqDist(report_corpus).most_common(freq_rank)))



 Word count of the complete report text corpus: 6806223

 Vocabulary count of the complete report text corpus: 22952

 Lexical diversity (Lexical richness of the text) : 
 0.003372208051367109

 50 most common vocabs: 

 [('the', 288232), ('of', 266097), ('and', 257427), ('no', 190102), ('in', 165753), ('with', 129671), ('is', 87312), ('bilateral', 82075), ('at', 79845), ('ct', 67669), ('abdomen', 67017), ('lesion', 66883), ('liver', 66167), ('a', 63996), ('contrast', 56441), ('right', 52730), ('left', 51390), ('show', 49529), ('pelvis', 47181), ('without', 46888), ('kidney', 46525), ('are', 45668), ('or', 44356), ('small', 42730), ('noted', 42661), ('enhancement', 42122), ('lung', 39691), ('wall', 38684), ('adrenal', 38592), ('definite', 38362), ('normal', 37024), ('to', 36732), ('finding', 34993), ('spleen', 34681), ('node', 34330), ('gland', 33607), ('lymph', 33558), ('status', 33441), ('bladder', 32993), ('pancreas', 32817), ('mild', 31203), ('change', 29905), ('post', 29889), ('a

In [19]:
# Search for most common patterns of 'free air' description in our current text corpus

freeair_match = TokenSearcher(report_corpus).findall(r'<free><air>')

df_freeair = df_report_only[df_report_only['Findings'].str.contains('free air')]

df_freeair['Pneumoperitoneum'] = 1

print("Matches count: {}".format(len(freeair_match)))

Matches count: 6237


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [9]:
# Search for most common patterns of negative description of 'free air' in our current text corpus

freeair_neg_match = TokenSearcher(report_corpus).findall(r'<no|neither|nor><.*>{,3}<free><air>')

freeair_neg_phrase = [" ".join(phrase) for phrase in freeair_neg_match]

freeair_neg_phrase_list = FreqDist(freeair_neg_phrase).most_common(50)

FreqDist(freeair_neg_phrase).most_common(50)

[('no intraperitoneal free air', 2072),
 ('no ascites or free air', 1488),
 ('no ascitis or free air', 387),
 ('no significant intraperitoneal free air', 372),
 ('no evidence of free air', 363),
 ('no free air', 327),
 ('no remarkable finding no free air', 313),
 ('no evidence of intraperitoneal free air', 239),
 ('no significant free air', 75),
 ('no definite intraperitoneal free air', 23),
 ('no definite free air', 22),
 ('no abscess or free air', 17),
 ('no ascitis free air', 16),
 ('nor hydronephrosis no intraperitoneal free air', 15),
 ('no ascites free air', 12),
 ('no ascites abscess or free air', 5),
 ('no intraabdominal free air', 5),
 ('no extraluminal free air', 5),
 ('no ascites or intraperitoneal free air', 4),
 ('no ascitis abscess or free air', 4),
 ('no dilated appendix no free air', 4),
 ('no obvious free air', 3),
 ('nor dislocation no intraperitoneal free air', 3),
 ('no acitis or free air', 3),
 ('no subphrenic free air', 3),
 ('no obvious intramural free air', 2),


In [10]:
pat = "|".join([phrase[0] for phrase in freeair_neg_phrase_list])

df_freeair_neg = df_freeair[df_freeair['Findings'].str.contains(pat)]

df_freeair_neg['Pneumoperitoneum'] = 0

df_freeair_pos = df_freeair[~df_freeair['Findings'].str.contains(pat)]

print('{} positive reports confirmed as positive for pneumoperitoneum.'.format(df_freeair_pos.shape[0]))

print('{} negative reports re-labeled as negative for pneumoperitoneum.'.format(df_freeair_neg.shape[0]))

409 positive reports confirmed as positive for pneumoperitoneum.
5809 negative reports re-labeled as negative for pneumoperitoneum.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [11]:
df_freeair_pos['Target'] = df_freeair_pos['Findings'].str.extract(r'(\b.{,20}\bfree air\b.*)')

df_freeair_pos.drop(['Findings','Impressions','Pneumoperitoneum'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ACCNO,DATE,PATID,Target
313,RD01036629430017,2013/2/26,526573,intraperitoneal free air is noted se im cm di...
432,RD01031207190020,2014/3/16,M52771,or abscess or free air noted no definite abno...
831,RD01032349480003,2013/12/27,087373,with a mdct free air is noted at the peritone...
860,RD06215590940046,2017/2/15,S06687,vein minimal free air arrow over bilateral up...
1157,RD01029991450032,2014/6/1,T99016,of intraperitoneal free air cause cannot rule...
1172,RD07511526990013,2018/5/11,J22912,small amount free air at right perineum small...
1178,RD06B20243440023,2017/11/20,F33522,of intraperitoneal free air in upper abdomen ...
1634,RD01035082570020,2013/6/17,T25458,in this study small free air pocket noted abut...
2000,RD01030894950020,2014/4/4,636801,of intraperitoneal free air in left upper mid...
2020,RD05518093150014,2016/5/19,R95509,intraperitoneal free air in whole abdomen mil...


In [12]:
# Search for most common patterns of negative description regarding pleural effusion in our current text corpus

pneumoperitoneum_match = TokenSearcher(report_corpus).findall(r'<pneumoperitoneum>')

df_pneumoperitoneum = df_report_only[df_report_only['Findings'].str.contains('pneumoperitoneum')]

df_pneumoperitoneum['Pneumoperitoneum'] = 1

print("Matches count: {}".format(len(pneumoperitoneum_match)))

Matches count: 6506


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [13]:
# Search for most common patterns of negative description of 'free air' in our current text corpus

pneumo_neg_match = TokenSearcher(report_corpus).findall(r'<no|neither|nor><.*>{,3}<pneumoperitoneum>')

pneumo_neg_phrase = [" ".join(phrase) for phrase in pneumo_neg_match]

pneumo_neg_phrase_list = FreqDist(pneumo_neg_phrase).most_common(50)

FreqDist(pneumo_neg_phrase).most_common(50)

[('no pneumoperitoneum', 2056),
 ('no pneumothorax nor pneumoperitoneum', 1069),
 ('no evidence of pneumoperitoneum', 809),
 ('no obvious pneumoperitoneum', 429),
 ('no pneumothorax hemothorax nor pneumoperitoneum', 278),
 ('no ascites no pneumoperitoneum', 269),
 ('no ascites nor pneumoperitoneum', 182),
 ('no periaortic hematoma no pneumoperitoneum', 125),
 ('no definite pneumoperitoneum', 116),
 ('no hemothorax no pneumoperitoneum', 94),
 ('no definite hemoperitoneum nor pneumoperitoneum', 85),
 ('no significant pneumoperitoneum', 61),
 ('no imaging evidence of pneumoperitoneum', 56),
 ('no pneumatosis intestinalis nor pneumoperitoneum', 35),
 ('nor pneumoperitoneum', 24),
 ('no ascites or pneumoperitoneum', 22),
 ('no obvious ascites no pneumoperitoneum', 20),
 ('no liver cirrhosis no pneumoperitoneum', 16),
 ('no pneumothorax hemothorax pneumoperitoneum', 15),
 ('no significant ascites no pneumoperitoneum', 14),
 ('no significant ascites or pneumoperitoneum', 12),
 ('no pneumothor

In [14]:
pat = "|".join([phrase[0] for phrase in pneumo_neg_phrase_list])

df_pneumoperitoneum_neg = df_pneumoperitoneum[df_pneumoperitoneum['Findings'].str.contains(pat)]

df_pneumoperitoneum_neg['Pneumoperitoneum'] = 0

df_pneumoperitoneum_pos = df_pneumoperitoneum[~df_pneumoperitoneum['Findings'].str.contains(pat)]

print('{} positive reports confirmed as positive for pneumoperitoneum.'.format(df_pneumoperitoneum_pos.shape[0]))

print('{} negative reports re-labeled as negative for pneumoperitoneum.'.format(df_pneumoperitoneum_neg.shape[0]))

559 positive reports confirmed as positive for pneumoperitoneum.
5915 negative reports re-labeled as negative for pneumoperitoneum.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [15]:
df_pneumoperitoneum_pos['Target'] = df_pneumoperitoneum_pos['Findings'].str.extract(r'(\b.{,20}\bpneumoperitoneum\b.*)')

df_pneumoperitoneum_pos.drop(['Findings','Impressions','Pneumoperitoneum'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ACCNO,DATE,PATID,Target
18,RD06402673610006,2017/4/2,U04730,surrounding air and pneumoperitoneum a cm circ...
274,RD01035690620015,2013/5/5,B14704,enhancement show pneumoperitoneum noted suspe...
278,RD06514210120146,2017/5/14,395324,img small amount of pneumoperitoneum ser img h...
295,RD01035181400030,2013/6/10,T35006,and revealed pneumoperitoneum with moderate a...
396,RD01033500530011,2013/10/6,C51939,fluid retained and pneumoperitoneum noted sus...
562,RD01040625520049,2012/5/12,J99692,show massive pneumoperitoneum and ascites bil...
788,RD01041459750033,2012/3/16,885835,and foley catheter pneumoperitoneum and mildl...
970,RD07515194210245,2018/5/15,772616,distribution of pneumoperitoneum at abdomen a...
991,RD06821269590129,2017/8/21,401647,portion of duodenum pneumoperitoneum and pneum...
1238,RD07417375130022,2018/4/17,805640,enhancement show pneumoperitoneum perforation...


In [17]:
pd.concat([df_freeair_pos,df_pneumoperitoneum_pos]).drop(['Findings','Impressions','Pneumoperitoneum'], axis=1).to_csv('Pneumoperitoneum_positive_CT_list.csv')