This script takes keywords as input and returns the cases whose radiology reports contain any of them.

In [1]:
import pandas as pd
import os
import numpy as np
import math

In [2]:
def process_string(string):
    """Pad punctuation with spaces so a later ``split()`` isolates each symbol.

    ``,``, ``:``, ``-``, ``;`` and ``?`` are surrounded by spaces, ``'. '``
    becomes ``' . '``, one pass of double-space collapsing is applied, and
    leading spaces are removed.

    Args:
        string: the raw sentence text.

    Returns:
        The padded text without leading spaces. An empty or all-space input
        yields ``''`` (the previous character-by-character loop raised
        ``IndexError`` on such input).
    """
    output = string
    # single-character symbols are independent of each other
    for symbol in (',', ':', '-', ';', '?'):
        output = output.replace(symbol, ' ' + symbol + ' ')
    # a period only counts as a separate token when followed by a space
    output = output.replace('. ', ' . ')
    # one non-overlapping pass, exactly as before; longer runs of spaces are
    # left for the caller's split() to absorb
    output = output.replace('  ', ' ')
    # strip only the space character so tabs/newlines at the start are kept
    return output.lstrip(' ')

def indexes_for_duplicates_in_list(lst, item, lower=True):
    """Return the positions in ``lst`` covered by every occurrence of ``item``.

    ``item`` may consist of several whitespace-separated words; it is compared
    against each run of that many consecutive elements of ``lst`` joined by a
    single space.

    Args:
        lst: list of word tokens.
        item: the word or phrase to look for.
        lower: when true, the comparison is case-insensitive (both the window
            and ``item`` are lowercased; previously only the window was, so an
            item containing capitals could never match).

    Returns:
        A list with the index of every token that belongs to a match, in
        order of appearance. Overlapping matches may repeat an index.
    """
    words = item.split()
    width = len(words)
    # normalise inner whitespace so 'wall  motion' still matches 'wall motion'
    target = " ".join(words)
    if lower:
        target = target.lower()

    result = []
    # slide a window of `width` tokens over the list
    for start in range(len(lst) - width + 1):
        window = " ".join(lst[start:start + width])
        if lower:
            window = window.lower()
        if window == target:
            result.extend(range(start, start + width))
    return result

In [3]:
def find_highlighted_sentence_w_keyword(case, keyword):
    """Collect the sentences of a report that mention any of the keywords.

    The report is read from ``case['Report text']`` and split into sentences
    on newlines and on ``'. '``. Matching is case-insensitive on both sides
    (previously the keywords were compared as given against lowercased text,
    so keywords containing capitals were never found).

    Args:
        case: mapping-like record with a ``'Report text'`` string entry.
        keyword: iterable of keyword strings (single words or phrases).

    Returns:
        A 4-tuple ``(has_keyword, keyword_list, highlight_sentence, txt)``:
        ``has_keyword`` is 1 if any sentence matched, else 0;
        ``keyword_list`` holds the matched keywords, de-duplicated in order of
        first appearance; ``highlight_sentence`` is a list of
        ``(processed_sentence, sorted_word_indexes)`` pairs;
        ``txt`` is the matched sentences with highlighted words wrapped in
        ``!!`` markers, one sentence per line.
    """
    # this function can also determine whether the case has the keyword
    has_keyword = 0

    report = case['Report text']
    # split report by sentence; drop blank fragments so process_string never
    # receives whitespace-only input
    sentences = [s for s in report.replace('\n', '. ').split('. ') if s.strip()]

    txt_parts = []
    highlight_sentence = []
    keyword_list = []
    for sentence in sentences:
        # process once per sentence rather than once per keyword
        processed = process_string(sentence)
        processed_lower = processed.lower()

        # NOTE: this is a substring test, so a keyword can be "found" inside
        # a longer word; such a hit flags the sentence but highlights no word,
        # because highlighting below requires whole-token equality.
        keys = [key for key in keyword if key.lower() in processed_lower]
        if not keys:
            continue

        has_keyword = 1
        # save keywords
        keyword_list.extend(keys)

        # find the indexes of the keywords in that sentence
        words = processed.split()
        matched = set()
        for key in keys:
            matched.update(indexes_for_duplicates_in_list(words, key.lower()))
        indexes_list = sorted(matched)

        # add this sentence and indexes into collection
        highlight_sentence.append((processed, indexes_list))

        # write into text
        for position, word in enumerate(words):
            if position in matched:
                txt_parts.append("!!" + word + "!! ")
            else:
                txt_parts.append(word + ' ')
        txt_parts.append('.\n ')

    # remove duplicates while keeping first-seen order
    keyword_list = list(dict.fromkeys(keyword_list))
    return has_keyword, keyword_list, highlight_sentence, "".join(txt_parts)
    

In [4]:
def _rtf_escape(text):
    """Escape the characters RTF treats as control syntax (backslash, braces)."""
    return text.replace('\\', '\\\\').replace('{', '\\{').replace('}', '\\}')


def make_highlighted_rtf_report(case,highlight_sentence,save_path,attach_full_report = True):
    """Write an RTF file with the keyword sentences, matched words in bold.

    The file is named ``<Patient_ID>.rtf`` inside ``save_path``.

    Args:
        case: mapping-like record providing ``'Patient_ID'`` and, when
            ``attach_full_report`` is true, ``'Report text'``.
        highlight_sentence: iterable of ``(sentence, word_indexes)`` pairs;
            the words of ``sentence`` (split on whitespace) at the given
            indexes are rendered bold.
        save_path: existing directory to write the file into.
        attach_full_report: when true, the full report text is appended after
            the highlighted sentences.

    Returns:
        None. The document's closing brace is now written in both modes
        (it used to be omitted when ``attach_full_report`` was false), and
        backslashes/braces in the text are escaped.
    """
    patient_id = str(case['Patient_ID'])
    # define rtf file name
    file_name = os.path.join(save_path, patient_id + '.rtf')

    parts = ["{\\rtf1 Patient_ID: " + _rtf_escape(patient_id)
             + "\\line\\line Highlight sentences: \\line\\line "]

    for string, indexes_list in highlight_sentence:
        bold_positions = set(indexes_list)
        for position, word in enumerate(string.split()):
            word = _rtf_escape(word)
            if position in bold_positions:
                # \b switches bold on, \b0 switches it off again
                parts.append("\\b " + word + " \\b0 ")
            else:
                parts.append(word + ' ')
        parts.append('\\line ')

    # attach the full reports
    if attach_full_report:
        report = _rtf_escape(str(case['Report text']))
        parts.append("\\line Full reports:\\line  " + report.replace('\n', '\\line '))

    # always close the RTF group opened by "{\rtf1"
    parts.append("}")

    with open(file_name, 'w') as output_file:
        output_file.write("".join(parts))
        

In [5]:
# Locations for one study year: the input radiology-report CSV lives directly
# under main_folder; search results go to <main_folder>/Case_search_list/<year>.
main_folder = '/Users/zhennongchen/Documents/Zhennong_CT_Data/Patient_Overview'
year = '2019'

search_list_folder = os.path.join(main_folder, 'Case_search_list', year)
os.makedirs(search_list_folder, exist_ok=True)

# RTF reports are written to a sub-folder of the per-year results folder
save_path = os.path.join(search_list_folder, 'rtf_files')

# load the full radiology record table for the chosen year
report_csv = os.path.join(main_folder, year + '_Patient_Radiology_Records_Full.csv')
data = pd.read_csv(report_csv)

In [6]:
# Keep only the cases whose 'Function?' column says 'Yes' (functional study),
# then report the size of the remaining table.
has_function_study = data['Function?'] == 'Yes'
data = data[has_function_study]
print(data.shape)

(795, 15)


In [7]:
# Keywords (single words or phrases) to search for in the report text.
keyword = [
    'hypoplastic',
    'syndrome',
    'congenital',
    'CHD',
    'LVAD',
    'device',
    'TAVR',
    'aortic stenosis',
    'stenosis',
    'failure',
    'thinning',
    'hypokinesis',
    'hypokinetic',
    'akinesis',
    'akinetic',
    'dyskinesis',
    'dyskinetic',
    'wall motion',
    'normal',
]
# Alternative, wall-motion-only keyword set:
#keyword = ['hypokinesis','hypokinetic','akinesis','akinetic','dyskinesis','dyskinetic','wall motion']

In [10]:
# For every case whose report mentions a keyword: optionally write an RTF file
# with the highlighted sentences, and collect one spreadsheet row.
make_rtf_file = 1       # 1 = write one RTF file per matching case
make_spreadsheet = 1    # 1 = write the summary spreadsheet at the end
os.makedirs(save_path, exist_ok=True)
spreadsheet_name = year + '_Radiology_report_w_highlight.xlsx'

# Output headers, in row order.
# NOTE(review): 'Acession' is spelled differently from the input column
# 'Accession'; left unchanged because it is the header written to the file.
column_list = ['Patient_ID', 'Keywords', 'Highlight text', 'Report text', 'Acession',
               'Manufacturer', 'Model', 'Sex', 'Age', 'Protocol',
               'Directories_Full', 'Directories_Function', 'Timeframes']

# Input columns copied verbatim into each row after the three leading fields.
copied_columns = ['Report text', 'Accession', 'Manufacturer', 'Model', 'Sex', 'Age',
                  'Protocol', 'Directories_Full', 'Directories_Function', 'Timeframes']

count = 0
# missing values become 0 so that a missing report can be detected below
data = data.fillna(0)
spreadsheet = []
for row_number in range(len(data)):
    case = data.iloc[row_number]
    if case['Report text'] == 0:
        # no report text for this case
        continue

    has_keyword, keyword_list, highlight_sentence, highlight_txt = \
        find_highlighted_sentence_w_keyword(case, keyword)
    if has_keyword != 1:
        continue

    count += 1
    if make_rtf_file == 1:
        make_highlighted_rtf_report(case, highlight_sentence, save_path)

    leading_fields = [case['Patient_ID'], keyword_list, highlight_txt]
    spreadsheet.append(leading_fields + [case[column] for column in copied_columns])

print('finish')
print(count)

if make_spreadsheet == 1:
    df = pd.DataFrame(spreadsheet, columns=column_list)
    # the spreadsheet goes next to (not inside) the RTF folder
    spreadsheet_path = os.path.join(os.path.dirname(save_path), spreadsheet_name)
    df.to_excel(spreadsheet_path, index=True)

finish
756
