In [3]:
# Mount Google Drive at /content/drive; google.colab is only importable inside a Colab runtime.
# NOTE(review): the later cells read from a local Windows path (D:/MIMIC-CXR/...), so this
# mount looks unused by the rest of the notebook — confirm before relying on or removing it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import os
import pandas as pd
import numpy as np
import pydicom
import matplotlib.pyplot as plt
import cv2

In [3]:
# Root of the local MIMIC-CXR 2.0.0 download. The trailing slash is required because
# the CSV paths below are built by plain string concatenation (cur_path + filename).
cur_path = "D:/MIMIC-CXR/physionet.org/files/mimic-cxr/2.0.0/"
# Make the dataset root the working directory so that the relative 'files/pXX/...'
# paths used by later cells resolve against it.
os.chdir(cur_path)

In [4]:
# Load the two index tables that live directly under the dataset root:
# one row per study report, and one row per DICOM image record.
study_csv = f"{cur_path}cxr-study-list.csv"
record_csv = f"{cur_path}cxr-record-list.csv"

cxr_study_list = pd.read_csv(study_csv)
cxr_record_list = pd.read_csv(record_csv)

In [5]:
# Display the study table (the rendered output shows subject_id, study_id and the report path).
cxr_study_list

Unnamed: 0,subject_id,study_id,path
0,10000032,50414267,files/p10/p10000032/s50414267.txt
1,10000032,53189527,files/p10/p10000032/s53189527.txt
2,10000032,53911762,files/p10/p10000032/s53911762.txt
3,10000032,56699142,files/p10/p10000032/s56699142.txt
4,10000764,57375967,files/p10/p10000764/s57375967.txt
...,...,...,...
227830,19999442,58708861,files/p19/p19999442/s58708861.txt
227831,19999733,57132437,files/p19/p19999733/s57132437.txt
227832,19999987,55368167,files/p19/p19999987/s55368167.txt
227833,19999987,58621812,files/p19/p19999987/s58621812.txt


In [6]:
# Display the record table (the rendered output shows subject_id, study_id, dicom_id and the image path).
cxr_record_list

Unnamed: 0,subject_id,study_id,dicom_id,path
0,10000032,50414267,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,files/p10/p10000032/s50414267/02aa804e-bde0afd...
1,10000032,50414267,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,files/p10/p10000032/s50414267/174413ec-4ec4c1f...
2,10000032,53189527,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,files/p10/p10000032/s53189527/2a2277a9-b0ded15...
3,10000032,53189527,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,files/p10/p10000032/s53189527/e084de3b-be89b11...
4,10000032,53911762,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,files/p10/p10000032/s53911762/68b5c4b1-227d048...
...,...,...,...,...
377105,19999733,57132437,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,files/p19/p19999733/s57132437/428e2c18-5721d8f...
377106,19999733,57132437,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,files/p19/p19999733/s57132437/58c403aa-35ff8bd...
377107,19999987,55368167,58766883-376a15ce-3b323a28-6af950a0-16b793bd,files/p19/p19999987/s55368167/58766883-376a15c...
377108,19999987,58621812,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,files/p19/p19999987/s58621812/7ba273af-3d290f8...


In [7]:
# Pair every image record with the report of the study it belongs to.
# Both tables carry a 'path' column, so pandas' default suffixes apply:
# path_x is the report path (left table), path_y is the image path (right table).
join_keys = ['subject_id', 'study_id']
merged = pd.merge(cxr_study_list, cxr_record_list, on=join_keys, how='inner')
merged

Unnamed: 0,subject_id,study_id,path_x,dicom_id,path_y
0,10000032,50414267,files/p10/p10000032/s50414267.txt,02aa804e-bde0afdd-112c0b34-7bc16630-4e384014,files/p10/p10000032/s50414267/02aa804e-bde0afd...
1,10000032,50414267,files/p10/p10000032/s50414267.txt,174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962,files/p10/p10000032/s50414267/174413ec-4ec4c1f...
2,10000032,53189527,files/p10/p10000032/s53189527.txt,2a2277a9-b0ded155-c0de8eb9-c124d10e-82c5caab,files/p10/p10000032/s53189527/2a2277a9-b0ded15...
3,10000032,53189527,files/p10/p10000032/s53189527.txt,e084de3b-be89b11e-20fe3f9f-9c8d8dfe-4cfd202c,files/p10/p10000032/s53189527/e084de3b-be89b11...
4,10000032,53911762,files/p10/p10000032/s53911762.txt,68b5c4b1-227d0485-9cc38c3f-7b84ab51-4b472714,files/p10/p10000032/s53911762/68b5c4b1-227d048...
...,...,...,...,...,...
377105,19999733,57132437,files/p19/p19999733/s57132437.txt,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,files/p19/p19999733/s57132437/428e2c18-5721d8f...
377106,19999733,57132437,files/p19/p19999733/s57132437.txt,58c403aa-35ff8bd9-73e39f54-8dc9cc5d-e0ec3fa9,files/p19/p19999733/s57132437/58c403aa-35ff8bd...
377107,19999987,55368167,files/p19/p19999987/s55368167.txt,58766883-376a15ce-3b323a28-6af950a0-16b793bd,files/p19/p19999987/s55368167/58766883-376a15c...
377108,19999987,58621812,files/p19/p19999987/s58621812.txt,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,files/p19/p19999987/s58621812/7ba273af-3d290f8...


In [8]:
def extract_finding_impression(origin_file):
    """Return the FINDINGS and IMPRESSION text of a report as a single string.

    The report is scanned line by line. A line containing 'FINDINGS:' or
    'IMPRESSION:' switches capturing on; a line containing 'EXAMINATION:',
    'INDICATION:', 'TECHNIQUE:' or 'COMPARISON:' switches it off. While
    capturing is on, every other line is collected.

    Text that follows a 'FINDINGS:' / 'IMPRESSION:' header on the same line
    (e.g. ``IMPRESSION: No acute process.``) is collected as well; a header
    with only whitespace after it contributes nothing.

    Args:
        origin_file: Path of the plain-text report to read.

    Returns:
        The collected pieces concatenated in file order with newline
        characters removed. Other whitespace is left untouched, and the
        result is '' when no section is found.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    capture_headers = ('FINDINGS:', 'IMPRESSION:')
    stop_headers = ('EXAMINATION:', 'INDICATION:', 'TECHNIQUE:', 'COMPARISON:')

    pieces = []
    capturing = False
    # The context manager closes the file even if reading raises.
    with open(origin_file) as fp:
        for line in fp:
            header = next((h for h in capture_headers if h in line), None)
            if header is not None:
                capturing = True
                # Keep any report text written on the header line itself.
                remainder = line.split(header, 1)[1]
                if remainder.strip():
                    pieces.append(remainder)
            elif any(h in line for h in stop_headers):
                capturing = False
            elif capturing:
                pieces.append(line)

    return ''.join(piece.replace('\n', '') for piece in pieces)

In [9]:
# Sanity check on a single report. The path is relative, so it resolves against the
# working directory set by the earlier os.chdir(cur_path) cell.
text_file = 'files/p10/p10000032/s53911762.txt'
content = extract_finding_impression(text_file)
# Bare last expression so the notebook displays the extracted text.
content

'  Single frontal view of the chest provided.  There is no focal consolidation, effusion, or pneumothorax. The cardiomediastinal silhouette is normal.  Again seen are multiple clips projecting over the left breast and remote left-sided rib fractures.  No free air below the right hemidiaphragm is seen.   No acute intrathoracic process.'

In [21]:
# Export settings for this cell.
IMAGE_OUT_DIR = 'D:/MIMIC-CXR/image/'   # JPEGs are written here, named after the DICOM file stem
MAX_SAMPLES = 10000                     # stop once this many image/text pairs were exported


def _to_uint8(pixels):
    """Min-max scale an array to 0..255 and return it as uint8.

    The array is converted to float64 first so the subtraction cannot overflow
    on signed integer pixel types. A constant array (max == min) yields an
    all-zero result instead of dividing by zero.
    """
    pixels = np.asarray(pixels, dtype=np.float64)
    lo, hi = pixels.min(), pixels.max()
    if hi == lo:
        return np.zeros(pixels.shape, dtype=np.uint8)
    return ((pixels - lo) / (hi - lo) * 255).astype('uint8')


# cv2.imwrite does not create missing directories; make sure the target exists.
os.makedirs(IMAGE_OUT_DIR, exist_ok=True)

# Collect rows in a plain list and build the DataFrame once at the end:
# DataFrame.append copied the whole frame on every call and was removed in pandas 2.0.
records = []

for image_raw, text_file in zip(merged['path_y'], merged['path_x']):
    if len(records) >= MAX_SAMPLES:
        break

    # Skip pairs whose DICOM or report is missing from the local download.
    if not (os.path.exists(image_raw) and os.path.exists(text_file)):
        print('file not exist.')
        continue

    content = extract_finding_impression(text_file)

    # Output name: the DICOM file name up to its first '.', with a .jpg extension.
    image_out_path = IMAGE_OUT_DIR + image_raw.split('/')[-1].split('.')[0] + '.jpg'

    # dcmread is the current pydicom reader; read_file was its deprecated alias.
    ds = pydicom.dcmread(image_raw)
    # NOTE(review): PhotometricInterpretation is not inspected, so MONOCHROME1 images
    # (if any are present) would be saved with inverted grey levels — confirm.
    newimg = _to_uint8(ds.pixel_array)

    # imwrite returns False on failure; do not record a path that was never written.
    if not cv2.imwrite(image_out_path, newimg):
        print('failed to write ' + image_out_path)
        continue

    records.append({'image_path': image_out_path, 'text_content': content})
    print(str(len(records)) + ' - ' + image_out_path)

paired_data = pd.DataFrame(records, columns=['image_path', 'text_content'])
sample_count = len(records)
        

1d4-8ff634e2.jpg
9735 - D:/MIMIC-CXR/image/505ede1a-58344de7-5af808ef-5d8145be-c7cd2dd5.jpg
9736 - D:/MIMIC-CXR/image/76acdcf1-633b10ee-72a1dbf6-4a2fde0c-579b358f.jpg
9737 - D:/MIMIC-CXR/image/c7be3028-324562bd-a04b41fe-c07ed94a-8677ded5.jpg
9738 - D:/MIMIC-CXR/image/42de33fc-4c997080-e57d30ba-94e3abbc-92fe4261.jpg
9739 - D:/MIMIC-CXR/image/7cf71288-af2c10ec-20f0fc32-c8431ec6-9bbc3183.jpg
9740 - D:/MIMIC-CXR/image/d0179447-a7344ce5-86f1a4b9-4fde7279-e3351756.jpg
9741 - D:/MIMIC-CXR/image/163eda38-3db235e1-60ec8c61-7f97f53b-a46acf9f.jpg
9742 - D:/MIMIC-CXR/image/6c36f31f-e43979b9-4b95e0c2-d3a83f4a-1b72b4d9.jpg
9743 - D:/MIMIC-CXR/image/1ec5cf71-017eb43d-2706b823-a8364145-0f71897d.jpg
9744 - D:/MIMIC-CXR/image/b3c932b4-a04e1185-7997f411-173c19be-fda0dbfd.jpg
9745 - D:/MIMIC-CXR/image/3b59adea-9b27d741-1e4ed124-748c98e4-520e09e5.jpg
9746 - D:/MIMIC-CXR/image/74642458-20f86f5b-20aab8e6-02864f32-0b15dca7.jpg
9747 - D:/MIMIC-CXR/image/a25d2a6f-23f866ab-9f99490b-fd6d7923-890f8ce2.jpg
9748 - D

In [22]:
# Display the exported pairs: one row per written JPEG with its extracted report text.
paired_data

Unnamed: 0,image_path,text_content
0,D:/MIMIC-CXR/image/02aa804e-bde0afdd-112c0b34-...,"There is no focal consolidation, pleural eff..."
1,D:/MIMIC-CXR/image/174413ec-4ec4c1f7-34ea26b7-...,"There is no focal consolidation, pleural eff..."
2,D:/MIMIC-CXR/image/2a2277a9-b0ded155-c0de8eb9-...,"The cardiac, mediastinal and hilar contours ..."
3,D:/MIMIC-CXR/image/e084de3b-be89b11e-20fe3f9f-...,"The cardiac, mediastinal and hilar contours ..."
4,D:/MIMIC-CXR/image/68b5c4b1-227d0485-9cc38c3f-...,Single frontal view of the chest provided. ...
...,...,...
9996,D:/MIMIC-CXR/image/d18c2121-5b01211d-80b23963-...,Tiny bilateral pleural effusions are seen. ...
9997,D:/MIMIC-CXR/image/4c3294f1-81fc9461-0837e000-...,Heart is upper limits of normal in size acc...
9998,D:/MIMIC-CXR/image/c1d8108b-c7c6deed-88d6e5d2-...,Heart is upper limits of normal in size acc...
9999,D:/MIMIC-CXR/image/aaf7b1e2-479c4597-254b57ef-...,appear within normal limits. There is no ple...


In [23]:
# Persist the image-path / report-text pairs to an absolute local path.
# index=False is not passed, so the DataFrame index is written as an extra unnamed first column.
paired_data.to_csv('D:/MIMIC-CXR/image_path_text.csv')