This notebook collects useful examples of functions from the pydicom library for exporting selected DICOM metadata to txt files, counting images by group, and running basic QC tests after converting the data from DICOM to jpg.

# Environment

In [None]:
import os
import re
import shutil

import dill
import matplotlib.pyplot as pl
import numpy as np
import pandas as pd
import pydicom

## Load/Save Environment

In [None]:
#Save a Notebook session:
#dill.dump_session('notebook_env.db')
#Restore a Notebook session:
#dill.load_session('notebook_env.db')

# Exporting all DICOM metadata into txt

## Saving all files in a list

In [None]:
# Recursively collect the full path of every ".DC3" file found under the
# DICOM_ANONYMIZED folder.
dc3files_fullpath = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk('C:/Users/miguel/Documents/COVIDDSL/DICOM_ANONYMIZED/')
    for filename in filenames
    if filename.endswith(".DC3")
]

In [None]:
# Recursively collect the full path of every ".DC3" file found under the
# DICOM_v2_ANON_NEW folder.
dc3files_fullpath_new_anonim = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk('C:/Users/miguel/Documents/COVIDDSL/DICOM_v2_ANON_NEW/')
    for filename in filenames
    if filename.endswith(".DC3")
]

In [None]:
# Recursively collect the full path of every ".DC3" file found under the
# DICOM_v2_NO_ANON folder.
dc3files_fullpath_new_no_anonim = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk('C:/Users/miguel/Documents/COVIDDSL/DICOM_v2_NO_ANON/')
    for filename in filenames
    if filename.endswith(".DC3")
]

In [None]:
# File name (without the ".DC3" extension) of every DICOM file under the
# DICOM_ANONYMIZED folder.
# Only the trailing extension is stripped, so a ".DC3" sequence appearing
# elsewhere in a name is left intact.
# NOTE: this walks the same folder as dc3files_fullpath; the metadata export
# loop pairs both lists by position, so they must come from the same tree.
dc3files_names = [
    filename[:-len(".DC3")]
    for _folder, _subfolders, filenames in os.walk('C:/Users/miguel/Documents/COVIDDSL/DICOM_ANONYMIZED/')
    for filename in filenames
    if filename.endswith(".DC3")
]

### Counting number of DICOM files found

In [None]:
# we need to check all files have an unique name
print(len(dc3files_fullpath))
len(set(dc3files_names))

0


0

## Accessing all files in a loop and exporting metadata

(0008, 0018) SOP Instance UID                    UI: 1.2.124.113532.01021072491462487812514213279902487116200  
(0008, 0020) Study Date                          DA: '20200407'  
(0008, 0030) Study Time                          TM: '155635.273000'  
(0010, 0020) Patient ID                          LO: '9999999999999999999'  
(0008, 1150) Referenced SOP Class UID            UI: CT Image Storage  

In [None]:
# Dump the full metadata of every DICOM file into "<file name>.txt".
# The two lists are paired by position, so they must have the same length.
assert len(dc3files_names) == len(dc3files_fullpath), "names and paths are not aligned"
for dicom_path, dicom_name in zip(dc3files_fullpath, dc3files_names):
    ds = pydicom.dcmread(dicom_path)  # dcmread replaces the deprecated read_file
    metadata_txt = "C:/Users/miguel/Documents/COVIDDSL/dicom_metadata/" + dicom_name + ".txt"
    # Append mode is kept from the original cell: re-running the cell adds a
    # second copy of the dump to an existing txt file.
    # The context manager closes the file even if the write fails.
    with open(metadata_txt, "a") as file1:
        file1.write(str(ds))

# Selecting only fields of interest

In [None]:
# Build a table with the selected metadata fields of every file in dc3files_fullpath.
# Rows are collected in a list and turned into a DataFrame once: growing a
# frame with DataFrame.append inside the loop is quadratic and the method no
# longer exists in pandas >= 2.0.
dicom_interest_columns = ['PatientID','SOPInstanceUID', 'StudyDate','StudyTime','Modality','BodyPartExamined','ViewPosition']
dicom_interest_records = []
dicom_interest_skipped = []  # (path, error) for every file left out of the table
for value in dc3files_fullpath:
    try:
        ds = pydicom.dcmread(value)  # dcmread replaces the deprecated read_file
        # Attribute access raises if the file lacks one of the fields, so an
        # incomplete file is skipped as a whole (same as the original cell).
        record = {column: getattr(ds, column) for column in dicom_interest_columns}
    except Exception as error:
        # Best effort: keep going, but remember what was skipped and why
        # instead of silently discarding the failure.
        dicom_interest_skipped.append((value, repr(error)))
        continue
    dicom_interest_records.append(record)

dicom_interest = pd.DataFrame(dicom_interest_records, columns=dicom_interest_columns)
print(f"{len(dicom_interest)} files indexed, {len(dicom_interest_skipped)} skipped")

In [None]:
# Build a table with the selected metadata fields of every file in
# dc3files_fullpath_new_anonim.
# Rows are collected in a list and turned into a DataFrame once: growing a
# frame with DataFrame.append inside the loop is quadratic and the method no
# longer exists in pandas >= 2.0.
dicom_interest_new_anonim_columns = ['PatientID','SOPInstanceUID', 'StudyDate','StudyTime','Modality','BodyPartExamined','ViewPosition']
dicom_interest_new_anonim_records = []
dicom_interest_new_anonim_skipped = []  # (path, error) for every file left out of the table
for value in dc3files_fullpath_new_anonim:
    try:
        ds = pydicom.dcmread(value)  # dcmread replaces the deprecated read_file
        # Attribute access raises if the file lacks one of the fields, so an
        # incomplete file is skipped as a whole (same as the original cell).
        record = {column: getattr(ds, column) for column in dicom_interest_new_anonim_columns}
    except Exception as error:
        # Best effort: keep going, but remember what was skipped and why
        # instead of silently discarding the failure.
        dicom_interest_new_anonim_skipped.append((value, repr(error)))
        continue
    dicom_interest_new_anonim_records.append(record)

dicom_interest_new_anonim = pd.DataFrame(dicom_interest_new_anonim_records, columns=dicom_interest_new_anonim_columns)
print(f"{len(dicom_interest_new_anonim)} files indexed, {len(dicom_interest_new_anonim_skipped)} skipped")

In [None]:
# Build a table with the selected metadata fields of every file in
# dc3files_fullpath_new_no_anonim.
# Rows are collected in a list and turned into a DataFrame once: growing a
# frame with DataFrame.append inside the loop is quadratic and the method no
# longer exists in pandas >= 2.0.
dicom_interest_new_no_anonim_columns = ['PatientID','SOPInstanceUID', 'StudyDate','StudyTime','Modality','BodyPartExamined','ViewPosition']
dicom_interest_new_no_anonim_records = []
dicom_interest_new_no_anonim_skipped = []  # (path, error) for every file left out of the table
for value in dc3files_fullpath_new_no_anonim:
    try:
        ds = pydicom.dcmread(value)  # dcmread replaces the deprecated read_file
        # Attribute access raises if the file lacks one of the fields, so an
        # incomplete file is skipped as a whole (same as the original cell).
        record = {column: getattr(ds, column) for column in dicom_interest_new_no_anonim_columns}
    except Exception as error:
        # Best effort: keep going, but remember what was skipped and why
        # instead of silently discarding the failure.
        dicom_interest_new_no_anonim_skipped.append((value, repr(error)))
        continue
    dicom_interest_new_no_anonim_records.append(record)

dicom_interest_new_no_anonim = pd.DataFrame(dicom_interest_new_no_anonim_records, columns=dicom_interest_new_no_anonim_columns)
print(f"{len(dicom_interest_new_no_anonim)} files indexed, {len(dicom_interest_new_no_anonim_skipped)} skipped")

In [None]:
# Stack the three per-folder metadata tables row-wise into a single index.
# Row labels of the individual tables are kept as they are.
dicom_cxr_index = pd.concat(
    (dicom_interest, dicom_interest_new_anonim, dicom_interest_new_no_anonim),
    axis=0,
)

In [None]:
# Summary statistics for every column of the combined index.
dicom_cxr_index.describe()

Unnamed: 0,PatientID,SOPInstanceUID,StudyDate,StudyTime,Modality,BodyPartExamined,ViewPosition
count,6394,6394,6394,6394.0,6394,6394,6394
unique,2007,6394,128,5482.0,3,17,6
top,577,1.3.51.0.7.2702142446.25444.62788.33861.55254....,20200330,181301.0,CR,CHEST,AP
freq,42,1,300,8.0,4489,6289,4572


In [None]:
# Number of distinct patients per imaging modality.
dicom_interest.groupby('Modality')['PatientID'].nunique()

Modality
CR    1371
DX     525
Name: PatientID, dtype: int64

In [None]:
# Number of distinct patients per examined body part.
dicom_interest.groupby('BodyPartExamined')['PatientID'].nunique()

BodyPartExamined
ABDOMEN        12
CALCANEUS       1
CHEST        1820
ELBOW           1
FOOT            4
HAND            1
HIP             2
HUMERUS         1
KNEE            2
LEG             1
LSPINE          3
PELVIS          4
SPINE           1
SSPINE          1
TSPINE          2
Name: PatientID, dtype: int64

In [None]:
# Number of distinct images (SOPInstanceUID) per patient, shown in full.
images_per_patient = dicom_interest.groupby('PatientID')['SOPInstanceUID'].nunique()
# max_rows=None means "no limit". The option is scoped to this cell so that
# later cells displaying very large objects are still truncated.
with pd.option_context('display.max_rows', None):
    print(images_per_patient)

In [None]:
# Save the combined index as 'dicom_cxr_index.csv' (relative path, so it is
# written to the current working directory).
dicom_cxr_index.to_csv('dicom_cxr_index.csv')

# Quality control

Let's check which DICOM files were not converted to jpg

In [None]:
# Name of every converted image under the 'converted' folder, with the
# "_result.jpg" suffix stripped so it can be compared with the DICOM names.
# Only the trailing suffix is removed.
jpgfiles_names = [
    filename[:-len("_result.jpg")]
    for _folder, _subfolders, filenames in os.walk('C:/Users/miguel/Documents/COVIDDSL/converted/')
    for filename in filenames  # was `filesb`, an undefined name
    if filename.endswith("_result.jpg")
]

In [None]:
# Number of converted jpg files found.
len(jpgfiles_names)

800046

In [None]:
# pandas views of the two DICOM lists: full paths as a Series, bare names as
# a one-column ('files') DataFrame.
dc3files_fullpath_df = pd.Series(dc3files_fullpath)
dc3files_names_df = pd.DataFrame(dc3files_names, columns=['files'])

In [None]:
# Display the Series of DICOM full paths.
dc3files_fullpath_df

0         C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED/...
1         C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED/...
2         C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED/...
3         C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED/...
4         C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED/...
                                ...                        
800386    C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED/...
800387    C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED/...
800388    C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED/...
800389    C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED/...
800390    C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED/...
Length: 800391, dtype: object

In [None]:
# Keep the DICOM names that have no matching converted jpg name.
has_jpg = dc3files_names_df['files'].isin(jpgfiles_names)
not_converted_images = dc3files_names_df[~has_jpg]


In [None]:
# Save the list of non-converted DICOM names as 'not_converted_images.csv'
# (relative path, so it is written to the current working directory).
not_converted_images.to_csv('not_converted_images.csv')

In [None]:
# Boolean mask over the full paths: True where the file was not converted.
# str.contains expects a single string pattern; passing the DataFrame made
# every result NaN. Instead, compare each path's file name (extension
# removed) with the names listed in not_converted_images.
not_converted_mask = (
    dc3files_fullpath_df
    .map(lambda path: os.path.splitext(os.path.basename(path))[0])
    .isin(not_converted_images['files'])
)
not_converted_mask

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
800386   NaN
800387   NaN
800388   NaN
800389   NaN
800390   NaN
Length: 800391, dtype: float64

# Data move

In [None]:
# Peek at the first few paths only; the full list is far too long to display.
dc3files_fullpath[:5]

In [None]:
# Copy every DICOM file that has no converted jpg into `destination`.
# `source` is not used by the copy below (the real location of each file is
# taken from dc3files_fullpath); it is kept so existing references still work.
source='C:/Users/miguel/Documents/COVIDDSL/ANONYMIZED'
destination='C:/Users/miguel/Documents/COVIDDSL/missing'

# not_converted_images only holds bare file names (column 'files'), so the
# source path of each one is looked up among the full paths collected earlier.
fullpath_by_name = {
    os.path.splitext(os.path.basename(path))[0]: path
    for path in dc3files_fullpath
}

os.makedirs(destination, exist_ok=True)
for missing_name in not_converted_images['files']:
    # A KeyError here means the name list and the path list are out of sync.
    shutil.copyfile(
        fullpath_by_name[missing_name],
        os.path.join(destination, missing_name + '.DC3'),
    )

In [None]:
# Display the DICOM names that have no converted jpg.
not_converted_images

                                                    files
5776    1.2.124.113532.2421302611642119721531561564055...
24131         1.3.12.2.1107.5.3.56.3575.15.20200413092455
26287         1.3.12.2.1107.5.3.56.3575.15.20200330114233
26502         1.3.12.2.1107.5.3.56.3575.15.20200330120155
31470   1.3.46.670589.11.19112.5.24.5.1.5388.202004051...
...                                                   ...
776640  1.3.46.670589.11.18695.5.24.5.1.9500.202003221...
786066        1.3.12.2.1107.5.3.56.3575.15.20200330122405
786074        1.3.12.2.1107.5.3.56.3575.15.20200330115117
791489        1.3.12.2.1107.5.3.56.3575.15.20200330121422
795587  1.2.124.113532.1281351881485960712617724180238...

[345 rows x 1 columns]


# First CXR per patient using 30 days mortality covid dsl v2

In [None]:
# Load the structured patient table and a saved copy of the CXR index.
patient_id_combined_structured = pd.read_csv(
    'C:/Users/miguel/Documents/COVIDDSL/patient_id_combined_structured.csv'
)
dicom_cxr_index = pd.read_csv(
    'C:/Users/miguel/Documents/MEGA/Boston/MIT/covidhm_survival/analysis/cxr/metadata_extraction/dicom_cxr_index.csv'
)

# Inner join: keep only the patients that appear in both tables.
final_dataset = patient_id_combined_structured.merge(
    dicom_cxr_index,
    how='inner',
    on='PatientID',
)

In [None]:
# Display the merged table (one row per image of each matched patient).
final_dataset

Unnamed: 0.1,PatientID,Unnamed: 0,SOPInstanceUID,StudyDate,StudyTime,Modality,BodyPartExamined,ViewPosition
0,1,2639,1.3.12.2.1107.5.3.56.2693.11.202004071536250109,20200407,153625.000,CR,CHEST,AP
1,47,4298,1.3.46.670589.30.36.0.1.18774111139.1584790133...,20200321,122739.723,DX,CHEST,AP
2,47,4299,1.3.46.670589.30.36.0.1.18774111139.1584790141...,20200321,122739.723,DX,CHEST,AP
3,47,4300,1.3.46.670589.30.36.0.1.18774111139.1584520902...,20200318,94132.202,DX,CHEST,AP
4,47,4301,1.3.46.670589.30.36.0.1.18774111139.1584951898...,20200323,92418.512,DX,CHEST,AP
...,...,...,...,...,...,...,...,...
5716,2567,445,1.3.51.0.7.12403204790.42539.2378.44734.3369.6...,20200423,112046.000,CR,CHEST,AP
5717,2569,446,1.3.51.0.7.11606373772.7387.57412.37519.29639....,20200426,100515.000,CR,CHEST,PA
5718,2569,447,1.3.51.0.7.1412207000.33144.18498.42018.56370....,20200425,113123.000,CR,CHEST,AP
5719,2569,448,1.3.51.0.7.1746121578.9690.19279.48510.24081.1...,20200430,92331.000,CR,CHEST,PA


In [None]:
# Order the rows by patient, then by study date and study time (ascending),
# so the earliest study of each patient comes first.
sort_keys = ['PatientID', 'StudyDate', 'StudyTime']
final_dataset = final_dataset.sort_values(by=sort_keys)

In [None]:
 # Keep the first row of each patient (the earliest study once the table is
 # sorted by PatientID, StudyDate and StudyTime).
 # groupby('PatientID').first() is NOT used because it returns the first
 # non-null value of each column separately, which can mix fields coming from
 # different images when some values are missing.
 # The result is indexed by PatientID and sorted by it, as before.
 final_dataset = (
     final_dataset
     .dropna(subset=['PatientID'])  # groupby ignored rows without a key
     .drop_duplicates(subset='PatientID', keep='first')
     .set_index('PatientID')
     .sort_index()
 )

In [None]:
# Save the one-row-per-patient table as 'first_cxr_covid_dsl_v2_08072020.csv'
# (relative path, so it is written to the current working directory).
final_dataset.to_csv('first_cxr_covid_dsl_v2_08072020.csv')