In [1]:
# code modified from: 
# https://github.com/leonardgarcia90/Sybil-Project/blob/main/sybil_training.ipynb

In [1]:
import pydicom 
import os 
import pandas as pd 
import glob
import re
from datetime import datetime

# Filter NLST cases based on kernel, slice thickness, and orientation

In [111]:
def get_first_file_in_every_folder(root_directory):
    first_files = []
    for dirpath, dirnames, filenames in os.walk(root_directory):
        if filenames:
            filenames.sort()
            first_file = os.path.join(dirpath, filenames[0])
            first_files.append(first_file)
    return first_files

In [112]:
root_directory_nlst = '/workspace/home/leonardgarcia/sybil_test'
first_files_nlst = get_first_file_in_every_folder(root_directory_nlst)
first_files_nlst.sort()
first_files_nlst.remove('/workspace/home/leonardgarcia/sybil_test/LICENSE')
print(first_files_nlst[-10])

/workspace/home/leonardgarcia/sybil_test/218872/01-02-1999-NA-NLST-ACRIN-21146/3.000000-0OPASESEN4B50f300512040202-18705/1-01.dcm


In [None]:
#make list of dicom files based on list of directories

first_diacom_per_nlst_file = []

for scan in first_files_nlst:
        first_diacom_per_nlst_file.append(pydicom.dcmread(scan,force=True))
    
print(first_diacom_per_nlst_file[0])

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 192
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.2.840.113654.2.55.278749222456145672507153754911543871073
(0002, 0010) Transfer Syntax UID                 UI: Explicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.2.40.0.13.1.1.1
(0002, 0013) Implementation Version Name         SH: 'dcm4che-1.4.31'
-------------------------------------------------
(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL']
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.840.113654.2.55.278749222456145672507153754911543871073
(0008, 0020) Study Date                 

In [114]:
#make a list of desired dicom metadata properties

def make_dicom_metadata_df(dicom_list, directory_list):
    dicom_metadata_list = []
    index = 0

    for dicom in dicom_list:
        
        if hasattr(dicom, 'ConvolutionKernel') and dicom.ConvolutionKernel is not None:
            dicom_metadata_list.append([dicom.AccessionNumber, dicom.SliceThickness, dicom.ConvolutionKernel,
                                        dicom.ImageOrientationPatient, dicom.PatientID, index, directory_list[index]])
 
            index += 1
    
        elif hasattr(dicom, 'SliceThickness') and dicom.SliceThickness is not None:
            
            dicom_metadata_list.append([dicom.AccessionNumber, dicom.SliceThickness, "missing",
                                        dicom.ImageOrientationPatient, dicom.PatientID, index, directory_list[index]])
            index +=1
            
        elif hasattr(dicom, 'ImageOrientationPatient') and dicom.SliceThickness is not None:
            
            dicom_metadata_list.append([dicom.AccessionNumber, 'missing', "missing",
                                        dicom.ImageOrientationPatient, dicom.PatientID, index, directory_list[index]])
            index +=1
              
        else:
            dicom_metadata_list.append([dicom.AccessionNumber, 'missing', "missing",
                                        'missing', dicom.PatientID, index, directory_list[index]])
            print(index)
            index +=1

        
    diacom_metadata_df = pd.DataFrame(dicom_metadata_list, columns=['AccessionNumber', 'SliceThickness', 'ConvolutionKernel'
                                                               ,'ImageOrientationPatient', 'Patient_ID',"index", "Directory"])
    return diacom_metadata_df

In [115]:
nlst_diacom_metadata_df = make_dicom_metadata_df(first_diacom_per_nlst_file, first_files_nlst)


16783


In [116]:
nlst_diacom_metadata_df['ConvolutionKernel'].value_counts()


ConvolutionKernel
STANDARD         4840
B30f             3329
B50f             2719
T20s             2237
EXPERIMENTAL7    1803
BONE             1776
FL01              537
FC51              529
C                 480
missing           456
LUNG              347
D                 319
FC10              122
B                  45
FC11               34
FC30               30
FC01               24
B60f               21
A                  13
B70f               10
FC02                7
B80f                5
FC50                4
FC82                3
FC53                2
B20f                2
B30s                2
B60s                1
FL04                1
B31f                1
B50s                1
B45f                1
B40f                1
BONEPLUS            1
Name: count, dtype: int64

In [117]:
#function to determine ct orientation based on 'ImageOrientationPatient' column

def get_ct_orientation(df, index):

    image_orientation_patient_string = str(df['ImageOrientationPatient'].iloc[index])
    image_vectors = image_orientation_patient_string.replace('[','').replace(']','').split(',')
    image_vectors = [round(float(x)) for x in image_vectors]

    #[1,0,0,0,1,0]
    if image_vectors[0] == 1 and image_vectors[4] == 1: 
        return 'axial'
    
    #[0,1,0,0,0,-1]
    elif image_vectors[1] == 1 and image_vectors[5] == -1: 
        return 'sagittal'
    
    #[1,0,0,0,0,-1]
    elif image_vectors[0] == 1 and image_vectors[5] == -1: 
        return 'coronal'
    
    else:
        return 'unknown'

In [118]:
#add column for scan orientation

def add_orientation_column(dicom_df):
    orientation = []
    
    for index in range(len(dicom_df)):
        if str(dicom_df['ImageOrientationPatient'].iloc[index]) == 'missing':
            orientation.append('missing')
        else:
            orientation.append(get_ct_orientation(dicom_df, index))

    dicom_df['Orientation'] = orientation

# add timepoint
def add_timepoint(df):
    # Extract the year from the Directory
    df['Year'] = df['Directory'].apply(lambda x: int(re.search(r'\d{2}-\d{2}-(\d{4})', x).group(1)))

    # Calculate the difference in years for each patient's selected scan relative to their first scan

    
    df['Year_Difference'] = df.groupby('Patient_ID')['Year'].transform(lambda x: x - x.min())

    # Rename Year_Difference to timepoint
    df_with_t = df.rename(columns={'Year_Difference': 'timepoint'})

    return df_with_t

In [119]:
add_orientation_column(nlst_diacom_metadata_df)
nlst_diacom_metadata_df = add_timepoint(nlst_diacom_metadata_df)

In [None]:
#cast 'ConvolutionKernel' to string to deal with ['Br49d', '3'] value
nlst_diacom_metadata_df['ConvolutionKernel'] = nlst_diacom_metadata_df['ConvolutionKernel'].astype(str)

#calculate the counts for different combinations of features
feature_count_nlst_df = nlst_diacom_metadata_df.groupby(['SliceThickness','ConvolutionKernel', 'Orientation']).size().reset_index().rename(columns={0:'count'})

In [121]:
feature_count_nlst_df = feature_count_nlst_df.sort_values('count', ascending=False)
feature_count_nlst_df

Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
50,2.5,STANDARD,axial,3137
25,2.0,B30f,axial,2258
28,2.0,B50f,axial,1864
12,1.0,T20s,coronal,1509
46,2.5,BONE,axial,1461
...,...,...,...,...
131,368.727264,EXPERIMENTAL7,unknown,1
132,369.090881,EXPERIMENTAL7,unknown,1
136,370.727264,EXPERIMENTAL7,unknown,1
137,370.727264,STANDARD,coronal,1


In [122]:
#want entries with following properties: 
#SliceThickness = 2.5, 2.0, ConvolutionKernel = STANDARD, B30f, Orientation = axial

reduced_nlst_df1 = nlst_diacom_metadata_df[(nlst_diacom_metadata_df['SliceThickness'] == 2.5) & (nlst_diacom_metadata_df['Orientation'] == 'axial')
                               & (nlst_diacom_metadata_df['ConvolutionKernel'] == 'STANDARD')]

reduced_nlst_df2 = nlst_diacom_metadata_df[(nlst_diacom_metadata_df['SliceThickness'] == 2.0) & (nlst_diacom_metadata_df['Orientation'] == 'axial')
                               & (nlst_diacom_metadata_df['ConvolutionKernel'] == 'B30f')]

In [123]:
reduced_nlst_df = pd.concat([reduced_nlst_df1, reduced_nlst_df2], axis=0, ignore_index=True)
reduced_nlst_df

Unnamed: 0,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,Patient_ID,index,Directory,Orientation,Year,timepoint
0,451441,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100005,2,/workspace/home/leonardgarcia/sybil_test/10000...,axial,2000,1
1,900447,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100005,4,/workspace/home/leonardgarcia/sybil_test/10000...,axial,2001,2
2,570860,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100032,10,/workspace/home/leonardgarcia/sybil_test/10003...,axial,1999,0
3,315777,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100032,12,/workspace/home/leonardgarcia/sybil_test/10003...,axial,2000,1
4,156408,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100032,14,/workspace/home/leonardgarcia/sybil_test/10003...,axial,2001,2
...,...,...,...,...,...,...,...,...,...,...
5390,4425224220206255,2.0,B30f,"[1, 0, 0, 0, 1, 0]",218802,19666,/workspace/home/leonardgarcia/sybil_test/21880...,axial,2000,1
5391,8458124601177092,2.0,B30f,"[1, 0, 0, 0, 1, 0]",218802,19669,/workspace/home/leonardgarcia/sybil_test/21880...,axial,2001,2
5392,5852165011024924,2.0,B30f,"[1, 0, 0, 0, 1, 0]",218872,19694,/workspace/home/leonardgarcia/sybil_test/21887...,axial,1999,0
5393,1302452295231190,2.0,B30f,"[1, 0, 0, 0, 1, 0]",218872,19698,/workspace/home/leonardgarcia/sybil_test/21887...,axial,2000,1


In [124]:
#check features of ommited patients

#creates dataframe of patients not included in other dataset
ommitted_nlst_df = nlst_diacom_metadata_df[~nlst_diacom_metadata_df['Patient_ID'].isin(reduced_nlst_df['Patient_ID'].tolist())]
ommitted_feature_count_nlst_df = ommitted_nlst_df.groupby(['SliceThickness','ConvolutionKernel', 'Orientation']).size().reset_index().rename(columns={0:'count'})
ommitted_feature_count_nlst_df = ommitted_feature_count_nlst_df.sort_values('count', ascending=False)
ommitted_feature_count_nlst_df



Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
23,2.0,FC51,axial,512
25,2.0,FL01,coronal,372
9,1.0,missing,coronal,369
31,3.2,C,axial,330
12,1.25,STANDARD,axial,318
32,3.2,D,axial,264
27,2.5,BONE,axial,163
10,1.25,BONE,axial,157
28,2.5,LUNG,axial,128
43,350.181824,STANDARD,unknown,122


In [125]:
nlst_diacom_metadata_df.columns


Index(['AccessionNumber', 'SliceThickness', 'ConvolutionKernel',
       'ImageOrientationPatient', 'Patient_ID', 'index', 'Directory',
       'Orientation', 'Year', 'timepoint'],
      dtype='object')

In [126]:
#SliceThickness = 2.0, ConvolutionKernel = FC51, Orientation = axial

reduced_nlst_df3 = ommitted_nlst_df[(ommitted_nlst_df['SliceThickness'] == 2.0) & (ommitted_nlst_df['Orientation'] == 'axial')
                               & (ommitted_nlst_df['ConvolutionKernel'] == 'FC51')]


In [127]:
reduced_nlst_df = pd.concat([reduced_nlst_df, reduced_nlst_df3], axis=0, ignore_index=True)


In [128]:
#check features of ommited patients

#creates dataframe of patients not included in other dataset
ommitted_nlst_df = nlst_diacom_metadata_df[~nlst_diacom_metadata_df['Patient_ID'].isin(reduced_nlst_df['Patient_ID'].tolist())]
ommitted_feature_count_nlst_df = ommitted_nlst_df.groupby(['SliceThickness','ConvolutionKernel', 'Orientation']).size().reset_index().rename(columns={0:'count'})
ommitted_feature_count_nlst_df = ommitted_feature_count_nlst_df.sort_values('count', ascending=False)
ommitted_feature_count_nlst_df

Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
6,1.0,missing,coronal,369
20,3.2,C,axial,330
9,1.25,STANDARD,axial,318
21,3.2,D,axial,264
16,2.5,BONE,axial,163
7,1.25,BONE,axial,157
17,2.5,LUNG,axial,128
31,350.181824,STANDARD,unknown,122
28,350.181793,EXPERIMENTAL7,unknown,104
15,2.0,C,axial,67


In [129]:
#SliceThickness = 1.25, ConvolutionKernel = STANDARD, Orientation = axial

reduced_nlst_df4 = ommitted_nlst_df[(ommitted_nlst_df['SliceThickness'] == 1.25) & (ommitted_nlst_df['Orientation'] == 'axial')
                               & (ommitted_nlst_df['ConvolutionKernel'] == 'STANDARD')]

In [130]:
reduced_nlst_df = pd.concat([reduced_nlst_df, reduced_nlst_df4], axis=0, ignore_index=True)


In [131]:
#check features of ommited patients

#creates dataframe of patients not included in other dataset
ommitted_nlst_df = nlst_diacom_metadata_df[~nlst_diacom_metadata_df['Patient_ID'].isin(reduced_nlst_df['Patient_ID'].tolist())]
ommitted_feature_count_nlst_df = ommitted_nlst_df.groupby(['SliceThickness','ConvolutionKernel', 'Orientation']).size().reset_index().rename(columns={0:'count'})
ommitted_feature_count_nlst_df = ommitted_feature_count_nlst_df.sort_values('count', ascending=False)
ommitted_feature_count_nlst_df

Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
6,1.0,missing,coronal,369
15,3.2,C,axial,330
16,3.2,D,axial,264
12,2.0,C,axial,65
1,0.75,missing,coronal,57
14,3.2,B,axial,38
13,3.2,A,axial,13
2,1.0,B30f,axial,5
3,1.0,B80f,axial,5
4,1.0,C,axial,3


In [132]:
#SliceThickness = 3.2, ConvolutionKernel = C, Orientation = axial

reduced_nlst_df5 = ommitted_nlst_df[(ommitted_nlst_df['SliceThickness'] == 3.2) & (ommitted_nlst_df['Orientation'] == 'axial')
                               & (ommitted_nlst_df['ConvolutionKernel'] == 'C')]

In [133]:
reduced_nlst_df = pd.concat([reduced_nlst_df, reduced_nlst_df5], axis=0, ignore_index=True)


In [134]:
#check features of ommited patients

#creates dataframe of patients not included in other dataset
ommitted_nlst_df = nlst_diacom_metadata_df[~nlst_diacom_metadata_df['Patient_ID'].isin(reduced_nlst_df['Patient_ID'].tolist())]
ommitted_feature_count_nlst_df = ommitted_nlst_df.groupby(['SliceThickness','ConvolutionKernel', 'Orientation']).size().reset_index().rename(columns={0:'count'})
ommitted_feature_count_nlst_df = ommitted_feature_count_nlst_df.sort_values('count', ascending=False)
ommitted_feature_count_nlst_df

Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
9,2.0,C,axial,64
0,0.75,missing,coronal,57
12,3.2,D,axial,29
11,3.2,B,axial,28
5,1.0,missing,coronal,24
1,1.0,B30f,axial,5
2,1.0,B80f,axial,5
10,3.2,A,axial,5
3,1.0,C,axial,3
4,1.0,T20s,coronal,1


In [135]:
#SliceThickness = 2.0, ConvolutionKernel = C, Orientation = axial

reduced_nlst_df6 = ommitted_nlst_df[(ommitted_nlst_df['SliceThickness'] == 2.0) & (ommitted_nlst_df['Orientation'] == 'axial')
                               & (ommitted_nlst_df['ConvolutionKernel'] == 'C')]

In [136]:
reduced_nlst_df = pd.concat([reduced_nlst_df, reduced_nlst_df6], axis=0, ignore_index=True)


In [137]:
#check features of ommited patients

#creates dataframe of patients not included in other dataset
ommitted_nlst_df = nlst_diacom_metadata_df[~nlst_diacom_metadata_df['Patient_ID'].isin(reduced_nlst_df['Patient_ID'].tolist())]
ommitted_feature_count_nlst_df = ommitted_nlst_df.groupby(['SliceThickness','ConvolutionKernel', 'Orientation']).size().reset_index().rename(columns={0:'count'})
ommitted_feature_count_nlst_df = ommitted_feature_count_nlst_df.sort_values('count', ascending=False)
ommitted_feature_count_nlst_df

Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
9,3.2,D,axial,29
8,3.2,B,axial,28
3,1.0,missing,coronal,24
0,1.0,B30f,axial,5
1,1.0,B80f,axial,5
7,3.2,A,axial,5
2,1.0,T20s,coronal,1
4,1.3,C,axial,1
5,1.3,D,axial,1
6,2.0,B50f,axial,1


In [138]:
#SliceThickness = 3.2, ConvolutionKernel = D, Orientation = axial

reduced_nlst_df7 = ommitted_nlst_df[(ommitted_nlst_df['SliceThickness'] == 3.2) & (ommitted_nlst_df['Orientation'] == 'axial')
                               & (ommitted_nlst_df['ConvolutionKernel'] == 'D')]

In [139]:
reduced_nlst_df = pd.concat([reduced_nlst_df, reduced_nlst_df7], axis=0, ignore_index=True)


In [140]:
#check features of ommited patients

#creates dataframe of patients not included in other dataset
ommitted_nlst_df = nlst_diacom_metadata_df[~nlst_diacom_metadata_df['Patient_ID'].isin(reduced_nlst_df['Patient_ID'].tolist())]
ommitted_feature_count_nlst_df = ommitted_nlst_df.groupby(['SliceThickness','ConvolutionKernel', 'Orientation']).size().reset_index().rename(columns={0:'count'})
ommitted_feature_count_nlst_df = ommitted_feature_count_nlst_df.sort_values('count', ascending=False)
ommitted_feature_count_nlst_df

Unnamed: 0,SliceThickness,ConvolutionKernel,Orientation,count
0,1.0,B30f,axial,5
1,1.0,B80f,axial,5
2,1.0,T20s,coronal,1
3,1.3,C,axial,1
4,1.3,D,axial,1
5,2.0,B50f,axial,1


In [141]:
#SliceThickness = 1.0, ConvolutionKernel = B30f, Orientation = axial

reduced_nlst_df8 = ommitted_nlst_df[(ommitted_nlst_df['SliceThickness'] == 1.0) & (ommitted_nlst_df['Orientation'] == 'axial')
                               & (ommitted_nlst_df['ConvolutionKernel'] == 'B30f')]

In [142]:
reduced_nlst_df = pd.concat([reduced_nlst_df, reduced_nlst_df8], axis=0, ignore_index=True)


In [143]:
#check features of ommited patients

#creates dataframe of patients not included in other dataset
ommitted_nlst_df = nlst_diacom_metadata_df[~nlst_diacom_metadata_df['Patient_ID'].isin(reduced_nlst_df['Patient_ID'].tolist())]

In [144]:
ommitted_nlst_df

Unnamed: 0,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,Patient_ID,index,Directory,Orientation,Year,timepoint
490,556382.0,1.0,T20s,"[1, -7.0448747e-016, 0, 0, 0, -1]",101066,490,/workspace/home/leonardgarcia/sybil_test/10106...,coronal,1999,0
491,556382.0,2.0,B50f,"[1, 0, 0, 0, 1, 0]",101066,491,/workspace/home/leonardgarcia/sybil_test/10106...,axial,1999,0
17001,,1.3,C,"[1, 0, 0, 0, 1, 0]",208792,17001,/workspace/home/leonardgarcia/sybil_test/20879...,axial,1999,0
17002,,1.3,D,"[1, 0, 0, 0, 1, 0]",208792,17002,/workspace/home/leonardgarcia/sybil_test/20879...,axial,1999,0


In [145]:
reduced_nlst_df9 = ommitted_nlst_df.loc[[491, 17001]]


In [146]:
reduced_nlst_df = pd.concat([reduced_nlst_df, reduced_nlst_df9], axis=0, ignore_index=True)


In [147]:
final_reduced_nlst_df = reduced_nlst_df.groupby('Patient_ID').tail(1)

In [149]:
final_reduced_nlst_df

Unnamed: 0,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,Patient_ID,index,Directory,Orientation,Year,timepoint
1,900447,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100005,4,/workspace/home/leonardgarcia/sybil_test/10000...,axial,2001,2
4,156408,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100032,14,/workspace/home/leonardgarcia/sybil_test/10003...,axial,2001,2
7,128341,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100043,22,/workspace/home/leonardgarcia/sybil_test/10004...,axial,2001,2
10,205739,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100085,47,/workspace/home/leonardgarcia/sybil_test/10008...,axial,2001,2
13,555313,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100217,87,/workspace/home/leonardgarcia/sybil_test/10021...,axial,2001,2
...,...,...,...,...,...,...,...,...,...,...
6650,,1.0,B30f,"[1, 0, 0, 0, 1, 0]",200988,14920,/workspace/home/leonardgarcia/sybil_test/20098...,axial,2001,2
6651,,1.0,B30f,"[1, 0, 0, 0, 1, 0]",201769,15047,/workspace/home/leonardgarcia/sybil_test/20176...,axial,1999,0
6652,,1.0,B30f,"[1, 0, 0, 0, 1, 0]",214683,18610,/workspace/home/leonardgarcia/sybil_test/21468...,axial,1999,0
6653,556382,2.0,B50f,"[1, 0, 0, 0, 1, 0]",101066,491,/workspace/home/leonardgarcia/sybil_test/10106...,axial,1999,0


In [150]:
# remove suffix from dicom directories to get parent folder

final_reduced_nlst_df['Directory'] = final_reduced_nlst_df['Directory'].apply(
    lambda directory: directory[:directory.rindex('/')])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_reduced_nlst_df['Directory'] = final_reduced_nlst_df['Directory'].apply(


In [151]:
final_reduced_nlst_df.to_csv('/workspace/home/tengyuezhang/sybil_cect/data/nlst_selection2.csv', index=False)


In [152]:
#rename patient id column to match other dataframe

final_reduced_nlst_df = final_reduced_nlst_df.rename(columns={'Patient_ID': 'pid'})

In [153]:
# cast patient id column to int

final_reduced_nlst_df['pid'] = final_reduced_nlst_df['pid'].astype(int)

In [154]:
final_reduced_nlst_df

Unnamed: 0,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,pid,index,Directory,Orientation,Year,timepoint
1,900447,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100005,4,/workspace/home/leonardgarcia/sybil_test/10000...,axial,2001,2
4,156408,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100032,14,/workspace/home/leonardgarcia/sybil_test/10003...,axial,2001,2
7,128341,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100043,22,/workspace/home/leonardgarcia/sybil_test/10004...,axial,2001,2
10,205739,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100085,47,/workspace/home/leonardgarcia/sybil_test/10008...,axial,2001,2
13,555313,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100217,87,/workspace/home/leonardgarcia/sybil_test/10021...,axial,2001,2
...,...,...,...,...,...,...,...,...,...,...
6650,,1.0,B30f,"[1, 0, 0, 0, 1, 0]",200988,14920,/workspace/home/leonardgarcia/sybil_test/20098...,axial,2001,2
6651,,1.0,B30f,"[1, 0, 0, 0, 1, 0]",201769,15047,/workspace/home/leonardgarcia/sybil_test/20176...,axial,1999,0
6652,,1.0,B30f,"[1, 0, 0, 0, 1, 0]",214683,18610,/workspace/home/leonardgarcia/sybil_test/21468...,axial,1999,0
6653,556382,2.0,B50f,"[1, 0, 0, 0, 1, 0]",101066,491,/workspace/home/leonardgarcia/sybil_test/10106...,axial,1999,0


# Determine time from screening to time of diagnosis

For each scan, obtain the timestamp. Then, calculate the time from screening to time of diagnosis based on time of screening, time of randomization, and time from randomization to diagnosis. 

In [157]:
nlst_clinical_path = '/workspace/home/leonardgarcia/NLST_clinical/csv/participant.csv'
clinical_df = pd.read_csv(nlst_clinical_path)

  clinical_df = pd.read_csv(nlst_clinical_path)


In [158]:
clinical_df.columns

Index(['smokeage', 'smokeday', 'age_quit', 'cigsmok', 'gender', 'rndgroup',
       'age', 'pkyr', 'scr_lat0', 'scr_lat1',
       ...
       'progsite_skin_days', 'progsite_lymph_n1_days',
       'progsite_lymph_n2_days', 'progsite_lymph_n3_days', 'progsite_unk_days',
       'last_progfree_days', 'confirmed_candxdays1', 'confirmed_candxdays2',
       'confirmed_candxdays3', 'confirmed_candxdays4'],
      dtype='object', length=320)

In [166]:
clinical_df[['pid', 'scr_days2', 'conflc']]

Unnamed: 0,pid,scr_days2,conflc
0,100001.0,,2.0
1,100002.0,731.0,0.0
2,100003.0,738.0,2.0
3,100004.0,743.0,2.0
4,100005.0,746.0,3.0
...,...,...,...
53447,218890.0,727.0,0.0
53448,218891.0,,0.0
53449,218892.0,708.0,0.0
53450,218893.0,756.0,0.0


In [68]:
clinical_df['candx_days'].notna().sum()


2058

In [168]:
merged_df = pd.merge(final_reduced_nlst_df, clinical_df, on='pid', how='left')

def get_scr_days(row): 
    timepoint_col = f'scr_days{row["timepoint"]}'
    return row[timepoint_col]

merged_df['days_from_randomization_to_scan'] = merged_df.apply(get_scr_days, axis=1)

print(merged_df[['pid', 'timepoint', 'days_from_randomization_to_scan']])

print(merged_df['days_from_randomization_to_scan'].notna().sum())

         pid  timepoint  days_from_randomization_to_scan
0     100005          2                            746.0
1     100032          2                            702.0
2     100043          2                            722.0
3     100085          2                            916.0
4     100217          2                            716.0
...      ...        ...                              ...
2323  200988          2                            771.0
2324  201769          0                              0.0
2325  214683          0                              0.0
2326  101066          0                             13.0
2327  208792          0                              0.0

[2328 rows x 3 columns]
2325


In [183]:

# Step 1: Identify rows in merged_df where 'days_from_randomization_to_scan' is NaN
nan_rows = merged_df[merged_df['days_from_randomization_to_scan'].isna()]

# Step 2: Extract the corresponding 'pid' values from those rows
nan_pids = nan_rows['pid'].unique()

# Step 3: Filter clinical_df using these 'pid' values
corresponding_clinical_rows = clinical_df[clinical_df['pid'].isin(nan_pids)]

# Print the corresponding rows in clinical_df
print("Corresponding rows in clinical_df:")
print(corresponding_clinical_rows[['pid', 'scr_days0', 'scr_days1', 'scr_days2', 'candx_days']])


# Print the pid and timepoint columns of these rows in merged_df
print("\nPID and Timepoint columns in merged_df where 'days_from_randomization_to_scan' is NaN:")
print(nan_rows[['pid', 'timepoint']])

Corresponding rows in clinical_df:
            pid  scr_days0  scr_days1  scr_days2  candx_days
6494   106498.0       26.0        NaN        NaN        98.0
17610  117618.0        NaN      507.0        NaN       960.0
20559  120570.0        NaN        NaN      757.0       845.0

PID and Timepoint columns in merged_df where 'days_from_randomization_to_scan' is NaN:
         pid  timepoint
484   120570          0
1199  106498          1
1421  117618          0


In [184]:
### For 3 cases with no time to scan info, use timepoint * 365 as an approximation 

def get_scr_days_approx(row): 
    timepoint_col = f'scr_days{row["timepoint"]}'
    if pd.notna(row[timepoint_col]):
        return row[timepoint_col]
    else:
        # Use timepoint * 365 as an approximation when the scr_days value is missing
        return row['timepoint'] * 365

In [189]:
merged_df_approx = pd.merge(final_reduced_nlst_df, clinical_df, on='pid', how='left')
merged_df_approx['days_from_randomization_to_scan'] = merged_df_approx.apply(get_scr_days_approx, axis=1)

# Print the resulting DataFrame
print(merged_df_approx[['pid', 'timepoint', 'days_from_randomization_to_scan']])

# Print the count of non-NaN values in the 'days_from_randomization_to_scan' column
print("Count of non-NaN values in 'days_from_randomization_to_scan':")
print(merged_df_approx['days_from_randomization_to_scan'].notna().sum())

# Find and print the corresponding rows in clinical_df where merged_df_approx has NaN in 'days_from_randomization_to_scan'
nan_rows = merged_df_approx[merged_df_approx['days_from_randomization_to_scan'].isna()]
nan_pids = nan_rows['pid'].unique()
corresponding_clinical_rows = clinical_df[clinical_df['pid'].isin(nan_pids)]

print("Corresponding rows in clinical_df:")
print(corresponding_clinical_rows[['pid', 'scr_days0', 'scr_days1', 'scr_days2', 'candx_days']])

# Print the pid and timepoint columns of these rows in merged_df_approx
print("\nPID and Timepoint columns in merged_df_approx where 'days_from_randomization_to_scan' is NaN:")
print(nan_rows[['pid', 'timepoint']])

         pid  timepoint  days_from_randomization_to_scan
0     100005          2                            746.0
1     100032          2                            702.0
2     100043          2                            722.0
3     100085          2                            916.0
4     100217          2                            716.0
...      ...        ...                              ...
2323  200988          2                            771.0
2324  201769          0                              0.0
2325  214683          0                              0.0
2326  101066          0                             13.0
2327  208792          0                              0.0

[2328 rows x 3 columns]
Count of non-NaN values in 'days_from_randomization_to_scan':
2328
Corresponding rows in clinical_df:
Empty DataFrame
Columns: [pid, scr_days0, scr_days1, scr_days2, candx_days]
Index: []

PID and Timepoint columns in merged_df_approx where 'days_from_randomization_to_scan' is NaN:
Empty Da

In [190]:
# Filter rows in clinical_df where 'conflc' is 1
lung_cancer_cases = clinical_df[clinical_df['conflc'] == 1]

# Check if all 'lung_cancer_cases' have a non-NaN value in 'candx_days'
all_have_candx_days = lung_cancer_cases['candx_days'].notna().all()

# Print the result
if all_have_candx_days:
    print("All rows with 'conflc' being 1 have a non-NaN value for 'candx_days'.")
else:
    print("Some rows with 'conflc' being 1 have a NaN value for 'candx_days'.")
    
    # Optionally, print the rows with NaN values in 'candx_days'
    nan_candx_days = lung_cancer_cases[lung_cancer_cases['candx_days'].isna()]
    print("Rows with NaN 'candx_days':")
    print(nan_candx_days)


All rows with 'conflc' being 1 have a non-NaN value for 'candx_days'.


In [191]:
# Filter rows in clinical_df where 'candx_days' is not NaN
non_nan_candx_days_cases = clinical_df[clinical_df['candx_days'].notna()]

# Check if all non-NaN 'candx_days' have 'conflc' equal to 1
all_candx_days_have_conflc_1 = (non_nan_candx_days_cases['conflc'] == 1).all()

# Print the result
if all_candx_days_have_conflc_1:
    print("All non-NaN 'candx_days' values have 'conflc' equal to 1.")
else:
    print("Some non-NaN 'candx_days' values do not have 'conflc' equal to 1.")
    
    # Optionally, print the rows where 'candx_days' is non-NaN and 'conflc' is not 1
    mismatched_cases = non_nan_candx_days_cases[non_nan_candx_days_cases['conflc'] != 1]
    print("Rows with non-NaN 'candx_days' and 'conflc' not equal to 1:")
    print(mismatched_cases)

All non-NaN 'candx_days' values have 'conflc' equal to 1.


In [206]:
merged_df_approx['days_from_scan_to_diagnosis'] = merged_df_approx.apply(
    lambda row: row['candx_days'] - row['days_from_randomization_to_scan'] if pd.notna(row['candx_days']) else pd.NA,
    axis=1
)

# Print the resulting DataFrame to check the new column
print(merged_df_approx[['pid', 'timepoint', 'candx_days', 'days_from_randomization_to_scan', 'days_from_scan_to_diagnosis']])

# Check the number of NaN values for days_from_scan_to_diagnosis and candx_days
nan_days_from_scan_to_diagnosis = merged_df_approx['days_from_scan_to_diagnosis'].isna().sum()
nan_candx_days = merged_df_approx['candx_days'].isna().sum()

print(f"Number of NaN values in 'days_from_scan_to_diagnosis': {nan_days_from_scan_to_diagnosis}")
print(f"Number of NaN values in 'original_candx_days': {nan_candx_days}")

# Check if these patient ids match 
# Get the PIDs where 'days_from_scan_to_diagnosis' is NaN
nan_days_from_scan_to_diagnosis_pids = merged_df_approx[merged_df_approx['days_from_scan_to_diagnosis'].isna()]['pid']

# Get the PIDs where 'candx_days' is NaN
nan_candx_days_pids = merged_df_approx[merged_df_approx['candx_days'].isna()]['pid']

# Check if all PIDs with NaN 'days_from_scan_to_diagnosis' match those with NaN 'candx_days'
matching_pids = nan_days_from_scan_to_diagnosis_pids.isin(nan_candx_days_pids)

if matching_pids.all():
    print("All PIDs with NaN 'days_from_scan_to_diagnosis' also have NaN 'candx_days'.")
    
    num_nan_rows = len(nan_days_from_scan_to_diagnosis_pids)
    total_rows = len(merged_df_approx)
    percentage_nan_rows = (num_nan_rows / total_rows) * 100
    
    print(f"Percentage of NaN rows over total rows: {percentage_nan_rows:.2f}%")
else:
    print("Some PIDs with NaN 'days_from_scan_to_diagnosis' do not have NaN 'candx_days'.")
    
    mismatched_pids = nan_days_from_scan_to_diagnosis_pids[~matching_pids]
    print("Mismatched PIDs (NaN in 'days_from_scan_to_diagnosis' but not in 'candx_days'):")
    print(mismatched_pids)

         pid  timepoint  candx_days  days_from_randomization_to_scan  \
0     100005          2         NaN                            746.0   
1     100032          2         NaN                            702.0   
2     100043          2         NaN                            722.0   
3     100085          2         NaN                            916.0   
4     100217          2         NaN                            716.0   
...      ...        ...         ...                              ...   
2323  200988          2         NaN                            771.0   
2324  201769          0        17.0                              0.0   
2325  214683          0       124.0                              0.0   
2326  101066          0        74.0                             13.0   
2327  208792          0        91.0                              0.0   

     days_from_scan_to_diagnosis  
0                           <NA>  
1                           <NA>  
2                           <N

In [207]:
merged_df_approx

Unnamed: 0,AccessionNumber,SliceThickness,ConvolutionKernel,ImageOrientationPatient,pid,index,Directory,Orientation,Year,timepoint,...,progsite_lymph_n3_days,progsite_unk_days,last_progfree_days,confirmed_candxdays1,confirmed_candxdays2,confirmed_candxdays3,confirmed_candxdays4,days_from_randomization_to_scan,time_from_scan_to_diagnosis,days_from_scan_to_diagnosis
0,900447,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100005,4,/workspace/home/leonardgarcia/sybil_test/10000...,axial,2001,2,...,,,,,,,,746.0,,
1,156408,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100032,14,/workspace/home/leonardgarcia/sybil_test/10003...,axial,2001,2,...,,,,142.0,,,,702.0,,
2,128341,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100043,22,/workspace/home/leonardgarcia/sybil_test/10004...,axial,2001,2,...,,,,,,,,722.0,,
3,205739,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100085,47,/workspace/home/leonardgarcia/sybil_test/10008...,axial,2001,2,...,,,,,,,,916.0,,
4,555313,2.5,STANDARD,"[1.000000, 0.000000, 0.000000, 0.000000, 1.000...",100217,87,/workspace/home/leonardgarcia/sybil_test/10021...,axial,2001,2,...,,,,2094.0,,,,716.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2323,,1.0,B30f,"[1, 0, 0, 0, 1, 0]",200988,14920,/workspace/home/leonardgarcia/sybil_test/20098...,axial,2001,2,...,,,,,,,,771.0,,
2324,,1.0,B30f,"[1, 0, 0, 0, 1, 0]",201769,15047,/workspace/home/leonardgarcia/sybil_test/20176...,axial,1999,0,...,,,552.0,17.0,,,,0.0,17.0,17.0
2325,,1.0,B30f,"[1, 0, 0, 0, 1, 0]",214683,18610,/workspace/home/leonardgarcia/sybil_test/21468...,axial,1999,0,...,,,1222.0,124.0,,,,0.0,124.0,124.0
2326,556382,2.0,B50f,"[1, 0, 0, 0, 1, 0]",101066,491,/workspace/home/leonardgarcia/sybil_test/10106...,axial,1999,0,...,,,2192.0,74.0,,,,13.0,61.0,61.0


In [212]:
# Select the relevant columns
cleaned_df = merged_df_approx[['pid', 'AccessionNumber', 'Directory', 'Year', 'timepoint', 'days_from_randomization_to_scan', 'days_from_scan_to_diagnosis', 'candx_days']]

# Rename 'candx_days' to 'days_from_randomization_to_diagnosis'
cleaned_df = cleaned_df.rename(columns={'candx_days': 'days_from_randomization_to_diagnosis'})

# Add 'confirmed_lc' column: 1 if 'days_from_randomization_to_diagnosis' is not NaN, 0 otherwise
cleaned_df['confirmed_lc'] = cleaned_df['days_from_randomization_to_diagnosis'].notna().astype(int)

# Print the cleaned DataFrame to verify the changes
print(cleaned_df)

         pid AccessionNumber  \
0     100005          900447   
1     100032          156408   
2     100043          128341   
3     100085          205739   
4     100217          555313   
...      ...             ...   
2323  200988                   
2324  201769                   
2325  214683                   
2326  101066          556382   
2327  208792                   

                                              Directory  Year  timepoint  \
0     /workspace/home/leonardgarcia/sybil_test/10000...  2001          2   
1     /workspace/home/leonardgarcia/sybil_test/10003...  2001          2   
2     /workspace/home/leonardgarcia/sybil_test/10004...  2001          2   
3     /workspace/home/leonardgarcia/sybil_test/10008...  2001          2   
4     /workspace/home/leonardgarcia/sybil_test/10021...  2001          2   
...                                                 ...   ...        ...   
2323  /workspace/home/leonardgarcia/sybil_test/20098...  2001          2   
2324  /

In [213]:
cleaned_df.to_csv('/workspace/home/tengyuezhang/sybil_cect/data/nlst_labels.csv', index=False)
