In [81]:
import pydicom 
import os 
import pandas as pd 
import glob
import re
from datetime import datetime
import json
from tqdm import tqdm


# Get a list of directories for series that Sybil used as their test set

In [116]:
DATA_ROOT = '/workspace/data/lung/nlst/NLST_CT_raw/data'
FINAL_CORRECTED_PATH = '/workspace/home/tengyuezhang/sybil_cect/data/nlst_baseline/final_nlst_baseline_cases.csv'

In [4]:
NLST_ANNOT_PATH = '/workspace/home/tengyuezhang/sybil_cect/data/nlst_baseline/nlst_annotations.json'
PID2SPLIT_PATH = '/workspace/home/tengyuezhang/sybil_cect/data/nlst_baseline/pid2split.csv'

with open(NLST_ANNOT_PATH, 'r') as file:
    nlst_annot = json.load(file)
pid2split_df = pd.read_csv(PID2SPLIT_PATH)

# Try to fix cases with incorrect paths with accession number

In [24]:
INCORRECT_CASES = "/workspace/home/tengyuezhang/sybil_cect/data/nlst_baseline/nlst_baseline_cases_with_incorrect_paths.csv"
incorrect_df = pd.read_csv(INCORRECT_CASES)
incorrect_df

Unnamed: 0,pid,event,years_to_event,years_from_scan_to_diagnosis,years_from_scan_to_last_followup,Year,timepoint,AccessionNumber,Directory
0,100005,0,5.0,,5.0,2000,1,4.514410e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
1,100085,0,5.0,,6.0,1999,0,5.175680e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
2,101149,0,5.0,,7.0,1999,0,2.798310e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/101...
3,101192,1,2.0,2.0,6.0,1999,0,7.625110e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/101...
4,101506,0,5.0,,6.0,1999,0,9.390540e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/101...
...,...,...,...,...,...,...,...,...,...
218,202143,0,5.0,,6.0,1999,0,1.481356e+15,/workspace/data/lung/nlst/NLST_CT_raw/data/202...
219,214822,1,0.0,0.0,5.0,1999,0,6.110254e+15,/workspace/data/lung/nlst/NLST_CT_raw/data/214...
220,201769,1,0.0,0.0,6.0,1999,0,,/workspace/data/lung/nlst/NLST_CT_raw/data/201...
221,214683,1,0.0,0.0,7.0,1999,0,,/workspace/data/lung/nlst/NLST_CT_raw/data/214...


In [25]:
example = incorrect_df.iloc[0]
directory = example['Directory']
directory

'/workspace/data/lung/nlst/NLST_CT_raw/data/100005/01-02-2000-NA-NLST-LSS-07029/3.000000-1OPAGELS16D3702.514060.00.11.375-97837'

In [29]:
example_dcm_file = '/workspace/data/lung/nlst/NLST_CT_raw/data/100005/1.2.840.113654.2.55.153546750735549044492239658305292872969/1.2.840.113654.2.55.330656333774433317291205463243337143455/000000.dcm'
ds = pydicom.dcmread(example_dcm_file, stop_before_pixels=True)
ds

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 190
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.2.840.113654.2.55.46439685292490426795918354813307298735
(0002, 0010) Transfer Syntax UID                 UI: Explicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.2.40.0.13.1.1.1
(0002, 0013) Implementation Version Name         SH: 'dcm4che-1.4.31'
-------------------------------------------------
(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL']
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.2.840.113654.2.55.46439685292490426795918354813307298735
(0008, 0020) Study Date                   

In [87]:
# search by pid and accession number 
def find_and_check_accession(data_root, pid, accession_number):
    """
    Locate the first DICOM file under a PID directory and check if the accession number matches.

    Parameters:
        data_root (str): Root directory containing all PIDs.
        pid (str): The PID to search within.
        accession_number (str): The accession number to verify in the DICOM header.

    Returns:
        str: Path to the DICOM file if the accession number matches, otherwise None.
    """
    pid_dir = os.path.join(data_root, pid)
    # print(pid_dir)
    if not os.path.exists(pid_dir):
        print(f"PID directory {pid_dir} does not exist.")
        return None

    for root, dirs, files in os.walk(pid_dir):
        for file_name in files:
            if file_name.endswith(".dcm"):
                file_path = os.path.join(root, file_name)
                try:
                    ds = pydicom.dcmread(file_path, stop_before_pixels=True)
                    if hasattr(ds, "AccessionNumber"):
                        # print(ds.AccessionNumber)
                        if ds.AccessionNumber == accession_number:
                            # print('found')
                            return root
                except Exception as e:
                    print(f"Error reading DICOM file {file_path}: {e}")
    return None

In [75]:
result_path = find_and_check_accession(DATA_ROOT, str(example['pid']), str(int(example['AccessionNumber'])))

/workspace/data/lung/nlst/NLST_CT_raw/data/100005
found


In [76]:
str(int(example['AccessionNumber']))

'451441'

In [77]:
result_path

'/workspace/data/lung/nlst/NLST_CT_raw/data/100005/1.2.840.113654.2.55.10251457513072085864761297205518907029/1.2.840.113654.2.55.148981425884825940179466259981962976653'

In [88]:
corrected_paths = []

for _, row in tqdm(incorrect_df.iterrows(), total=len(incorrect_df), desc="Correcting paths"):
    pid = str(row['pid'])
    accession_number = str(int(row['AccessionNumber']))
    corrected_path = find_and_check_accession(DATA_ROOT, pid, accession_number)
    corrected_paths.append(corrected_path)

# Add corrected paths to a new column
incorrect_df['CorrectedPath'] = corrected_paths
incorrect_df

Correcting paths:  34%|███████████████████████████████████████████████                                                                                             | 75/223 [01:49<03:36,  1.46s/it]


ValueError: cannot convert float NaN to integer

# Some cases have missing accession num. Try path matching using suffix instead.

In [89]:
example['Directory']

'/workspace/data/lung/nlst/NLST_CT_raw/data/100005/01-02-2000-NA-NLST-LSS-07029/3.000000-1OPAGELS16D3702.514060.00.11.375-97837'

In [90]:
def extract_suffixes(path):
    """
    Extract time_suffix and series_suffix from the given path.

    Parameters:
        path (str): The file path.

    Returns:
        tuple: (time_suffix, series_suffix)
    """
    # Split the path into components
    parts = os.path.normpath(path).split(os.sep)
    
    # Extract time_suffix from the second-to-last directory
    # Example: "01-02-2000-NA-NLST-LSS-07029"
    time_part = parts[-2]  # Second-to-last directory
    time_suffix = time_part.split('-')[-1]  # Last part after splitting by '-'

    # Extract series_suffix from the last directory
    # Example: "3.000000-1OPAGELS16D3702.514060.00.11.375-97837"
    series_part = parts[-1]  # Last directory
    series_suffix = series_part.split('-')[-1]  # Last part after splitting by '-'

    return time_suffix, series_suffix

In [91]:
time_suffix, series_suffix = extract_suffixes(row['Directory'])

In [92]:
time_suffix

'80333'

In [93]:
series_suffix

'66541'

In [104]:
# in the target {DATA_ROOT}/{row['pid']} dirctory, 
# search for {DATA_ROOT}/{row['pid']}/*{time_suffix}/*{series_suffix}
def search_target_directory(data_root, pid, time_suffix, series_suffix):
    """
    params:
        data_root (str): NLST_CT_raw/data
        pid (str): 6-digit pid in string format 
        time_suffix (str): The time suffix to match.
        series_suffix (str): The series suffix to match.

    Returns:
        list: A list of matching paths.
    """
    pid_dir = os.path.join(data_root, pid)
    if not os.path.exists(pid_dir):
        print(f"PID directory {pid_dir} does not exist.")
        return []
    
    # use glob to search 
    search_pattern = os.path.join(pid_dir, f"*{time_suffix}*", f"*{series_suffix}*")
    matching_paths = glob.glob(search_pattern)
    if len(matching_paths) == 0: 
        print(f'Warning: path not found for {pid}, {time_suffix}, {series_suffix}')
    elif len(matching_paths) > 1: 
        print(f'Warning: multiple matches found for {pid}, {time_suffix}, {series_suffix}')
    return matching_paths

In [101]:
DATA_ROOT = "/workspace/data/lung/nlst/NLST_CT_raw/data/"
row = {'pid': '100005'}
time_suffix = "07029"
series_suffix = "97837"

matching_paths = search_target_directory(DATA_ROOT, row['pid'], time_suffix, series_suffix)
matching_paths

['/workspace/data/lung/nlst/NLST_CT_raw/data/100005/1.2.840.113654.2.55.10251457513072085864761297205518907029/1.2.840.113654.2.55.17324290215190661437113320769488297837']

In [106]:
corrected_paths = []

for _, row in tqdm(incorrect_df.iterrows(), total=len(incorrect_df), desc="Correcting paths"):
    pid = str(row['pid'])
    time_suffix, series_suffix = extract_suffixes(row['Directory'])
    paths_found = search_target_directory(DATA_ROOT, pid, time_suffix, series_suffix)
    if len(paths_found) > 0: 
        corrected_paths.append(paths_found[0])
    else: 
        corrected_paths.append(None)
incorrect_df['CorrectedPath'] = corrected_paths
incorrect_df

Correcting paths:  52%|█████████████████████████████████████████████████████▋                                                  | 115/223 [00:00<00:00, 1143.49it/s]



Correcting paths: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 223/223 [00:00<00:00, 316.66it/s]


Unnamed: 0,pid,event,years_to_event,years_from_scan_to_diagnosis,years_from_scan_to_last_followup,Year,timepoint,AccessionNumber,Directory,CorrectedPath
0,100005,0,5.0,,5.0,2000,1,4.514410e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
1,100085,0,5.0,,6.0,1999,0,5.175680e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
2,101149,0,5.0,,7.0,1999,0,2.798310e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/101...,/workspace/data/lung/nlst/NLST_CT_raw/data/101...
3,101192,1,2.0,2.0,6.0,1999,0,7.625110e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/101...,/workspace/data/lung/nlst/NLST_CT_raw/data/101...
4,101506,0,5.0,,6.0,1999,0,9.390540e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/101...,/workspace/data/lung/nlst/NLST_CT_raw/data/101...
...,...,...,...,...,...,...,...,...,...,...
218,202143,0,5.0,,6.0,1999,0,1.481356e+15,/workspace/data/lung/nlst/NLST_CT_raw/data/202...,/workspace/data/lung/nlst/NLST_CT_raw/data/202...
219,214822,1,0.0,0.0,5.0,1999,0,6.110254e+15,/workspace/data/lung/nlst/NLST_CT_raw/data/214...,/workspace/data/lung/nlst/NLST_CT_raw/data/214...
220,201769,1,0.0,0.0,6.0,1999,0,,/workspace/data/lung/nlst/NLST_CT_raw/data/201...,/workspace/data/lung/nlst/NLST_CT_raw/data/201...
221,214683,1,0.0,0.0,7.0,1999,0,,/workspace/data/lung/nlst/NLST_CT_raw/data/214...,/workspace/data/lung/nlst/NLST_CT_raw/data/214...


In [113]:
corrected_df = incorrect_df.drop(columns=['Directory'])
corrected_df = corrected_df.rename(columns={'CorrectedPath': 'Directory'})

# Read the correct df and combine them. Then save. 

In [114]:
correct_df = pd.read_csv("/workspace/home/tengyuezhang/sybil_cect/data/nlst_baseline/nlst_baseline_cases_with_correct_paths.csv")
correct_df

Unnamed: 0,pid,event,years_to_event,years_from_scan_to_diagnosis,years_from_scan_to_last_followup,Year,timepoint,AccessionNumber,Directory
0,100032,0,5.0,,7.0,1999,0,5.708600e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
1,100043,0,5.0,,6.0,1999,0,1.326060e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
2,100217,0,5.0,,5.0,1999,0,5.099390e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
3,100242,1,0.0,0.0,6.0,1999,0,3.961210e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
4,100327,0,5.0,,6.0,1999,0,3.503540e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
...,...,...,...,...,...,...,...,...,...
2100,212439,0,5.0,,6.0,1999,0,1.631518e+15,/workspace/data/lung/nlst/NLST_CT_raw/data/212...
2101,213775,0,5.0,,5.0,1999,0,3.231491e+15,/workspace/data/lung/nlst/NLST_CT_raw/data/213...
2102,216031,0,5.0,,5.0,1999,0,9.562193e+15,/workspace/data/lung/nlst/NLST_CT_raw/data/216...
2103,200988,0,5.0,,6.0,1999,0,,/workspace/data/lung/nlst/NLST_CT_raw/data/200...


In [115]:
stacked_df = pd.concat([correct_df, corrected_df], ignore_index=True)
stacked_df

Unnamed: 0,pid,event,years_to_event,years_from_scan_to_diagnosis,years_from_scan_to_last_followup,Year,timepoint,AccessionNumber,Directory
0,100032,0,5.0,,7.0,1999,0,5.708600e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
1,100043,0,5.0,,6.0,1999,0,1.326060e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
2,100217,0,5.0,,5.0,1999,0,5.099390e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
3,100242,1,0.0,0.0,6.0,1999,0,3.961210e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
4,100327,0,5.0,,6.0,1999,0,3.503540e+05,/workspace/data/lung/nlst/NLST_CT_raw/data/100...
...,...,...,...,...,...,...,...,...,...
2323,202143,0,5.0,,6.0,1999,0,1.481356e+15,/workspace/data/lung/nlst/NLST_CT_raw/data/202...
2324,214822,1,0.0,0.0,5.0,1999,0,6.110254e+15,/workspace/data/lung/nlst/NLST_CT_raw/data/214...
2325,201769,1,0.0,0.0,6.0,1999,0,,/workspace/data/lung/nlst/NLST_CT_raw/data/201...
2326,214683,1,0.0,0.0,7.0,1999,0,,/workspace/data/lung/nlst/NLST_CT_raw/data/214...


# For the missing one, manually changed it to:
### 218476/1.3.6.1.4.1.14519.5.2.1.7009.9004.112710550152264447844493113833/1.3.6.1.4.1.14519.5.2.1.7009.9004.169193826559204135185445821027

In [117]:
stacked_df.to_csv(FINAL_CORRECTED_PATH, index=False)

In [None]:
# 218476/1.3.6.1.4.1.14519.5.2.1.7009.9004.112710550152264447844493113833/1.3.6.1.4.1.14519.5.2.1.7009.9004.169193826559204135185445821027