In [3]:
import pandas as pd
import os
import pydicom
from pathlib import Path
from tqdm import tqdm

# Read the original CSV
df = pd.read_csv(r'E:\kaggle-rsna-data_processing3\series_metadata_224.csv')

# 1. Remove invalid series
INVALID_SERIES = [
    "1.2.826.0.1.3680043.8.498.35204126697881966597435252550544407444",
    "1.2.826.0.1.3680043.8.498.11145695452143851764832708867797988068",
    "1.2.826.0.1.3680043.8.498.12937082136541515013380696257898978214",
    "1.2.826.0.1.3680043.8.498.86840850085811129970747331978337342341",
    "1.2.826.0.1.3680043.8.498.10733938921373716882398209756836684843",
    "1.2.826.0.1.3680043.8.498.11292203154407642658894712229998766945",
    "1.2.826.0.1.3680043.8.498.74390569791112039529514861261033590424",
    "1.2.826.0.1.3680043.8.498.99892390884723813599532075083872271516",
    "1.2.826.0.1.3680043.8.498.99421822954919332641371697175982753182",
    "1.2.826.0.1.3680043.8.498.93005379507993862369794871518209403819",
    "1.2.826.0.1.3680043.8.498.87133443408651185245864983172506753347",
    "1.2.826.0.1.3680043.8.498.85042275841446604538710616923989532822",
    "1.2.826.0.1.3680043.8.498.75294325392457179365040684378207706807",
    "1.2.826.0.1.3680043.8.498.73348230187682293339845869829853553626",
    "1.2.826.0.1.3680043.8.498.34908224715351895924870591631151425521",
    "1.2.826.0.1.3680043.8.498.13299935636593758131187104226860563078",
    "1.2.826.0.1.3680043.8.498.12780687841924878965940656634052376723",
    "1.2.826.0.1.3680043.8.498.12285352638636973719542944532929535087",
    "1.2.826.0.1.3680043.8.498.10820472882684587647235099308830427864",
    "1.2.826.0.1.3680043.8.498.11019101980573889157112037207769236902",
    "1.2.826.0.1.3680043.8.498.13356606276376861530476731358572238037",
    "1.2.826.0.1.3680043.8.498.81867770017494605078034950552739870155"
]

df = df[~df['SeriesInstanceUID'].isin(INVALID_SERIES)]

# 2. Define new coronal and sagittal series
new_coronal = [
    "1.2.826.0.1.3680043.8.498.10862138275035843887055171875480735964",
    "1.2.826.0.1.3680043.8.498.11396958000946738156009956455739305762",
    "1.2.826.0.1.3680043.8.498.11915319973409844345177713085783065237",
    "1.2.826.0.1.3680043.8.498.12163038646729971461006564302880090481",
    "1.2.826.0.1.3680043.8.498.12754621213831983134209152548119057365",
    "1.2.826.0.1.3680043.8.498.13001629435974764211403087597568806527",
    "1.2.826.0.1.3680043.8.498.13334658148703615392388818414999249292",
    "1.2.826.0.1.3680043.8.498.18831402822041226140887003611379903167",
    "1.2.826.0.1.3680043.8.498.29351212950805314631667854934458469754",
    "1.2.826.0.1.3680043.8.498.35123157147325830213906326339070528034",
    "1.2.826.0.1.3680043.8.498.36861937197087749960171145883205456895",
    "1.2.826.0.1.3680043.8.498.40006562159206402632477316663171307697",
    "1.2.826.0.1.3680043.8.498.46538678358294253983538640149161986964",
    "1.2.826.0.1.3680043.8.498.49672398100697832208944634471809461961",
    "1.2.826.0.1.3680043.8.498.50668879928342593291812487079769153076",
    "1.2.826.0.1.3680043.8.498.50916621085656781540278427064467759139",
    "1.2.826.0.1.3680043.8.498.67129993505475797984506180089478722899",
    "1.2.826.0.1.3680043.8.498.85431182782929944864196573042506906105",
    "1.2.826.0.1.3680043.8.498.85694228896758469614431673786651945288",
    "1.2.826.0.1.3680043.8.498.91280907751913581577764343702856084945",
    "1.2.826.0.1.3680043.8.498.93156694293030030637766074579373694728",
    "1.2.826.0.1.3680043.8.498.99804081131933373817667779922320327920"
]

new_sagittal = [
    "1.2.826.0.1.3680043.8.498.11887329867812275491160566603814454129",
    "1.2.826.0.1.3680043.8.498.21275250875812455389777450891502640750",
    "1.2.826.0.1.3680043.8.498.27235396640484934153639773593945542938",
    "1.2.826.0.1.3680043.8.498.35440393683691371542782507480292365786",
    "1.2.826.0.1.3680043.8.498.56222999331067503423242588210365055932"
]

# 3. Scan series directory to find all multi-frame series
series_base_dir = Path(r'E:\data_old\series')
all_multiframe = []

print("Scanning for multi-frame series...")
for series_dir in tqdm(series_base_dir.iterdir()):
    if series_dir.is_dir():
        series_uid = series_dir.name
        dicom_files = list(series_dir.glob('*.dcm'))
        
        # Multi-frame series have exactly one DICOM file
        if len(dicom_files) == 1:
            all_multiframe.append(series_uid)

print(f"Found {len(all_multiframe)} total multi-frame series")

# Axial series = all multi-frame MINUS coronal and sagittal
new_axial = [uid for uid in all_multiframe if uid not in new_coronal and uid not in new_sagittal]

print(f"Multi-frame breakdown:")
print(f"  - Coronal: {len(new_coronal)}")
print(f"  - Sagittal: {len(new_sagittal)}")
print(f"  - Axial: {len(new_axial)}")

# Function to get modality from first DICOM file
def get_modality(series_uid):
    series_dir = Path(rf'E:\data_old\series\{series_uid}')
    if not series_dir.exists():
        print(f"Warning: Directory not found for {series_uid}")
        return None
    
    # Find first DICOM file
    dicom_files = list(series_dir.glob('*.dcm'))
    if not dicom_files:
        print(f"Warning: No DICOM files found for {series_uid}")
        return None
    
    try:
        dcm = pydicom.dcmread(dicom_files[0], stop_before_pixels=True)
        return dcm.Modality if hasattr(dcm, 'Modality') else None
    except Exception as e:
        print(f"Error reading DICOM for {series_uid}: {e}")
        return None

# Create new rows for coronal series
new_coronal_rows = []
for series_uid in new_coronal:
    modality = get_modality(series_uid)
    new_coronal_rows.append({
        'SeriesInstanceUID': series_uid,
        'RowDirection': 'RL',
        'ColDirection': 'HF',
        'ZDirection': 'AP',
        'Modality': modality
    })

# Create new rows for sagittal series
new_sagittal_rows = []
for series_uid in new_sagittal:
    modality = get_modality(series_uid)
    new_sagittal_rows.append({
        'SeriesInstanceUID': series_uid,
        'RowDirection': 'AP',
        'ColDirection': 'HF',
        'ZDirection': 'RL',
        'Modality': modality
    })

# Create new rows for axial series (multi-frame series not in coronal/sagittal)
new_axial_rows = []
for series_uid in new_axial:
    modality = get_modality(series_uid)
    new_axial_rows.append({
        'SeriesInstanceUID': series_uid,
        'RowDirection': 'RL',
        'ColDirection': 'AP',
        'ZDirection': 'FH',
        'Modality': modality
    })

# Combine all new rows
new_rows_df = pd.DataFrame(new_coronal_rows + new_sagittal_rows + new_axial_rows)

# Combine with existing dataframe
df_combined = pd.concat([df, new_rows_df], ignore_index=True)

# Sort by SeriesInstanceUID to maintain order
df_combined = df_combined.sort_values('SeriesInstanceUID').reset_index(drop=True)

# Save to new CSV
df_combined = df_combined[~df_combined['SeriesInstanceUID'].isin(INVALID_SERIES)]
df_combined.to_csv('./metadata-new.csv', index=False)

print(f"\n{'='*60}")
print(f"Processing complete!")
print(f"{'='*60}")
print(f"Original rows: {len(df)}")
print(f"New coronal rows: {len(new_coronal_rows)}")
print(f"New sagittal rows: {len(new_sagittal_rows)}")
print(f"New axial rows (multi-frame): {len(new_axial_rows)}")
print(f"Total rows after combining: {len(df_combined)}")
print(f"\nSaved to ./metadata-new.csv")

Scanning for multi-frame series...


4348it [00:02, 1648.96it/s]


Found 322 total multi-frame series
Multi-frame breakdown:
  - Coronal: 22
  - Sagittal: 5
  - Axial: 295

Processing complete!
Original rows: 4008
New coronal rows: 22
New sagittal rows: 5
New axial rows (multi-frame): 295
Total rows after combining: 4326

Saved to ./metadata-new.csv
