In [None]:
import pandas as pd
import re

# Read the CheXpert label table and the filtered train listing.
# study_id is cast to str in both frames so they share one key dtype.
labels_df = pd.read_csv("../mimic-cxr-2.0.0-chexpert.csv").astype({'study_id': str})
paths_df = pd.read_csv("../mimic-cxr-2.0.0-train-filtered.csv").astype({'study_id': str})

# Names of the CheXpert label columns processed later in the notebook
chexpert_labels = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Enlarged Cardiomediastinum',
    'Fracture',
    'Lung Lesion',
    'Lung Opacity',
    'No Finding',
    'Pleural Other',
    'Pleural Effusion',
    'Pneumonia',
    'Pneumothorax',
    'Support Devices',
]



In [4]:
# Work on the first 2000 rows of the listing only.
paths_df = paths_df.head(2000)
paths_df

Unnamed: 0,study_id,img_path,report
0,50000014,p11/p11941242/s50000014/dffc8ab2-ff37704f-2fb2...,Lung volumes are low. Retrocardiac opacity wi...
1,50000186,p14/p14444780/s50000186/93bcf53f-7c91b330-3738...,The cardiomediastinal and hilar contours are n...
2,50000198,p16/p16548129/s50000198/b66847d6-6848ea1f-58aa...,Heart size is normal. The mediastinal and hil...
3,50000230,p11/p11550925/s50000230/7e962a95-d661c0db-4769...,PA and lateral views of the chest are provided...
4,50000319,p13/p13797827/s50000319/3e9484b1-b246ce2b-9ce3...,The lungs are moderately inflated. There is p...
...,...,...,...
1995,50173649,p17/p17554404/s50173649/cf4bfc01-ac309212-3f91...,The left internal jugular central venous cathe...
1996,50173867,p14/p14289800/s50173867/a57e70d7-8f566f04-77c4...,Frontal and lateral chest radiographs demonstr...
1997,50173902,p16/p16955709/s50173902/8d3037e3-e1568365-bdf0...,PA and lateral views of the chest provided dem...
1998,50173951,p14/p14258949/s50173951/9834a3be-9c8bf2f3-25e1...,PA and lateral views of the chest provided. L...


In [6]:
# img_path can hold several ';'-separated paths; emit one record per
# non-blank path, with surrounding whitespace trimmed.
expanded_paths = [
    {'study_id': study_id, 'Image': pathname.strip()}
    for study_id, img_paths in zip(paths_df['study_id'], paths_df['img_path'])
    for pathname in img_paths.split(';')
    if pathname.strip()
]

expanded_paths_df = pd.DataFrame(expanded_paths)
expanded_paths_df['Image']

0       p11/p11941242/s50000014/dffc8ab2-ff37704f-2fb2...
1       p14/p14444780/s50000186/93bcf53f-7c91b330-3738...
2       p16/p16548129/s50000198/b66847d6-6848ea1f-58aa...
3       p11/p11550925/s50000230/7e962a95-d661c0db-4769...
4       p13/p13797827/s50000319/3e9484b1-b246ce2b-9ce3...
                              ...                        
2262    p17/p17554404/s50173649/cf4bfc01-ac309212-3f91...
2263    p14/p14289800/s50173867/a57e70d7-8f566f04-77c4...
2264    p16/p16955709/s50173902/8d3037e3-e1568365-bdf0...
2265    p14/p14258949/s50173951/9834a3be-9c8bf2f3-25e1...
2266    p11/p11600594/s50174015/6dd0e1c7-7f10535a-0e68...
Name: Image, Length: 2267, dtype: object

In [7]:
# Inner join: keep only studies present in both the label table and the
# expanded image list; sort=True orders the result by the join key.
merged_df = pd.merge(
    labels_df,
    expanded_paths_df,
    how='inner',
    on='study_id',
    sort=True,
)
merged_df

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,Image
0,11941242,50000014,-1.0,1.0,,0.0,,,,1.0,,,,-1.0,,,p11/p11941242/s50000014/dffc8ab2-ff37704f-2fb2...
1,14444780,50000186,1.0,,,,,,,,,1.0,,-1.0,,,p14/p14444780/s50000186/93bcf53f-7c91b330-3738...
2,16548129,50000198,,,,,,,,,1.0,,,,,,p16/p16548129/s50000198/b66847d6-6848ea1f-58aa...
3,11550925,50000230,1.0,,,,,,,,,,,,,,p11/p11550925/s50000230/7e962a95-d661c0db-4769...
4,13797827,50000319,,1.0,,-1.0,,,,1.0,,,,,,,p13/p13797827/s50000319/3e9484b1-b246ce2b-9ce3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2262,17554404,50173649,,,,1.0,,,,1.0,,1.0,,1.0,0.0,1.0,p17/p17554404/s50173649/cf4bfc01-ac309212-3f91...
2263,14289800,50173867,,,0.0,0.0,,,,,1.0,,,,,,p14/p14289800/s50173867/a57e70d7-8f566f04-77c4...
2264,16955709,50173902,,,,,,,,,1.0,,,,,,p16/p16955709/s50173902/8d3037e3-e1568365-bdf0...
2265,14258949,50173951,,,,,,,,,,,,1.0,,,p14/p14258949/s50173951/9834a3be-9c8bf2f3-25e1...


In [8]:
# Binarise every CheXpert label column that is present in merged_df:
# uncertain (-1) and missing (NaN) are both mapped to negative (0), then the
# column is cast to int.
for label in chexpert_labels:
    if label in merged_df.columns:
        merged_df[label] = merged_df[label].replace(-1, 0).fillna(0).astype(int)

# Derive the image identifier: the file name in `Image` without its ".jpg"
# extension. The extension is excluded by the capture group itself, so the
# result does not depend on the `regex` default of Series.str.replace, which
# differs between pandas versions (a regex '.jpg' would match any character
# followed by "jpg"). Rows whose path has no ".jpg" file name get NaN.
# NOTE(review): the column name 'dicon_id' looks like a typo for 'dicom_id';
# it is kept as-is because files written from this frame carry that name.
dicon_id = merged_df['Image'].str.extract(r'([-\w]+)\.jpg', expand=False)

# DataFrame.insert raises ValueError if the column already exists, which made
# this cell fail when re-run; overwrite in place in that case.
if 'dicon_id' in merged_df.columns:
    merged_df['dicon_id'] = dicon_id
else:
    merged_df.insert(0, 'dicon_id', dicon_id)


For test data: write the merged table to `inference.csv`.

In [9]:
# Write the merged label/image table to inference.csv, omitting the index.
merged_df.to_csv("inference.csv", index=False)

For train and validation data: split the merged table 80/20 and write `mimic_train.csv` and `mimic_valid.csv`.

In [59]:
from sklearn.model_selection import train_test_split

# Split by patient (subject_id), not by image row. merged_df holds one row per
# image, so a study (and a patient) can span several rows; a row-level split
# can put images of the same patient in both train and validation, leaking
# information into the validation set.
#
# Stratification uses a per-patient flag: 1 if any of the patient's rows has
# 'No Finding' == 1, else 0. Assumes 'No Finding' has already been binarised
# to 0/1 by the label-cleaning cell.
subject_no_finding = merged_df.groupby('subject_id')['No Finding'].max()

train_subjects, valid_subjects = train_test_split(
    subject_no_finding.index.to_numpy(),
    test_size=0.2,  # 20% of patients go to validation
    stratify=subject_no_finding.to_numpy(),
    random_state=42,  # for reproducibility
)

# Select each split's rows, then shuffle them reproducibly so the written
# files are not ordered by study_id.
train_df = merged_df[merged_df['subject_id'].isin(train_subjects)].sample(
    frac=1, random_state=42
)
valid_df = merged_df[merged_df['subject_id'].isin(valid_subjects)].sample(
    frac=1, random_state=42
)

# Every row must land in exactly one split and no patient may be shared.
assert len(train_df) + len(valid_df) == len(merged_df)
assert set(train_df['subject_id']).isdisjoint(valid_df['subject_id'])

train_df.to_csv('mimic_train.csv', index=False)
valid_df.to_csv('mimic_valid.csv', index=False)