# Preparing short varied captions from TCGA LUNG clinicalMatrix

This notebook takes age, gender, and smoking history information from the TCGA LUNG clinicalMatrix and generates short, consistent captions following varied templates. For example: 
```
Individual: male, age 66-year-old, smoking: Current Smoker
67-year-old, female, history: Lifelong Non-Smoker
female patient, 74-year-old, Current Reformed Smoker for > 15 yrs
```

In [1]:
import pandas as pd

# import TCGA LUNG clinicalMatrix
lung_clinical = pd.read_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/clinicalMatrix/TCGA.LUNG.sampleMap/LUNG_clinicalMatrix.tsv", sep="\t", index_col=0, header=0)
print("samples, features:", lung_clinical.shape)
lung_clinical

samples, features: (1299, 142)


Unnamed: 0_level_0,ABSOLUTE_Ploidy,ABSOLUTE_Purity,AKT1,ALK_translocation,BRAF,CBL,CTNNB1,Canonical_mut_in_KRAS_EGFR_ALK,Cnncl_mt_n_KRAS_EGFR_ALK_RET_ROS1_BRAF_ERBB2_HRAS_NRAS_AKT1_MAP2,EGFR,...,_GENOMIC_ID_TCGA_LUNG_exp_HiSeqV2_exon,_GENOMIC_ID_TCGA_LUNG_hMethyl27,_GENOMIC_ID_TCGA_LUNG_mutation,_GENOMIC_ID_TCGA_LUNG_exp_HiSeqV2_PANCAN,_GENOMIC_ID_TCGA_LUNG_hMethyl450,_GENOMIC_ID_TCGA_LUNG_gistic2thd,_GENOMIC_ID_TCGA_LUNG_G4502A_07_3,_GENOMIC_ID_TCGA_LUNG_exp_HiSeqV2,_GENOMIC_ID_TCGA_LUNG_gistic2,_GENOMIC_ID_TCGA_LUNG_RPPA_RBN
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-05-4244-01,,,,,,,,,,,...,e6a101b9-61f9-4ed1-a59f-d9db3fdb4555,TCGA-05-4244-01A-01D-1104-05,TCGA-05-4244-01A-01D-1105-08,e6a101b9-61f9-4ed1-a59f-d9db3fdb4555,,TCGA-05-4244-01A-01D-1877-01,TCGA-05-4244-01A-01R-1107-07,e6a101b9-61f9-4ed1-a59f-d9db3fdb4555,TCGA-05-4244-01A-01D-1877-01,TCGA-05-4244-01A-21-2190-20
TCGA-05-4249-01,3.77,0.46,none,,p.A762E,none,none,Y,Y,none,...,d1a8d88d-1708-4959-9695-6f2e67853bd5,TCGA-05-4249-01A-01D-1104-05,TCGA-05-4249-01A-01D-1105-08,d1a8d88d-1708-4959-9695-6f2e67853bd5,,TCGA-05-4249-01A-01D-1877-01,TCGA-05-4249-01A-01R-1107-07,d1a8d88d-1708-4959-9695-6f2e67853bd5,TCGA-05-4249-01A-01D-1877-01,TCGA-05-4249-01A-21-2190-20
TCGA-05-4250-01,,,,,,,,,,,...,bba9333a-09f7-4585-b22e-e4ae4049f7da,TCGA-05-4250-01A-01D-1104-05,TCGA-05-4250-01A-01D-1105-08,bba9333a-09f7-4585-b22e-e4ae4049f7da,,TCGA-05-4250-01A-01D-1877-01,TCGA-05-4250-01A-01R-1107-07,bba9333a-09f7-4585-b22e-e4ae4049f7da,TCGA-05-4250-01A-01D-1877-01,TCGA-05-4250-01A-21-2190-20
TCGA-05-4382-01,,,none,,p.L613F,none,none,N,N,"p.R222L, p.E545Q",...,e4177b01-6898-4bb7-b38d-0c09f85c5668,TCGA-05-4382-01A-01D-1205-05,TCGA-05-4382-01A-01D-1265-08,e4177b01-6898-4bb7-b38d-0c09f85c5668,,TCGA-05-4382-01A-01D-1204-01,,e4177b01-6898-4bb7-b38d-0c09f85c5668,TCGA-05-4382-01A-01D-1204-01,
TCGA-05-4384-01,2.04,0.48,none,,none,none,p.F777S,N,N,none,...,7d6cf896-b04a-431a-a192-aaf540eeaf77,,TCGA-05-4384-01A-01D-1753-08,7d6cf896-b04a-431a-a192-aaf540eeaf77,TCGA-05-4384-01A-01D-1756-05,TCGA-05-4384-01A-01D-1752-01,,7d6cf896-b04a-431a-a192-aaf540eeaf77,TCGA-05-4384-01A-01D-1752-01,TCGA-05-4384-01A-21-2190-20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-O2-A52W-01,,,,,,,,,,,...,e8c57cc1-0293-433a-8076-13cfa9935ce7,,,e8c57cc1-0293-433a-8076-13cfa9935ce7,TCGA-O2-A52W-01A-11D-A26N-05,TCGA-O2-A52W-01A-11D-A26L-01,,e8c57cc1-0293-433a-8076-13cfa9935ce7,TCGA-O2-A52W-01A-11D-A26L-01,
TCGA-O2-A5IB-01,,,,,,,,,,,...,5c99d6ce-742c-427c-a252-e2ce6b7f919c,,,5c99d6ce-742c-427c-a252-e2ce6b7f919c,TCGA-O2-A5IB-01A-11D-A27L-05,TCGA-O2-A5IB-01A-11D-A27J-01,,5c99d6ce-742c-427c-a252-e2ce6b7f919c,TCGA-O2-A5IB-01A-11D-A27J-01,
TCGA-O2-A5IC-01,,,,,,,,,,,...,,,,,TCGA-O2-A5IC-01A-11D-A27L-05,,,,,
TCGA-S2-AA1A-01,,,,,,,,,,,...,b129c3f7-a2ff-40ba-91b0-a9c96f1e2b00,,,b129c3f7-a2ff-40ba-91b0-a9c96f1e2b00,TCGA-S2-AA1A-01A-12D-A398-05,TCGA-S2-AA1A-01A-12D-A396-01,,b129c3f7-a2ff-40ba-91b0-a9c96f1e2b00,TCGA-S2-AA1A-01A-12D-A396-01,


In [2]:
# filter to only cols of interest
cols_of_interest = ['_primary_disease', 'age_at_initial_pathologic_diagnosis', 'gender', 'tobacco_smoking_history']
lung_clinical = lung_clinical[cols_of_interest]

print("filtered samples, features:", lung_clinical.shape)

lung_clinical

filtered samples, features: (1299, 4)


Unnamed: 0_level_0,_primary_disease,age_at_initial_pathologic_diagnosis,gender,tobacco_smoking_history
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-05-4244-01,lung adenocarcinoma,70.0,MALE,4
TCGA-05-4249-01,lung adenocarcinoma,67.0,MALE,3
TCGA-05-4250-01,lung adenocarcinoma,79.0,FEMALE,4
TCGA-05-4382-01,lung adenocarcinoma,68.0,MALE,4
TCGA-05-4384-01,lung adenocarcinoma,66.0,MALE,3
...,...,...,...,...
TCGA-O2-A52W-01,lung squamous cell carcinoma,63.0,MALE,4
TCGA-O2-A5IB-01,lung squamous cell carcinoma,71.0,FEMALE,4
TCGA-O2-A5IC-01,lung squamous cell carcinoma,,,
TCGA-S2-AA1A-01,lung adenocarcinoma,68.0,FEMALE,3


Need to map values to text for tobacco smoking history. Took legend from [here](https://15601124181662949551.googlegroups.com/attach/7d73980e3debb/Screen%20Shot%202016-09-09%20at%205.11.12%20PM.png?part=0.1&view=1&vt=ANaJVrFYa59tuAxJiurwC9ze6rVkcW7FoPxyfsJ0Tauf9sSKj1frDAT3NnWtidD4c06PDC7EFzNl_TRcTv5vJQabV8pUXPp_0IhLmtNFb_8D0jbb1rddgDI).

In [3]:
import pandas as pd

data = {
    "PV": [1, 2, 3, 4, 5, 6, 7],
    "PV Meaning": [
        "Lifelong Non-Smoker",
        "Current Smoker",
        "Current Reformed Smoker for > 15 yrs",
        "Current Reformed Smoker for < or = 15 yrs",
        "Current Reformed Smoker, Duration Not Specified",
        "Smoker at Diagnosis",
        "Smoking history not documented"
    ],
    "PV Meaning Concept Codes": [
        "C65108",
        "C67147",
        "C67148;C61584;C113429;C29848(Primary)",
        "C67148;C61586;C113429;C29848(Primary)",
        "C67148;C25330;C19594(Primary)",
        "C15220;C68751(Primary)",
        "C25594;C25356;C56425;C68751(Primary)"
    ],
    "PV Meaning Description": [
        "A person who was not smoking at the time of the interview and has smoked less than 100 cigarettes in their life.",
        "Includes daily smokers and non-daily smokers (also known as occasional smokers).",
        "A person who was not smoking at the time of the interview but has smoked at least 100 cigarettes in their life.",
        "A person who was not smoking at the time of the interview but has smoked at least 100 cigarettes in their life.",
        "A person who was not smoking at the time of the interview but has smoked at least 100 cigarettes in their life.",
        "The investigation, analysis and recognition of the presence and nature of disease, condition, or injury from expressed signs and symptoms...",
        "An operation in which a term denies or inverts the meaning of another term or construction..."
    ],
    "PV Begin Date": [
        "2004-03-08",
        "2004-03-08",
        "2004-03-08",
        "2004-03-08",
        "2012-03-06",
        "2016-01-13",
        "2016-01-13"
    ],
    "PV Public ID": [
        2568602,
        2568603,
        2568604,
        2568605,
        3404794,
        5102272,
        5102271
    ],
    "VM Version": [1.0]*7
}

df = pd.DataFrame(data)
df

Unnamed: 0,PV,PV Meaning,PV Meaning Concept Codes,PV Meaning Description,PV Begin Date,PV Public ID,VM Version
0,1,Lifelong Non-Smoker,C65108,A person who was not smoking at the time of th...,2004-03-08,2568602,1.0
1,2,Current Smoker,C67147,Includes daily smokers and non-daily smokers (...,2004-03-08,2568603,1.0
2,3,Current Reformed Smoker for > 15 yrs,C67148;C61584;C113429;C29848(Primary),A person who was not smoking at the time of th...,2004-03-08,2568604,1.0
3,4,Current Reformed Smoker for < or = 15 yrs,C67148;C61586;C113429;C29848(Primary),A person who was not smoking at the time of th...,2004-03-08,2568605,1.0
4,5,"Current Reformed Smoker, Duration Not Specified",C67148;C25330;C19594(Primary),A person who was not smoking at the time of th...,2012-03-06,3404794,1.0
5,6,Smoker at Diagnosis,C15220;C68751(Primary),"The investigation, analysis and recognition of...",2016-01-13,5102272,1.0
6,7,Smoking history not documented,C25594;C25356;C56425;C68751(Primary),An operation in which a term denies or inverts...,2016-01-13,5102271,1.0


In [4]:
# turn into dict
pv_to_meaning = {
    1: "Lifelong Non-Smoker",
    2: "Current Smoker",
    3: "Current Reformed Smoker for > 15 yrs",
    4: "Current Reformed Smoker for < or = 15 yrs",
    5: "Current Reformed Smoker, Duration Not Specified",
    6: "Smoker at Diagnosis",
    7: "Smoking history not documented"
}

In [5]:
# inspect cols
print(lung_clinical['_primary_disease'].value_counts())
print()
print(lung_clinical['tobacco_smoking_history'].value_counts())
print()
print(lung_clinical['gender'].value_counts())

# inspect missingness
print("\nmissingness:")
print(lung_clinical.isna().sum())

_primary_disease
lung adenocarcinoma             680
lung squamous cell carcinoma    619
Name: count, dtype: int64

tobacco_smoking_history
4                523
2                291
3                270
1                107
5                  9
[Discrepancy]      2
Name: count, dtype: int64

gender
MALE      734
FEMALE    498
Name: count, dtype: int64

missingness:
_primary_disease                        0
age_at_initial_pathologic_diagnosis    95
gender                                 67
tobacco_smoking_history                97
dtype: int64


In [6]:
import numpy as np

# create deep copy to map smoking status and make full captions
lung_clinical_captions = lung_clinical.copy()

# replace [Discrepancy] with na
lung_clinical_captions['tobacco_smoking_history'] = lung_clinical_captions['tobacco_smoking_history'].replace('[Discrepancy]', np.nan)

# make tobacco_smoking_history col numeric
lung_clinical_captions['tobacco_smoking_history'] = pd.to_numeric(lung_clinical_captions['tobacco_smoking_history'], errors='coerce')

# add smoking_status_label col for text version
lung_clinical_captions['tobacco_smoking_history_label'] = lung_clinical_captions['tobacco_smoking_history'].map(pv_to_meaning)

# remove numeric tobacco_smoking_history col
lung_clinical_captions.drop(columns='tobacco_smoking_history', inplace=True)

lung_clinical_captions

Unnamed: 0_level_0,_primary_disease,age_at_initial_pathologic_diagnosis,gender,tobacco_smoking_history_label
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-05-4244-01,lung adenocarcinoma,70.0,MALE,Current Reformed Smoker for < or = 15 yrs
TCGA-05-4249-01,lung adenocarcinoma,67.0,MALE,Current Reformed Smoker for > 15 yrs
TCGA-05-4250-01,lung adenocarcinoma,79.0,FEMALE,Current Reformed Smoker for < or = 15 yrs
TCGA-05-4382-01,lung adenocarcinoma,68.0,MALE,Current Reformed Smoker for < or = 15 yrs
TCGA-05-4384-01,lung adenocarcinoma,66.0,MALE,Current Reformed Smoker for > 15 yrs
...,...,...,...,...
TCGA-O2-A52W-01,lung squamous cell carcinoma,63.0,MALE,Current Reformed Smoker for < or = 15 yrs
TCGA-O2-A5IB-01,lung squamous cell carcinoma,71.0,FEMALE,Current Reformed Smoker for < or = 15 yrs
TCGA-O2-A5IC-01,lung squamous cell carcinoma,,,
TCGA-S2-AA1A-01,lung adenocarcinoma,68.0,FEMALE,Current Reformed Smoker for > 15 yrs


In [7]:
# check missingness again
lung_clinical_captions.isna().sum()

_primary_disease                        0
age_at_initial_pathologic_diagnosis    95
gender                                 67
tobacco_smoking_history_label          99
dtype: int64

In [8]:
# generate text captions from all cols
import random

def create_caption(row):
    age = row['age_at_initial_pathologic_diagnosis']
    gender = row['gender']
    smoking = row['tobacco_smoking_history_label']
    
    # standardize fields
    gender_text = gender.lower() if pd.notna(gender) else "unknown gender"
    age_text = f"{int(age)}-year-old" if pd.notna(age) else "unknown age"
    smoking_text = smoking if pd.notna(smoking) else "unknown smoking status"

    # caption templates
    templates = [
        f"{age_text} {gender_text}, {smoking_text}",
        f"{gender_text}, aged {age_text}, smoker: {smoking_text}",
        f"{smoking_text} — {age_text} {gender_text}",
        f"{gender_text} patient, {age_text}, {smoking_text}",
        f"{age_text}, {gender_text}, history: {smoking_text}",
        f"A case involving a {age_text} {gender_text} with smoking history: {smoking_text}",
        f"{gender_text.capitalize()}, {age_text}; smoking status: {smoking_text}",
        f"Individual: {gender_text}, age {age_text}, smoking: {smoking_text}",
        f"{age_text} individual of {gender_text} gender, reported as {smoking_text}",
        f"{gender_text} case, age {age_text}, smoking history: {smoking_text}"
    ]

    return random.choice(templates)

lung_clinical_captions['caption'] = lung_clinical_captions.apply(create_caption, axis=1)

lung_captions = lung_clinical_captions[['caption']] 

# Display the result
lung_captions

Unnamed: 0_level_0,caption
sampleID,Unnamed: 1_level_1
TCGA-05-4244-01,"Male, 70-year-old; smoking status: Current Ref..."
TCGA-05-4249-01,"67-year-old, male, history: Current Reformed S..."
TCGA-05-4250-01,"79-year-old, female, history: Current Reformed..."
TCGA-05-4382-01,"Male, 68-year-old; smoking status: Current Ref..."
TCGA-05-4384-01,"Male, 66-year-old; smoking status: Current Ref..."
...,...
TCGA-O2-A52W-01,Current Reformed Smoker for < or = 15 yrs — 63...
TCGA-O2-A5IB-01,Current Reformed Smoker for < or = 15 yrs — 71...
TCGA-O2-A5IC-01,"unknown gender, aged unknown age, smoker: unkn..."
TCGA-S2-AA1A-01,"Individual: female, age 68-year-old, smoking: ..."


In [9]:
lung_clinical_captions.to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_clinical_captions.csv", index=True)
lung_captions.to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_captions.csv", index=True)

# Relating tiles to captions

In [10]:
import pandas as pd 

lung_clinical_captions = pd.read_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_clinical_captions.csv", index_col=0, header=0)
lung_clinical_captions

Unnamed: 0_level_0,_primary_disease,age_at_initial_pathologic_diagnosis,gender,tobacco_smoking_history_label,caption
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-05-4244-01,lung adenocarcinoma,70.0,MALE,Current Reformed Smoker for < or = 15 yrs,"Male, 70-year-old; smoking status: Current Ref..."
TCGA-05-4249-01,lung adenocarcinoma,67.0,MALE,Current Reformed Smoker for > 15 yrs,"67-year-old, male, history: Current Reformed S..."
TCGA-05-4250-01,lung adenocarcinoma,79.0,FEMALE,Current Reformed Smoker for < or = 15 yrs,"79-year-old, female, history: Current Reformed..."
TCGA-05-4382-01,lung adenocarcinoma,68.0,MALE,Current Reformed Smoker for < or = 15 yrs,"Male, 68-year-old; smoking status: Current Ref..."
TCGA-05-4384-01,lung adenocarcinoma,66.0,MALE,Current Reformed Smoker for > 15 yrs,"Male, 66-year-old; smoking status: Current Ref..."
...,...,...,...,...,...
TCGA-O2-A52W-01,lung squamous cell carcinoma,63.0,MALE,Current Reformed Smoker for < or = 15 yrs,Current Reformed Smoker for < or = 15 yrs — 63...
TCGA-O2-A5IB-01,lung squamous cell carcinoma,71.0,FEMALE,Current Reformed Smoker for < or = 15 yrs,Current Reformed Smoker for < or = 15 yrs — 71...
TCGA-O2-A5IC-01,lung squamous cell carcinoma,,,,"unknown gender, aged unknown age, smoker: unkn..."
TCGA-S2-AA1A-01,lung adenocarcinoma,68.0,FEMALE,Current Reformed Smoker for > 15 yrs,"Individual: female, age 68-year-old, smoking: ..."


## 250k train subset

In [11]:
import h5py
import pandas as pd

file_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_250K_he_train.h5"

with h5py.File(file_path, 'r') as f:
    tiles = [t.decode('utf-8') for t in f['tiles'][:]]
    slides = [s.decode('utf-8') for s in f['slides'][:]]
    
    labels = f['labels'][:]
    patterns = f['patterns'][:]
    
    lung_h5 = pd.DataFrame({
        'slides': slides,
        'tiles': tiles,
        'labels': labels,
        'patterns': patterns
    })

    lung_h5['sampleID'] = lung_h5['slides'].str[:15]

    lung_h5.set_index('sampleID', inplace=True)

lung_h5


Unnamed: 0_level_0,slides,tiles,labels,patterns
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-49-6745-01,TCGA-49-6745-01Z-00-DX7,46_14.jpeg,3.0,b'TCGA-LUAD_stage_iii'
TCGA-68-A59J-01,TCGA-68-A59J-01Z-00-DX1,23_9.jpeg,6.0,b'TCGA-LUSC_stage_i'
TCGA-95-A4VN-01,TCGA-95-A4VN-01Z-00-DX1,41_5.jpeg,2.0,b'TCGA-LUAD_stage_ii'
TCGA-49-4490-01,TCGA-49-4490-01Z-00-DX6,83_25.jpeg,3.0,b'TCGA-LUAD_stage_iii'
TCGA-43-6647-01,TCGA-43-6647-01Z-00-DX1,36_20.jpeg,7.0,b'TCGA-LUSC_stage_ii'
...,...,...,...,...
TCGA-NC-A5HT-01,TCGA-NC-A5HT-01Z-00-DX1,18_22.jpeg,8.0,b'TCGA-LUSC_stage_iii'
TCGA-43-6647-01,TCGA-43-6647-01Z-00-DX1,34_20.jpeg,7.0,b'TCGA-LUSC_stage_ii'
TCGA-63-5131-01,TCGA-63-5131-01Z-00-DX1,14_6.jpeg,7.0,b'TCGA-LUSC_stage_ii'
TCGA-49-AARE-01,TCGA-49-AARE-01Z-00-DX1,23_11.jpeg,1.0,b'TCGA-LUAD_stage_i'


In [12]:
# join dfs
lung_images_clinical_captions = lung_h5.merge(lung_clinical_captions, left_index=True, right_index=True, how='left')
lung_images_clinical_captions

Unnamed: 0_level_0,slides,tiles,labels,patterns,_primary_disease,age_at_initial_pathologic_diagnosis,gender,tobacco_smoking_history_label,caption
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-49-6745-01,TCGA-49-6745-01Z-00-DX7,46_14.jpeg,3.0,b'TCGA-LUAD_stage_iii',lung adenocarcinoma,82.0,MALE,Current Reformed Smoker for < or = 15 yrs,"82-year-old male, Current Reformed Smoker for ..."
TCGA-68-A59J-01,TCGA-68-A59J-01Z-00-DX1,23_9.jpeg,6.0,b'TCGA-LUSC_stage_i',lung squamous cell carcinoma,74.0,FEMALE,Current Reformed Smoker for < or = 15 yrs,"74-year-old female, Current Reformed Smoker fo..."
TCGA-95-A4VN-01,TCGA-95-A4VN-01Z-00-DX1,41_5.jpeg,2.0,b'TCGA-LUAD_stage_ii',lung adenocarcinoma,62.0,FEMALE,Current Reformed Smoker for < or = 15 yrs,"Female, 62-year-old; smoking status: Current R..."
TCGA-49-4490-01,TCGA-49-4490-01Z-00-DX6,83_25.jpeg,3.0,b'TCGA-LUAD_stage_iii',lung adenocarcinoma,45.0,FEMALE,Current Reformed Smoker for > 15 yrs,A case involving a 45-year-old female with smo...
TCGA-43-6647-01,TCGA-43-6647-01Z-00-DX1,36_20.jpeg,7.0,b'TCGA-LUSC_stage_ii',lung squamous cell carcinoma,69.0,FEMALE,Current Reformed Smoker for < or = 15 yrs,"female patient, 69-year-old, Current Reformed ..."
...,...,...,...,...,...,...,...,...,...
TCGA-NC-A5HT-01,TCGA-NC-A5HT-01Z-00-DX1,18_22.jpeg,8.0,b'TCGA-LUSC_stage_iii',lung squamous cell carcinoma,69.0,MALE,Current Reformed Smoker for < or = 15 yrs,"69-year-old, male, history: Current Reformed S..."
TCGA-43-6647-01,TCGA-43-6647-01Z-00-DX1,34_20.jpeg,7.0,b'TCGA-LUSC_stage_ii',lung squamous cell carcinoma,69.0,FEMALE,Current Reformed Smoker for < or = 15 yrs,"female patient, 69-year-old, Current Reformed ..."
TCGA-63-5131-01,TCGA-63-5131-01Z-00-DX1,14_6.jpeg,7.0,b'TCGA-LUSC_stage_ii',lung squamous cell carcinoma,,MALE,Current Reformed Smoker for < or = 15 yrs,"male patient, unknown age, Current Reformed Sm..."
TCGA-49-AARE-01,TCGA-49-AARE-01Z-00-DX1,23_11.jpeg,1.0,b'TCGA-LUAD_stage_i',lung adenocarcinoma,51.0,FEMALE,Current Reformed Smoker for < or = 15 yrs,Current Reformed Smoker for < or = 15 yrs — 51...


In [13]:
# luad/lusc distribution
lung_images_clinical_captions['_primary_disease'].value_counts()

_primary_disease
lung squamous cell carcinoma    129654
lung adenocarcinoma             120309
Name: count, dtype: int64

In [14]:
# missingness
lung_images_clinical_captions.isna().sum()

slides                                    0
tiles                                     0
labels                                    0
patterns                                  0
_primary_disease                         37
age_at_initial_pathologic_diagnosis    2823
gender                                   37
tobacco_smoking_history_label          5800
caption                                  37
dtype: int64

In [15]:
# find rows without captions
na_rows = lung_images_clinical_captions[lung_images_clinical_captions['caption'].isna()]
unknown_ids = na_rows.index.unique().tolist()
print(unknown_ids)
na_rows.head()

['TCGA-05-4245-01']


Unnamed: 0_level_0,slides,tiles,labels,patterns,_primary_disease,age_at_initial_pathologic_diagnosis,gender,tobacco_smoking_history_label,caption
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-05-4245-01,TCGA-05-4245-01Z-00-DX1,9_33.jpeg,3.0,b'TCGA-LUAD_stage_iii',,,,,
TCGA-05-4245-01,TCGA-05-4245-01Z-00-DX1,8_32.jpeg,3.0,b'TCGA-LUAD_stage_iii',,,,,
TCGA-05-4245-01,TCGA-05-4245-01Z-00-DX1,12_32.jpeg,3.0,b'TCGA-LUAD_stage_iii',,,,,
TCGA-05-4245-01,TCGA-05-4245-01Z-00-DX1,10_35.jpeg,3.0,b'TCGA-LUAD_stage_iii',,,,,
TCGA-05-4245-01,TCGA-05-4245-01Z-00-DX1,7_34.jpeg,3.0,b'TCGA-LUAD_stage_iii',,,,,


In [16]:
# fill with luad
lung_images_clinical_captions['_primary_disease'] = lung_images_clinical_captions['_primary_disease'].fillna('lung adenocarcinoma')
lung_images_clinical_captions.isna().sum()

slides                                    0
tiles                                     0
labels                                    0
patterns                                  0
_primary_disease                          0
age_at_initial_pathologic_diagnosis    2823
gender                                   37
tobacco_smoking_history_label          5800
caption                                  37
dtype: int64

In [17]:
lung_images_clinical_captions['caption'] = lung_images_clinical_captions['caption'].fillna("An H&E stained tissue from an individual with no age information with no gender information with no smoking history information.")
lung_images_clinical_captions.isna().sum()

slides                                    0
tiles                                     0
labels                                    0
patterns                                  0
_primary_disease                          0
age_at_initial_pathologic_diagnosis    2823
gender                                   37
tobacco_smoking_history_label          5800
caption                                   0
dtype: int64

In [18]:
lung_images_clinical_captions.to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_250k_images_clinical_captions.csv", index=True)

ERROR! Session/line number was not unique in database. History logging moved to new session 20


In [19]:
lung_images_captions = lung_images_clinical_captions[['slides', 'tiles', 'caption']].copy()

lung_images_captions['filepath'] = lung_images_captions.apply(
    lambda row: f"/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/train_250k/{row['slides']}/{row['tiles']}",
    axis=1
)

lung_images_captions

Unnamed: 0_level_0,slides,tiles,caption,filepath
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-49-6745-01,TCGA-49-6745-01Z-00-DX7,46_14.jpeg,"82-year-old male, Current Reformed Smoker for ...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-68-A59J-01,TCGA-68-A59J-01Z-00-DX1,23_9.jpeg,"74-year-old female, Current Reformed Smoker fo...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-95-A4VN-01,TCGA-95-A4VN-01Z-00-DX1,41_5.jpeg,"Female, 62-year-old; smoking status: Current R...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-49-4490-01,TCGA-49-4490-01Z-00-DX6,83_25.jpeg,A case involving a 45-year-old female with smo...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-43-6647-01,TCGA-43-6647-01Z-00-DX1,36_20.jpeg,"female patient, 69-year-old, Current Reformed ...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
...,...,...,...,...
TCGA-NC-A5HT-01,TCGA-NC-A5HT-01Z-00-DX1,18_22.jpeg,"69-year-old, male, history: Current Reformed S...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-43-6647-01,TCGA-43-6647-01Z-00-DX1,34_20.jpeg,"female patient, 69-year-old, Current Reformed ...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-63-5131-01,TCGA-63-5131-01Z-00-DX1,14_6.jpeg,"male patient, unknown age, Current Reformed Sm...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-49-AARE-01,TCGA-49-AARE-01Z-00-DX1,23_11.jpeg,Current Reformed Smoker for < or = 15 yrs — 51...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...


In [20]:
lung_images_captions[['filepath', 'caption']].to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_250k_filepath_caption.csv", index=False)

## Full train set

In [21]:
import h5py
import pandas as pd

file_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_train-002.h5"

with h5py.File(file_path, 'r') as f:
    tiles = [t.decode('utf-8') for t in f['train_tiles'][:]]
    slides = [s.decode('utf-8') for s in f['train_slides'][:]]
    
    labels = f['train_labels'][:]
    patterns = f['train_patterns'][:]
    
    lung_h5 = pd.DataFrame({
        'slides': slides,
        'tiles': tiles,
        'labels': labels,
        'patterns': patterns
    })

    lung_h5['sampleID'] = lung_h5['slides'].str[:15]

    lung_h5.set_index('sampleID', inplace=True)

lung_h5

Unnamed: 0_level_0,slides,tiles,labels,patterns
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-73-4677-01,TCGA-73-4677-01Z-00-DX1,41_27.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-50-5045-01,TCGA-50-5045-01Z-00-DX1,18_22.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-69-7765-01,TCGA-69-7765-01Z-00-DX1,33_15.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-69-7765-01,TCGA-69-7765-01Z-00-DX1,34_13.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-73-4677-01,TCGA-73-4677-01Z-00-DX1,23_14.jpeg,0.0,b'TCGA-LUAD_not_reported'
...,...,...,...,...
TCGA-18-3414-01,TCGA-18-3414-01Z-00-DX1,19_27.jpeg,9.0,b'TCGA-LUSC_stage_iv'
TCGA-18-3414-01,TCGA-18-3414-01Z-00-DX1,44_32.jpeg,9.0,b'TCGA-LUSC_stage_iv'
TCGA-18-3414-01,TCGA-18-3414-01Z-00-DX1,45_5.jpeg,9.0,b'TCGA-LUSC_stage_iv'
TCGA-NC-A5HP-01,TCGA-NC-A5HP-01Z-00-DX1,38_22.jpeg,9.0,b'TCGA-LUSC_stage_iv'


In [22]:
# join dfs
lung_images_clinical_captions = lung_h5.merge(lung_clinical_captions, left_index=True, right_index=True, how='left')
lung_images_clinical_captions

Unnamed: 0_level_0,slides,tiles,labels,patterns,_primary_disease,age_at_initial_pathologic_diagnosis,gender,tobacco_smoking_history_label,caption
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-73-4677-01,TCGA-73-4677-01Z-00-DX1,41_27.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,74.0,MALE,Current Reformed Smoker for > 15 yrs,"74-year-old individual of male gender, reporte..."
TCGA-50-5045-01,TCGA-50-5045-01Z-00-DX1,18_22.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,57.0,FEMALE,,"57-year-old, female, history: unknown smoking ..."
TCGA-69-7765-01,TCGA-69-7765-01Z-00-DX1,33_15.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,56.0,MALE,Current Reformed Smoker for < or = 15 yrs,"56-year-old male, Current Reformed Smoker for ..."
TCGA-69-7765-01,TCGA-69-7765-01Z-00-DX1,34_13.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,56.0,MALE,Current Reformed Smoker for < or = 15 yrs,"56-year-old male, Current Reformed Smoker for ..."
TCGA-73-4677-01,TCGA-73-4677-01Z-00-DX1,23_14.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,74.0,MALE,Current Reformed Smoker for > 15 yrs,"74-year-old individual of male gender, reporte..."
...,...,...,...,...,...,...,...,...,...
TCGA-18-3414-01,TCGA-18-3414-01Z-00-DX1,19_27.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,73.0,MALE,Current Reformed Smoker for < or = 15 yrs,"male case, age 73-year-old, smoking history: C..."
TCGA-18-3414-01,TCGA-18-3414-01Z-00-DX1,44_32.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,73.0,MALE,Current Reformed Smoker for < or = 15 yrs,"male case, age 73-year-old, smoking history: C..."
TCGA-18-3414-01,TCGA-18-3414-01Z-00-DX1,45_5.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,73.0,MALE,Current Reformed Smoker for < or = 15 yrs,"male case, age 73-year-old, smoking history: C..."
TCGA-NC-A5HP-01,TCGA-NC-A5HP-01Z-00-DX1,38_22.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,69.0,MALE,Current Smoker,"male patient, 69-year-old, Current Smoker"


In [23]:
# luad/lusc distribution
lung_images_clinical_captions['_primary_disease'].value_counts()

_primary_disease
lung squamous cell carcinoma    301406
lung adenocarcinoma             281132
Name: count, dtype: int64

In [24]:
# missingness
lung_images_clinical_captions.isna().sum()

slides                                     0
tiles                                      0
labels                                     0
patterns                                   0
_primary_disease                          78
age_at_initial_pathologic_diagnosis     6496
gender                                    78
tobacco_smoking_history_label          13646
caption                                   78
dtype: int64

In [25]:
# find rows without captions
na_rows = lung_images_clinical_captions[lung_images_clinical_captions['caption'].isna()]
unknown_ids = na_rows.index.unique().tolist()
print(unknown_ids)
na_rows.head()

['TCGA-05-4245-01']


Unnamed: 0_level_0,slides,tiles,labels,patterns,_primary_disease,age_at_initial_pathologic_diagnosis,gender,tobacco_smoking_history_label,caption
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-05-4245-01,TCGA-05-4245-01Z-00-DX1,8_32.jpeg,3.0,b'TCGA-LUAD_stage_iii',,,,,
TCGA-05-4245-01,TCGA-05-4245-01Z-00-DX1,12_36.jpeg,3.0,b'TCGA-LUAD_stage_iii',,,,,
TCGA-05-4245-01,TCGA-05-4245-01Z-00-DX1,9_31.jpeg,3.0,b'TCGA-LUAD_stage_iii',,,,,
TCGA-05-4245-01,TCGA-05-4245-01Z-00-DX1,7_25.jpeg,3.0,b'TCGA-LUAD_stage_iii',,,,,
TCGA-05-4245-01,TCGA-05-4245-01Z-00-DX1,10_30.jpeg,3.0,b'TCGA-LUAD_stage_iii',,,,,


In [26]:
# fill with luad
lung_images_clinical_captions['_primary_disease'] = lung_images_clinical_captions['_primary_disease'].fillna('lung adenocarcinoma')
lung_images_clinical_captions.isna().sum()

slides                                     0
tiles                                      0
labels                                     0
patterns                                   0
_primary_disease                           0
age_at_initial_pathologic_diagnosis     6496
gender                                    78
tobacco_smoking_history_label          13646
caption                                   78
dtype: int64

In [27]:
lung_images_clinical_captions['caption'] = lung_images_clinical_captions['caption'].fillna("An H&E stained tissue from an individual with no age information with no gender information with no smoking history information.")
lung_images_clinical_captions.isna().sum()

slides                                     0
tiles                                      0
labels                                     0
patterns                                   0
_primary_disease                           0
age_at_initial_pathologic_diagnosis     6496
gender                                    78
tobacco_smoking_history_label          13646
caption                                    0
dtype: int64

In [28]:
lung_images_clinical_captions.to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_train_images_clinical_captions.csv", index=True)

In [29]:
lung_images_captions = lung_images_clinical_captions[['slides', 'tiles', 'caption']].copy()

lung_images_captions['filepath'] = lung_images_captions.apply(
    lambda row: f"/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/train/{row['slides']}/{row['tiles']}",
    axis=1
)

lung_images_captions

Unnamed: 0_level_0,slides,tiles,caption,filepath
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-73-4677-01,TCGA-73-4677-01Z-00-DX1,41_27.jpeg,"74-year-old individual of male gender, reporte...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-50-5045-01,TCGA-50-5045-01Z-00-DX1,18_22.jpeg,"57-year-old, female, history: unknown smoking ...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-69-7765-01,TCGA-69-7765-01Z-00-DX1,33_15.jpeg,"56-year-old male, Current Reformed Smoker for ...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-69-7765-01,TCGA-69-7765-01Z-00-DX1,34_13.jpeg,"56-year-old male, Current Reformed Smoker for ...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-73-4677-01,TCGA-73-4677-01Z-00-DX1,23_14.jpeg,"74-year-old individual of male gender, reporte...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
...,...,...,...,...
TCGA-18-3414-01,TCGA-18-3414-01Z-00-DX1,19_27.jpeg,"male case, age 73-year-old, smoking history: C...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-18-3414-01,TCGA-18-3414-01Z-00-DX1,44_32.jpeg,"male case, age 73-year-old, smoking history: C...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-18-3414-01,TCGA-18-3414-01Z-00-DX1,45_5.jpeg,"male case, age 73-year-old, smoking history: C...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-NC-A5HP-01,TCGA-NC-A5HP-01Z-00-DX1,38_22.jpeg,"male patient, 69-year-old, Current Smoker",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...


In [30]:
lung_images_captions[['filepath', 'caption']].to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_train_filepath_caption.csv", index=False)

## Full validation set

In [31]:
import h5py
import pandas as pd

file_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_validation-003.h5"

with h5py.File(file_path, 'r') as f:
    tiles = [t.decode('utf-8') for t in f['valid_tiles'][:]]
    slides = [s.decode('utf-8') for s in f['valid_slides'][:]]
    
    labels = f['valid_labels'][:]
    patterns = f['valid_patterns'][:]
    
    lung_h5 = pd.DataFrame({
        'slides': slides,
        'tiles': tiles,
        'labels': labels,
        'patterns': patterns
    })

    lung_h5['sampleID'] = lung_h5['slides'].str[:15]

    lung_h5.set_index('sampleID', inplace=True)

lung_h5

Unnamed: 0_level_0,slides,tiles,labels,patterns
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,57_25.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,42_18.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,20_20.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,8_16.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,72_6.jpeg,0.0,b'TCGA-LUAD_not_reported'
...,...,...,...,...
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,25_23.jpeg,9.0,b'TCGA-LUSC_stage_iv'
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,12_33.jpeg,9.0,b'TCGA-LUSC_stage_iv'
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,44_23.jpeg,9.0,b'TCGA-LUSC_stage_iv'
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,22_20.jpeg,9.0,b'TCGA-LUSC_stage_iv'


In [32]:
# join dfs
lung_images_clinical_captions = lung_h5.merge(lung_clinical_captions, left_index=True, right_index=True, how='left')
lung_images_clinical_captions

Unnamed: 0_level_0,slides,tiles,labels,patterns,_primary_disease,age_at_initial_pathologic_diagnosis,gender,tobacco_smoking_history_label,caption
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,57_25.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,57.0,FEMALE,Current Smoker,"female, aged 57-year-old, smoker: Current Smoker"
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,42_18.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,57.0,FEMALE,Current Smoker,"female, aged 57-year-old, smoker: Current Smoker"
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,20_20.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,57.0,FEMALE,Current Smoker,"female, aged 57-year-old, smoker: Current Smoker"
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,8_16.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,57.0,FEMALE,Current Smoker,"female, aged 57-year-old, smoker: Current Smoker"
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,72_6.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,57.0,FEMALE,Current Smoker,"female, aged 57-year-old, smoker: Current Smoker"
...,...,...,...,...,...,...,...,...,...
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,25_23.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,67.0,MALE,Current Reformed Smoker for < or = 15 yrs,"male patient, 67-year-old, Current Reformed Sm..."
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,12_33.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,67.0,MALE,Current Reformed Smoker for < or = 15 yrs,"male patient, 67-year-old, Current Reformed Sm..."
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,44_23.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,67.0,MALE,Current Reformed Smoker for < or = 15 yrs,"male patient, 67-year-old, Current Reformed Sm..."
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,22_20.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,67.0,MALE,Current Reformed Smoker for < or = 15 yrs,"male patient, 67-year-old, Current Reformed Sm..."


In [33]:
# luad/lusc distribution
lung_images_clinical_captions['_primary_disease'].value_counts()

_primary_disease
lung squamous cell carcinoma    79060
lung adenocarcinoma             77269
Name: count, dtype: int64

In [34]:
# missingness
lung_images_clinical_captions.isna().sum()

slides                                    0
tiles                                     0
labels                                    0
patterns                                  0
_primary_disease                          0
age_at_initial_pathologic_diagnosis    2871
gender                                    0
tobacco_smoking_history_label          2630
caption                                   0
dtype: int64

In [35]:
lung_images_clinical_captions.to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_val_images_clinical_captions.csv", index=True)

In [36]:
lung_images_captions = lung_images_clinical_captions[['slides', 'tiles', 'caption']].copy()

lung_images_captions['filepath'] = lung_images_captions.apply(
    lambda row: f"/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/val/{row['slides']}/{row['tiles']}",
    axis=1
)

lung_images_captions

Unnamed: 0_level_0,slides,tiles,caption,filepath
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,57_25.jpeg,"female, aged 57-year-old, smoker: Current Smoker",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,42_18.jpeg,"female, aged 57-year-old, smoker: Current Smoker",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,20_20.jpeg,"female, aged 57-year-old, smoker: Current Smoker",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,8_16.jpeg,"female, aged 57-year-old, smoker: Current Smoker",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-38-4626-01,TCGA-38-4626-01Z-00-DX1,72_6.jpeg,"female, aged 57-year-old, smoker: Current Smoker",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
...,...,...,...,...
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,25_23.jpeg,"male patient, 67-year-old, Current Reformed Sm...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,12_33.jpeg,"male patient, 67-year-old, Current Reformed Sm...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,44_23.jpeg,"male patient, 67-year-old, Current Reformed Sm...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-34-8455-01,TCGA-34-8455-01Z-00-DX1,22_20.jpeg,"male patient, 67-year-old, Current Reformed Sm...",/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...


In [37]:
lung_images_captions[['filepath', 'caption']].to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_val_filepath_caption.csv", index=False)

In [46]:
# reducing val set
import pandas as pd

val_complete = pd.read_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_val_filepath_caption.csv")
val_subset = val_complete.sample(n=10000, random_state=9)
val_subset.to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_val_10k_filepath_caption.csv", index=False)

val_subset["caption"].value_counts()

caption
Individual: male, age 66-year-old, smoking: Current Smoker                                             1060
67-year-old, female, history: Lifelong Non-Smoker                                                       342
female patient, 74-year-old, Current Reformed Smoker for > 15 yrs                                       182
51-year-old, female, history: Current Smoker                                                            171
male, aged 81-year-old, smoker: Current Reformed Smoker for > 15 yrs                                    134
                                                                                                       ... 
Individual: male, age 64-year-old, smoking: Current Smoker                                                4
A case involving a 56-year-old male with smoking history: Current Reformed Smoker for < or = 15 yrs       4
A case involving a 72-year-old male with smoking history: Current Reformed Smoker for < or = 15 yrs       3
Individual: female, 

## Full test set

In [39]:
import h5py
import pandas as pd

file_path = "/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/hdf5_TCGAFFPE_LUADLUSC_5x_60pc_he_test-001.h5"

with h5py.File(file_path, 'r') as f:
    tiles = [t.decode('utf-8') for t in f['test_tiles'][:]]
    slides = [s.decode('utf-8') for s in f['test_slides'][:]]
    
    labels = f['test_labels'][:]
    patterns = f['test_patterns'][:]
    
    lung_h5 = pd.DataFrame({
        'slides': slides,
        'tiles': tiles,
        'labels': labels,
        'patterns': patterns
    })

    lung_h5['sampleID'] = lung_h5['slides'].str[:15]

    lung_h5.set_index('sampleID', inplace=True)

lung_h5

Unnamed: 0_level_0,slides,tiles,labels,patterns
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,41_26.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,41_32.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,12_19.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,33_34.jpeg,0.0,b'TCGA-LUAD_not_reported'
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,10_20.jpeg,0.0,b'TCGA-LUAD_not_reported'
...,...,...,...,...
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,30_25.jpeg,9.0,b'TCGA-LUSC_stage_iv'
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,26_29.jpeg,9.0,b'TCGA-LUSC_stage_iv'
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,4_8.jpeg,9.0,b'TCGA-LUSC_stage_iv'
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,27_22.jpeg,9.0,b'TCGA-LUSC_stage_iv'


In [40]:
# join dfs
lung_images_clinical_captions = lung_h5.merge(lung_clinical_captions, left_index=True, right_index=True, how='left')
lung_images_clinical_captions

Unnamed: 0_level_0,slides,tiles,labels,patterns,_primary_disease,age_at_initial_pathologic_diagnosis,gender,tobacco_smoking_history_label,caption
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,41_26.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,85.0,MALE,Current Reformed Smoker for > 15 yrs,Current Reformed Smoker for > 15 yrs — 85-year...
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,41_32.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,85.0,MALE,Current Reformed Smoker for > 15 yrs,Current Reformed Smoker for > 15 yrs — 85-year...
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,12_19.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,85.0,MALE,Current Reformed Smoker for > 15 yrs,Current Reformed Smoker for > 15 yrs — 85-year...
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,33_34.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,85.0,MALE,Current Reformed Smoker for > 15 yrs,Current Reformed Smoker for > 15 yrs — 85-year...
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,10_20.jpeg,0.0,b'TCGA-LUAD_not_reported',lung adenocarcinoma,85.0,MALE,Current Reformed Smoker for > 15 yrs,Current Reformed Smoker for > 15 yrs — 85-year...
...,...,...,...,...,...,...,...,...,...
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,30_25.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,65.0,MALE,Current Reformed Smoker for > 15 yrs,A case involving a 65-year-old male with smoki...
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,26_29.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,65.0,MALE,Current Reformed Smoker for > 15 yrs,A case involving a 65-year-old male with smoki...
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,4_8.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,65.0,MALE,Current Reformed Smoker for > 15 yrs,A case involving a 65-year-old male with smoki...
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,27_22.jpeg,9.0,b'TCGA-LUSC_stage_iv',lung squamous cell carcinoma,65.0,MALE,Current Reformed Smoker for > 15 yrs,A case involving a 65-year-old male with smoki...


In [41]:
# luad/lusc distribution
lung_images_clinical_captions['_primary_disease'].value_counts()

_primary_disease
lung adenocarcinoma             76918
lung squamous cell carcinoma    71819
Name: count, dtype: int64

In [42]:
# missingness
lung_images_clinical_captions.isna().sum()

slides                                    0
tiles                                     0
labels                                    0
patterns                                  0
_primary_disease                          0
age_at_initial_pathologic_diagnosis    1894
gender                                    0
tobacco_smoking_history_label           833
caption                                   0
dtype: int64

In [43]:
lung_images_clinical_captions.to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_test_images_clinical_captions.csv", index=True)

In [44]:
lung_images_captions = lung_images_clinical_captions[['slides', 'tiles', 'caption']].copy()

lung_images_captions['filepath'] = lung_images_captions.apply(
    lambda row: f"/gpfs/home/yb2612/dl4med_25/dl_project/data/scratch_data/test/{row['slides']}/{row['tiles']}",
    axis=1
)

lung_images_captions

Unnamed: 0_level_0,slides,tiles,caption,filepath
sampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,41_26.jpeg,Current Reformed Smoker for > 15 yrs — 85-year...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,41_32.jpeg,Current Reformed Smoker for > 15 yrs — 85-year...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,12_19.jpeg,Current Reformed Smoker for > 15 yrs — 85-year...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,33_34.jpeg,Current Reformed Smoker for > 15 yrs — 85-year...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-69-8254-01,TCGA-69-8254-01Z-00-DX1,10_20.jpeg,Current Reformed Smoker for > 15 yrs — 85-year...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
...,...,...,...,...
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,30_25.jpeg,A case involving a 65-year-old male with smoki...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,26_29.jpeg,A case involving a 65-year-old male with smoki...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,4_8.jpeg,A case involving a 65-year-old male with smoki...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...
TCGA-18-3417-01,TCGA-18-3417-01Z-00-DX1,27_22.jpeg,A case involving a 65-year-old male with smoki...,/gpfs/home/yb2612/dl4med_25/dl_project/data/sc...


In [45]:
lung_images_captions[['filepath', 'caption']].to_csv("/gpfs/home/yb2612/dl4med_25/dl_project/data/lung_test_filepath_caption.csv", index=False)