In [1]:
import pandas as pd
import numpy as np
import warnings

In [2]:
path = "/home/val/workspaces/histotab/data/raw/combined_study_clinical_data.tsv"
clinical_df = pd.read_csv(path, sep="\t", low_memory=False)

In [3]:
clinical_df["Study ID"].unique()  # Ensure 'Study ID' is a string type

array(['luad_cptac_gdc', 'luad_oncosg_2020', 'luad_mskcc_2015',
       'luad_cptac_2020', 'luad_broad', 'lung_nci_2022', 'lung_msk_2017',
       'luad_mskcc_2023_met_organotropism', 'luad_msk_npjpo_2021',
       'luad_mskcc_2020', 'luad_mskimpact_2021',
       'luad_tcga_pan_can_atlas_2018', 'luad_tcga', 'luad_tcga_gdc',
       'luad_tcga_pub'], dtype=object)

In [4]:
clinical_df_tcga = clinical_df[clinical_df["Study ID"].str.startswith("luad_tcga")]
clinical_df_tcga = clinical_df_tcga[clinical_df_tcga["Study ID"] != "luad_tcga_pub"]

In [5]:
clinical_df_tcga = clinical_df_tcga.dropna(axis=1, how="all", inplace=False)

In [6]:
clinical_df_tcga["Study ID"].unique()  # Ensure 'Study ID' is a string type

array(['luad_tcga_pan_can_atlas_2018', 'luad_tcga', 'luad_tcga_gdc'],
      dtype=object)

In [7]:
def deduplicate_clinical(df, study_priority):

    # Assign priority to each row
    df["priority"] = df["Study ID"].map(study_priority).fillna(999)

    # Sort by priority (lower is better), then keep the first for each Sample ID
    df_sorted = df.sort_values(by=["Sample ID", "priority"])
    deduped_df = df_sorted.drop_duplicates(subset="Sample ID", keep="first")

        # Group by Sample ID and reduce
    def merge_group(group):
        # Sort by priority
        group_sorted = group.sort_values("priority")
        # Start from the highest-priority row
        merged = group_sorted.iloc[0].copy()
        # Fill missing values from lower-priority rows
        for _, row in group_sorted.iloc[1:].iterrows():
            for col in merged.index:
                if pd.isna(merged[col]) and not pd.isna(row[col]):
                    merged[col] = row[col]
        return merged

    deduped_df = df_sorted.groupby("Sample ID", group_keys=False).apply(merge_group)

    # Drop helper column
    deduped_df = deduped_df.drop(columns="priority")

    return deduped_df

In [8]:
study_priority = {
    "luad_tcga_pan_can_atlas_2018": 1,
    "luad_tcga": 2,
    "luad_tcga_luad_gdc": 3,
    # Other studies get default priority
}

clinical_df_tcga = deduplicate_clinical(
    clinical_df_tcga, study_priority=study_priority,
)

  deduped_df = df_sorted.groupby("Sample ID", group_keys=False).apply(merge_group)


In [15]:
def merge_columns_with_conflict_warning(df, col1, col2, new_col):
    def merge_row(row):
        val1 = row[col1]
        val2 = row[col2]
        if pd.isna(val1) and pd.isna(val2):
            return pd.NA
        if pd.isna(val1):
            return val2
        if pd.isna(val2):
            return val1
        if val1 == val2:
            return val1
        # If values conflict
        warnings.warn(f"Conflict: '{val1}' != '{val2}' in row with index {row.name}")
        return val1  # or raise Exception / keep val1 or val2 arbitrarily
    
    df[new_col] = df.apply(merge_row, axis=1)
    return df

In [16]:
clinical_df_tcga = merge_columns_with_conflict_warning(
    clinical_df_tcga,
    "Morphology",
    "International Classification of Diseases for Oncology, Third Edition ICD-O-3 Histology Code",
    "Morphology",
)



In [14]:
mask = clinical_df_tcga["Morphology"] == clinical_df_tcga["International Classification of Diseases for Oncology, Third Edition ICD-O-3 Histology Code"]
clinical_df_tcga.loc[~mask, "Patient ID"].head() 

Sample ID
TCGA-05-4245-01    TCGA-05-4245
TCGA-17-Z000-01    TCGA-17-Z000
TCGA-17-Z001-01    TCGA-17-Z001
TCGA-17-Z002-01    TCGA-17-Z002
TCGA-17-Z003-01    TCGA-17-Z003
Name: Patient ID, dtype: object

In [9]:
clinical_df_tcga.to_csv("/home/val/workspaces/histotab/data/processed/clinical_df_tcga_ded.csv", index=False)

In [None]:
with open('column_names.txt', 'w') as f:
    for col in clinical_df_tcga.columns:
        f.write(col + '\n')

In [26]:
histology_map = {
    "8140/3": "Adenocarcinoma, NOS",
    "8255/3": "Adenocarcinoma with mixed subtypes",
    "8260/3": "Papillary adenocarcinoma, NOS",
    "8265/3": "Micropapillary carcinoma, NOS",
    "8550/3": "Acinar cell carcinoma",
    "8480/3": "Mucinous adenocarcinoma",
    "8310/3": "Clear cell adenocarcinoma, NOS",
    "8252/3": "Bronchiolo-alveolar carcinoma, non-mucinous",
    "8253/3": "Invasive mucinous adenocarcinoma",
    "8230/3": "Solid carcinoma, NOS",
    "8507/3": "Invasive micropapillary carcinoma",
    "8250/3": "Lepidic adenocarcinoma",
    "8490/3": "Signet ring cell carcinoma",
    }

pattern_map = {
    "Lepidic adenocarcinoma": "Lepidic",
    "Bronchiolo-alveolar carcinoma, non-mucinous": "Lepidic",  # older term
    
    "Acinar cell carcinoma": "Acinar",

    "Papillary adenocarcinoma, NOS": "Papillary",

    "Solid carcinoma, NOS": "Solid",

    "Invasive micropapillary carcinoma": "Micropapillary",
    "Micropapillary carcinoma, NOS" : "Micropapillary", 

    "Mucinous adenocarcinoma": "To drop",  # not part of 5 canonical patterns
    "Invasive mucinous adenocarcinoma": "To drop",  # not part of 5 canonical patterns

    "Clear cell adenocarcinoma, NOS": "To drop",  # not part of 5 canonical patterns
    "Signet ring cell carcinoma": "To drop",  # not part of 5 canonical patterns

    "Adenocarcinoma with mixed subtypes": "Mixed",  # optionally drop or keep as its own group
    "Adenocarcinoma, NOS": None  # too vague
}

In [27]:
clinical_df_tcga["histologic_subtype"] = clinical_df_tcga["Morphology"].map(histology_map)
clinical_df_tcga["luad_major_pattern"] = clinical_df_tcga["histologic_subtype"].map(pattern_map)

In [28]:
subtype_counts = clinical_df_tcga["luad_major_pattern"].value_counts().sort_values(ascending=False)
subtype_counts

luad_major_pattern
Mixed             110
Papillary          22
Acinar             22
Lepidic            22
To drop            21
Solid               6
Micropapillary      3
Name: count, dtype: int64

In [22]:
clinical_df_tcga = clinical_df_tcga[~clinical_df_tcga["luad_major_pattern"].isin(["To drop"])]

In [None]:
clinical_df_tcga.shape

In [None]:
gene_df = pd.read_csv("/home/val/workspaces/histotab/data/raw/alterations_across_samples.tsv", sep="\t")


In [None]:
study_priority = {
    "tcga_pan_can_atlas_2018": 1,
    "tcga_luad_gdc": 2,
    "tcga_luad": 3,
    "cptac_2020": 1,
    "cptac_3_luad": 2,
    # Other studies get default priority
}


In [None]:
clinical_df["priority"] = clinical_df["Study ID"].map(study_priority).fillna(100)

In [None]:
clinical_df = (
    clinical_df.sort_values("priority")
    .drop_duplicates(subset=["Patient ID", "Sample ID"], keep="first")
    .drop(columns=["priority"])
)
