In [1]:
import pandas as pd
import numpy as np
import warnings

In [2]:
def deduplicate_clinical(df, study_priority, exclude_cols=None, store_merged_columns=None):
    df = df.copy()
    df["priority"] = df["Study ID"].map(study_priority).fillna(999)
    df_sorted = df.sort_values(by=["Sample ID", "priority"])

    if exclude_cols is None:
        exclude_cols = []

    def merge_group(group, sample_id):
        group = group.copy()
        group["Sample ID"] = sample_id  # Inject group key manually
        group_sorted = group.sort_values("priority")
        merged = group_sorted.iloc[0].copy()
        contributing_studies = set([merged["Study ID"]])
        filled_columns = set()

        for _, row in group_sorted.iloc[1:].iterrows():
            contributing_studies.add(row["Study ID"])
            for col in group.columns:
                if col in exclude_cols or col == "priority":
                    continue
                if pd.isna(merged[col]) and not pd.isna(row[col]):
                    merged[col] = row[col]
                    filled_columns.add(col)

        merged["Study ID"] = "+".join(
            sorted(contributing_studies, key=lambda x: study_priority.get(x, 999))
        )
        if store_merged_columns:
            merged[store_merged_columns] = (
                "+".join(sorted(filled_columns)) if filled_columns else ""
            )

        return pd.DataFrame([merged])

    deduped_df = (
        df_sorted.groupby("Sample ID", group_keys=False)
        .apply(lambda g: merge_group(g, g.name), include_groups=False)
        .drop(columns=["priority"])
        .reset_index(drop=True)
    )

    cols = deduped_df.columns.tolist()
    if "Sample ID" in cols:
        cols.insert(0, cols.pop(cols.index("Sample ID")))
        deduped_df = deduped_df[cols]

    return deduped_df



In [3]:
path = "/home/val/workspaces/histotab/data/raw/combined_study_clinical_data.tsv"
clinical_df = pd.read_csv(path, sep="\t", low_memory=False)

In [4]:
clinical_df["Study ID"].unique()  # Ensure 'Study ID' is a string type

array(['luad_cptac_gdc', 'luad_oncosg_2020', 'luad_mskcc_2015',
       'luad_cptac_2020', 'luad_broad', 'lung_nci_2022', 'lung_msk_2017',
       'luad_mskcc_2023_met_organotropism', 'luad_msk_npjpo_2021',
       'luad_mskcc_2020', 'luad_mskimpact_2021',
       'luad_tcga_pan_can_atlas_2018', 'luad_tcga', 'luad_tcga_gdc',
       'luad_tcga_pub'], dtype=object)

In [5]:
clinical_df_tcga = clinical_df[clinical_df["Study ID"].str.startswith("luad_tcga")]
# Drop the old 'luad_tcga_pub' study
clinical_df_tcga = clinical_df_tcga[clinical_df_tcga["Study ID"] != "luad_tcga_pub"]

In [6]:
clinical_df_tcga = clinical_df_tcga.dropna(axis=1, how="all", inplace=False)

In [7]:
clinical_df_tcga["Study ID"].unique()  # Ensure 'Study ID' is a string type

array(['luad_tcga_pan_can_atlas_2018', 'luad_tcga', 'luad_tcga_gdc'],
      dtype=object)

In [8]:
study_priority_tcga = {
    "luad_tcga_pan_can_atlas_2018": 1,
    "luad_tcga": 2,
    "luad_tcga_gdc": 3,
    # Other studies get default priority
}


In [9]:
clinical_df_tcga = deduplicate_clinical(clinical_df_tcga, study_priority=study_priority_tcga)

In [10]:
clinical_df_tcga.to_csv(
    "/home/val/workspaces/histotab/data/processed/luad_tcga_clinical_data.csv", index=False)

In [11]:
icdo3_col = "International Classification of Diseases for Oncology, Third Edition ICD-O-3 Histology Code"
mask = clinical_df_tcga["Morphology"] != clinical_df_tcga[icdo3_col]
clinical_df_tcga.loc[mask, "Patient ID"].shape

(74,)

In [12]:
with open('column_names.txt', 'w') as f:
    for col in clinical_df_tcga.columns:
        f.write(col + '\n')

In [13]:
clinical_df_tcga[icdo3_col].unique()

array(['8140/3', '8255/3', nan, '8260/3', '8550/3', '8480/3', '8310/3',
       '8252/3', '8253/3', '8230/3', '8507/3', '8250/3', '8490/3'],
      dtype=object)

In [14]:
histology_map = {
    "8140/3": "Adenocarcinoma, NOS",
    "8255/3": "Adenocarcinoma with mixed subtypes",
    "8260/3": "Papillary adenocarcinoma, NOS",
    "8265/3": "Micropapillary carcinoma, NOS",
    "8550/3": "Acinar cell carcinoma",
    "8480/3": "Mucinous adenocarcinoma",
    "8310/3": "Clear cell adenocarcinoma, NOS",
    "8252/3": "Bronchiolo-alveolar carcinoma, non-mucinous",
    "8253/3": "Invasive mucinous adenocarcinoma",
    "8230/3": "Solid carcinoma, NOS",
    "8507/3": "Invasive micropapillary carcinoma",
    "8250/3": "Lepidic adenocarcinoma",
    "8490/3": "Signet ring cell carcinoma",
    }

pattern_map = {
    "Lepidic adenocarcinoma": "Lepidic",
    "Bronchiolo-alveolar carcinoma, non-mucinous": "Lepidic",  # older term
    
    "Acinar cell carcinoma": "Acinar",

    "Papillary adenocarcinoma, NOS": "Papillary",

    "Solid carcinoma, NOS": "Solid",

    "Invasive micropapillary carcinoma": "Micropapillary",
    "Micropapillary carcinoma, NOS" : "Micropapillary", 

    "Mucinous adenocarcinoma": "To drop",  # not part of 5 canonical patterns
    "Invasive mucinous adenocarcinoma": "To drop",  # not part of 5 canonical patterns

    "Clear cell adenocarcinoma, NOS": "To drop",  # not part of 5 canonical patterns
    "Signet ring cell carcinoma": "To drop",  # not part of 5 canonical patterns

    "Adenocarcinoma with mixed subtypes": "Mixed",  # optionally drop or keep as its own group
    "Adenocarcinoma, NOS": "Unknown"  # too vague
}

In [15]:
def map_dict(df_col, map_dict):
    """
    Map values in a DataFrame column using a provided dictionary.
    If a value is not found in the dictionary, it will be replaced with 'Value not found'.
    """
    return df_col.map(lambda x: map_dict.get(x, "Value not found") if pd.notna(x) else pd.NA)

In [16]:
clinical_df_tcga["histologic_subtype"] = map_dict(clinical_df_tcga["Morphology"], histology_map)
clinical_df_tcga["histologic_subtype_icdo3"] = map_dict(clinical_df_tcga[icdo3_col], histology_map)

clinical_df_tcga["luad_major_pattern"] = map_dict(clinical_df_tcga["histologic_subtype"], pattern_map)
clinical_df_tcga["luad_major_pattern_icdo3"] = map_dict(clinical_df_tcga["histologic_subtype_icdo3"], pattern_map)

In [17]:
mask = clinical_df_tcga["histologic_subtype"] != clinical_df_tcga["histologic_subtype_icdo3"]
clinical_df_tcga.loc[mask, "Patient ID"].shape

(74,)

In [18]:
clinical_df_tcga["luad_major_pattern"] =  clinical_df_tcga["luad_major_pattern"].fillna("Unknown")
clinical_df_tcga["luad_major_pattern_icdo3"] =  clinical_df_tcga["luad_major_pattern_icdo3"].fillna("Unknown")

In [19]:
clinical_df_tcga["luad_major_pattern"].isna().sum(), clinical_df_tcga["luad_major_pattern_icdo3"].isna().sum()

(np.int64(0), np.int64(0))

In [20]:
subtype_counts = clinical_df_tcga["luad_major_pattern"].value_counts().sort_values(ascending=False)
subtype_counts

luad_major_pattern
Unknown           382
Mixed             110
Papillary          22
Acinar             22
Lepidic            22
To drop            21
Solid               6
Micropapillary      3
Name: count, dtype: int64

In [21]:
subtype_counts = clinical_df_tcga["luad_major_pattern_icdo3"].value_counts().sort_values(ascending=False)
subtype_counts

luad_major_pattern_icdo3
Unknown           381
Mixed             109
Acinar             24
Papillary          22
To drop            22
Lepidic            22
Solid               5
Micropapillary      3
Name: count, dtype: int64

In [22]:
mask = clinical_df_tcga["luad_major_pattern"] == \
       clinical_df_tcga["luad_major_pattern_icdo3"]
clinical_df_tcga.loc[~mask, ["Patient ID", "luad_major_pattern", "luad_major_pattern_icdo3"]]

Unnamed: 0,Patient ID,luad_major_pattern,luad_major_pattern_icdo3
361,TCGA-69-7763,Mixed,Acinar
368,TCGA-69-7980,Unknown,Acinar
387,TCGA-73-A9RS,Unknown,To drop
508,TCGA-95-A4VN,Solid,Unknown


In [23]:
def merge_patterns(row):
    a = row["luad_major_pattern"]
    b = row["luad_major_pattern_icdo3"]
    
    if a == "To drop" or b == "To drop":
        return "To drop"
    if a == "Unknown": 
        return b
    if b == "Unknown":
        return a
    if a == "Mixed":
        return b
    if b == "Mixed":
        return a
    if a == b:
        return a
    return "CONFLICTS"

In [24]:
clinical_df_tcga["merged_major_pattern"] = clinical_df_tcga.apply(merge_patterns, axis=1)

In [25]:
mask = clinical_df_tcga["luad_major_pattern"] == \
       clinical_df_tcga["luad_major_pattern_icdo3"]
clinical_df_tcga.loc[~mask, ["Patient ID", "luad_major_pattern", "luad_major_pattern_icdo3", "merged_major_pattern"]]

Unnamed: 0,Patient ID,luad_major_pattern,luad_major_pattern_icdo3,merged_major_pattern
361,TCGA-69-7763,Mixed,Acinar,Acinar
368,TCGA-69-7980,Unknown,Acinar,Acinar
387,TCGA-73-A9RS,Unknown,To drop,To drop
508,TCGA-95-A4VN,Solid,Unknown,Solid


In [26]:
clinical_df_tcga.loc[mask, ["Patient ID", "luad_major_pattern", "luad_major_pattern_icdo3", "merged_major_pattern"]].head(20)

Unnamed: 0,Patient ID,luad_major_pattern,luad_major_pattern_icdo3,merged_major_pattern
0,TCGA-05-4244,Unknown,Unknown,Unknown
1,TCGA-05-4245,Unknown,Unknown,Unknown
2,TCGA-05-4249,Unknown,Unknown,Unknown
3,TCGA-05-4250,Unknown,Unknown,Unknown
4,TCGA-05-4382,Mixed,Mixed,Mixed
5,TCGA-05-4384,Mixed,Mixed,Mixed
6,TCGA-05-4389,Mixed,Mixed,Mixed
7,TCGA-05-4390,Mixed,Mixed,Mixed
8,TCGA-05-4395,Mixed,Mixed,Mixed
9,TCGA-05-4396,Mixed,Mixed,Mixed


In [27]:
subtype_counts = clinical_df_tcga["merged_major_pattern"].value_counts().sort_values(ascending=False)
subtype_counts

merged_major_pattern
Unknown           380
Mixed             109
Acinar             24
Papillary          22
To drop            22
Lepidic            22
Solid               6
Micropapillary      3
Name: count, dtype: int64

In [28]:
clinical_df_tcga.drop(columns=["histologic_subtype", "histologic_subtype_icdo3"], inplace=True)

In [29]:
clinical_df_tcga = clinical_df_tcga[~clinical_df_tcga["luad_major_pattern"].isin(["To drop"])]

In [30]:
clinical_df_tcga.to_csv("/home/val/workspaces/histotab/data/processed/clinical_df_tcga_ded.csv", index=False)

In [31]:
clinical_df_tcga["Oncotree Code"].unique()

array(['LUAD'], dtype=object)

In [32]:
clinical_col_tcga = [
    "Study ID",
    "Patient ID",
    "Sample ID",
    "merged_major_pattern",
    "Diagnosis Age",
    "Mutation Count",
    "Progress Free Survival (Months)",
    "Sex",
    "TMB (nonsynonymous)",
    "Disease Free (Months)",
    "Disease Free Status",
    "Overall Survival (Months)",
    "Overall Survival Status",
    "Fraction Genome Altered",
    "Longest Dimension",
    "Shortest Dimension",
    "Months of disease-specific survival",
    "Disease-specific Survival status",
    "Progression Free Status",
]

In [33]:
clinical_df_tcga[clinical_col_tcga].shape

(567, 19)

In [34]:
clinical_df["Study ID"].unique()  # Ensure 'Study ID' is a string type

array(['luad_cptac_gdc', 'luad_oncosg_2020', 'luad_mskcc_2015',
       'luad_cptac_2020', 'luad_broad', 'lung_nci_2022', 'lung_msk_2017',
       'luad_mskcc_2023_met_organotropism', 'luad_msk_npjpo_2021',
       'luad_mskcc_2020', 'luad_mskimpact_2021',
       'luad_tcga_pan_can_atlas_2018', 'luad_tcga', 'luad_tcga_gdc',
       'luad_tcga_pub'], dtype=object)

In [35]:
clinical_df_msk = clinical_df[clinical_df["Study ID"].str.startswith("luad_msk")]

In [36]:
clinical_df_msk["Study ID"].unique()  # Ensure 'Study ID' is a string type

array(['luad_mskcc_2015', 'luad_mskcc_2023_met_organotropism',
       'luad_msk_npjpo_2021', 'luad_mskcc_2020', 'luad_mskimpact_2021'],
      dtype=object)

In [37]:
# Focus on MSK cancer cell 2023 (mskcc) cause lot of data 
# and mskcc_2020 cause luad patterns
clinical_df_msk = clinical_df[
    (clinical_df["Study ID"] == "luad_mskcc_2020")
    | (clinical_df["Study ID"] == "luad_mskcc_2023_met_organotropism")
]

In [38]:
clinical_df_msk = clinical_df_msk.dropna(axis=1, how="all", inplace=False)

In [39]:
clinical_df_msk.shape

(3257, 110)

In [40]:
subtype_counts = clinical_df_msk["Predominant Histologic Subtype"].value_counts().sort_values(ascending=False)
subtype_counts

Predominant Histologic Subtype
Acinar            368
Lepidic            88
Solid              68
Papillary          43
Micropapillary     37
Name: count, dtype: int64

In [41]:
subtype_counts = clinical_df_msk["Predominant Histologic Subtype.1"].value_counts().sort_values(ascending=False)
subtype_counts

Predominant Histologic Subtype.1
Acinar            300
Unknown           112
Solid              93
Lepidic            61
Papillary          39
Micropapillary     36
Name: count, dtype: int64

In [42]:
clinical_df_msk["Sample Type"].value_counts().sort_values(ascending=False)

Sample Type
Metastasis    1666
Primary        987
Name: count, dtype: int64

In [43]:
study_priority_msk = {
    "luad_mskcc_2020": 1,
    "luad_mskcc_2023_met_organotropism": 2,
}
clinical_df_msk = deduplicate_clinical(
    clinical_df_msk, study_priority=study_priority_msk,
    )

In [44]:
clinical_df_msk["Sample Type"] = clinical_df_msk["Sample Type"].fillna("Not Filled")

In [45]:
clinical_df_msk["Sample Type"].value_counts().sort_values(ascending=False)

Sample Type
Metastasis    1666
Primary        987
Not Filled     238
Name: count, dtype: int64

In [46]:
clinical_df_msk.shape

(2891, 110)

In [47]:
clinical_df_msk = clinical_df_msk[clinical_df_msk["Sample Type"] != "Metastasis"]

In [48]:
clinical_df_msk[clinical_df_msk["Sample Type"] == "Not Filled"]["Study ID"].unique()

array(['luad_mskcc_2020'], dtype=object)

In [49]:
clinical_df_msk.to_csv("/home/val/workspaces/histotab/data/processed/clinical_df_msk.csv", index=False)

In [50]:
clinical_df_msk["Predominant Histologic Subtype.1"].value_counts().sort_values(ascending=False)

Predominant Histologic Subtype.1
Acinar            300
Unknown           112
Solid              93
Lepidic            61
Papillary          39
Micropapillary     36
Name: count, dtype: int64

In [51]:
# Replacing NaN values in histologic subtype columns with "Unknown" cause Unkown already presents
clinical_df_msk["Predominant Histologic Subtype"] = clinical_df_msk["Predominant Histologic Subtype"].fillna("Unknown")
clinical_df_msk ["Predominant Histologic Subtype.1"] = clinical_df_msk["Predominant Histologic Subtype.1"].fillna("Unknown")

In [52]:
mask = clinical_df_msk["Predominant Histologic Subtype"] != clinical_df_msk["Predominant Histologic Subtype.1"]
clinical_df_msk.loc[mask, ["Patient ID", "Predominant Histologic Subtype", "Predominant Histologic Subtype.1"]]

Unnamed: 0,Patient ID,Predominant Histologic Subtype,Predominant Histologic Subtype.1
7,P-0000280,Acinar,Unknown
8,P-0000348,Unknown,Papillary
10,P-0000459,Unknown,Papillary
20,P-0000627,Acinar,Unknown
26,P-0000642,Unknown,Acinar
...,...,...,...
2272,P-0040055,Unknown,Acinar
2274,P-0040133,Unknown,Papillary
2284,P-0040655,Unknown,Acinar
2304,P-0041217,Unknown,Papillary


In [53]:
clinical_df_msk["Predominant Histologic Subtype.1"].value_counts().sort_values(ascending=False)

Predominant Histologic Subtype.1
Unknown           696
Acinar            300
Solid              93
Lepidic            61
Papillary          39
Micropapillary     36
Name: count, dtype: int64

In [54]:
def merge_patterns(row):
    a = row["Predominant Histologic Subtype"]
    b = row["Predominant Histologic Subtype.1"]
    
    if a == "Unknown":
        return b
    if b == "Unknown":
        return a
    if a == b:
        return a
    return "CONFLICTS"

In [55]:
def merge_age(row):
    a = row["Age At Surgery"]
    b = row["Age at Surgery/Biopsy"]
    
    if pd.isna(a) and pd.isna(b):
        return "Not Filled"
    if pd.isna(a):
        return b
    return a

In [56]:
clinical_df_msk["merged_histologic_subtype"] = clinical_df_msk.apply(merge_patterns, axis=1)
clinical_df_msk["merged_aged"] = clinical_df_msk.apply(merge_age, axis=1)

In [57]:
clinical_cols_msk = [
    "Study ID",
    "Patient ID",
    "Sample ID",
    "Age At Surgery",  
    "Age at Surgery/Biopsy",
    "Mutation Count",
    "Sex",
    "TMB (nonsynonymous)",
    "Relapse Free Status (Months)",
    "Relapse Free Status",
    "Overall Survival (Months)",
    "Overall Survival Status",
    "Fraction Genome Altered",
    "CT Size",
    "Predominant Histologic Subtype",
    "Predominant Histologic Subtype.1",
    "merged_histologic_subtype",
    "merged_aged",
]


In [58]:
clinical_df_msk = clinical_df_msk[clinical_cols_msk]

In [59]:
clinical_df_msk.to_csv("/home/val/workspaces/histotab/data/processed/clinical_df_msk_ded.csv", index=False)

In [60]:
clinical_df_msk["merged_histologic_subtype"].value_counts().sort_values(ascending=False)

merged_histologic_subtype
Acinar            454
Unknown           448
Solid             121
Lepidic            96
Papillary          54
Micropapillary     52
Name: count, dtype: int64

In [61]:
clinical_df_msk.shape

(1225, 18)

In [62]:
gene_df = pd.read_csv("/home/val/workspaces/histotab/data/raw/alterations_across_samples.tsv", sep="\t")

In [63]:
col_to_keep = [c for c in gene_df.columns if ":" not in c]
gene_df = gene_df[col_to_keep]

In [64]:
# Check for missing values firs-
if gene_df.isna().any().any():
    raise ValueError("DataFrame contains NaN values please handle them before mapping.")

# Define a mapping function
def classify_mutation(val):
    if val == "no alteration":
        return "WT"
    elif val == "not profiled":
        return "not profiled"
    else:
        return "MUT"

# Apply the mapping to gene columns only (exclude non-gene metadata columns)
non_gene_cols = ["Study ID", "Sample ID", "Patient ID", "Altered"]
gene_cols = [col for col in gene_df.columns if col not in non_gene_cols]

# Apply mapping to gene columns
gene_df[gene_cols] = gene_df[gene_cols].map(classify_mutation)


In [65]:
gene_df.head()

Unnamed: 0,Study ID,Sample ID,Patient ID,Altered,EGFR,KRAS,BRAF,TP53,PIK3CA,PTEN,...,ALK,AKT1,SMARCA4,ARID1A,ARID2,RIT1,MAP2K1,SPOP,NFE2L2,TSC2
0,luad_broad,LU-A08-43,LU-A08-43,1,MUT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
1,luad_broad,LUAD-2GUGK,LUAD-2GUGK,1,WT,WT,WT,MUT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
2,luad_broad,LUAD-5O6B5,LUAD-5O6B5,1,WT,WT,WT,MUT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
3,luad_broad,LUAD-5V8LT,LUAD-5V8LT,1,WT,WT,WT,MUT,WT,WT,...,WT,WT,MUT,WT,WT,WT,WT,WT,WT,WT
4,luad_broad,LUAD-74TBW,LUAD-74TBW,1,MUT,WT,WT,MUT,WT,WT,...,MUT,WT,WT,WT,MUT,WT,WT,WT,WT,WT


In [66]:
gene_df_tcga = gene_df[gene_df["Study ID"].isin(study_priority_tcga.keys())]
gene_df_tcga.shape

(1723, 46)

In [None]:
def deduplicate_genes(df, study_priority):
    df = df.copy()
    df["priority"] = df["Study ID"].map(study_priority).fillna(999)
    df = df.sort_values(by=["Sample ID", "priority"])
    df = df.drop_duplicates(subset=["Sample ID"], keep="first")
    df = df.drop(columns=["priority"])
    return df

In [68]:
gene_df_tcga = gene_df[gene_df["Study ID"].isin(study_priority_tcga.keys())]
gene_df_tcga.shape

(1723, 46)

In [69]:
gene_df_tcga = deduplicate_genes(gene_df_tcga, study_priority_msk)
gene_df_tcga.shape

(588, 46)

In [70]:
gene_df_msk = gene_df[gene_df["Study ID"].isin(study_priority_msk.keys())]
gene_df_msk.shape

(3257, 46)

In [71]:
gene_df_msk = deduplicate_genes(gene_df_msk, study_priority_msk)
gene_df_msk.shape

(2891, 46)

In [72]:
clinical_df_tcga.columns

Index(['Sample ID', 'Study ID', 'Patient ID', 'Diagnosis Age', 'Cancer Type',
       'Cancer Type Detailed', 'Ethnicity Category', 'Mutation Count',
       'Oncotree Code', 'Progress Free Survival (Months)',
       ...
       'Tumor Break Load', 'Tissue Prospective Collection Indicator.1',
       'Tissue Retrospective Collection Indicator.1',
       'Tissue Source Site Code', 'Tumor Disease Anatomic Site', 'Tumor Type',
       'Winter Hypoxia Score', 'luad_major_pattern',
       'luad_major_pattern_icdo3', 'merged_major_pattern'],
      dtype='object', length=128)

In [73]:
colname_mapping_msk = {
    "Sample ID": "Sample ID",
    "Patient ID": "Patient ID",
    "Study ID": "Study ID",
    # "merged_age" : "Age",
    # "Fraction Genome Altered": "Fraction Genome Altered",
    "merged_histologic_subtype": "Predominant Histologic Pattern",
}
colname_mapping_tcga = {
    "Sample ID": "Sample ID",
    "Patient ID": "Patient ID",
    "Study ID": "Study ID",
    "merged_major_pattern": "Predominant Histologic Pattern",
}

In [90]:
final_df_tcga  = clinical_df_tcga[colname_mapping_tcga.keys()]
final_df_msk = clinical_df_msk[colname_mapping_msk.keys()]

In [91]:
final_df_tcga.columns = [colname_mapping_tcga[c] for c in final_df_tcga.columns]
final_df_msk.columns = [colname_mapping_msk[c] for c in final_df_msk.columns]

In [92]:
def merge_dfs(gene_df, clinical_df):
    clinical_df = clinical_df.copy()
    clinical_df = clinical_df.sort_values(by=["Sample ID"], inplace=False)
    clinical_df = clinical_df.drop_duplicates(subset=["Patient ID"], keep="first", inplace=False)

    gene_df = gene_df.drop(columns=["Altered", "Study ID", "Patient ID"])
    gene_df = gene_df.set_index("Sample ID")
    clinical_df = clinical_df.set_index("Sample ID")
    clinical_df  = clinical_df.join(gene_df, how="inner")
    return clinical_df

In [93]:
final_df_tcga = merge_dfs(gene_df_tcga, final_df_tcga)
final_df_tcga.to_csv("/home/val/workspaces/histotab/data/processed/tcga_pattern_gene_mutations.csv")

In [94]:
final_df_msk = merge_dfs(gene_df_msk, final_df_msk)
final_df_msk.to_csv("/home/val/workspaces/histotab/data/processed/msk_pattern_gene_mutations.csv")

In [95]:

final_df_msk

Unnamed: 0_level_0,Patient ID,Study ID,Predominant Histologic Pattern,EGFR,KRAS,BRAF,TP53,PIK3CA,PTEN,RBM10,...,ALK,AKT1,SMARCA4,ARID1A,ARID2,RIT1,MAP2K1,SPOP,NFE2L2,TSC2
Sample ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P-0000208-T01-WES,P-0000208,luad_mskcc_2023_met_organotropism,Unknown,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0000219-T01-IM3,P-0000219,luad_mskcc_2020+luad_mskcc_2023_met_organotropism,Solid,MUT,WT,MUT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0000280-T01-IM3,P-0000280,luad_mskcc_2020,Acinar,MUT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0000348-T01-IM3,P-0000348,luad_mskcc_2023_met_organotropism,Papillary,MUT,WT,WT,MUT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0000459-T01-IM3,P-0000459,luad_mskcc_2023_met_organotropism,Papillary,WT,WT,WT,WT,WT,WT,WT,...,WT,WT,WT,WT,MUT,WT,WT,WT,WT,WT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
P-0066345-T02-IM7,P-0066345,luad_mskcc_2023_met_organotropism,Unknown,MUT,WT,WT,MUT,WT,WT,MUT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0067213-T01-IM7,P-0067213,luad_mskcc_2023_met_organotropism,Unknown,MUT,WT,WT,MUT,WT,WT,WT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0068155-T02-IM7,P-0068155,luad_mskcc_2023_met_organotropism,Unknown,WT,WT,WT,MUT,WT,WT,WT,...,MUT,WT,WT,WT,WT,WT,WT,WT,WT,WT
P-0069037-T01-IM7,P-0069037,luad_mskcc_2023_met_organotropism,Unknown,WT,MUT,WT,WT,WT,WT,MUT,...,WT,WT,WT,WT,WT,WT,WT,WT,WT,WT
