# Prerequisites

In [1]:
import gzip
import os

import pandas as pd

# Remove the cap on displayed rows so frames and series are never truncated.
pd.options.display.max_rows = None

# Import data and process

In [2]:
# Root directory holding every input file used by this notebook.
# Defaults to the original local location; set the DKS_DATA_DIR environment
# variable to point somewhere else without editing the notebook.
# os.path.join(..., "") guarantees a trailing path separator, which matters
# because file paths are built from data_dir by plain string concatenation.
data_dir = os.path.join(
    os.environ.get("DKS_DATA_DIR", "/Users/patrick/Documents/Data/DKS/"),
    "",
)

Import the Mutation Annotation Format (MAF) file:

In [3]:
# Location of the gzipped, tab-separated MAF file inside the data directory.
maf_path = data_dir + "mc3.v0.2.8.PUBLIC.maf.gz"

# Decompress as text and parse in one go. Text following a '#' is treated as
# a comment, and low_memory=False makes pandas process the file as a whole
# rather than in internal chunks.
with gzip.open(maf_path, mode='rt') as maf_handle:
    maf = pd.read_csv(
        maf_handle,
        sep='\t',
        comment='#',
        low_memory=False,
    )

print("Total mutations loaded:", len(maf))

Total mutations loaded: 3600963


Import clinical, sample, gene expression, and Bagaev annotation data:

In [4]:
# Locations of the four auxiliary inputs, all inside data_dir.
clinical_path = data_dir + "clinical.project-tcga-skcm.2025-05-02/clinical.tsv"
sample_path = data_dir + "biospecimen.project-tcga-skcm.2025-05-03/sample.tsv"
expr_path = data_dir + "TCGA-SKCM.star_fpkm.tsv"
bag_path = data_dir + "mmc2.xlsx"

# Tab-separated tables.
clinical = pd.read_csv(clinical_path, sep='\t')
sample = pd.read_csv(sample_path, sep='\t')
# The first column of the expression table becomes the row index.
expr = pd.read_csv(expr_path, sep='\t', index_col=0)

# Third worksheet of the Excel workbook (sheet_name is zero-based).
# NOTE(review): the recorded bag.head() output shows the sheet title as the
# column header and the real column names (Gene, Gene signature, Description,
# PMID) in row 0 -- consider header=1 unless a later cell already handles it.
bag = pd.read_excel(bag_path, sheet_name=2)

Get primary tumor samples only:

In [5]:
# Keep only the rows annotated as primary tumors.
primary_samples = sample[sample['samples.sample_type'] == 'Primary Tumor']
print("Total number of samples:", primary_samples.shape[0])

# Count primary-tumor samples per patient.
# The sample submitter ID identifies a sample, not a patient, so counting it
# directly cannot reveal a patient with several primary samples. The patient
# is identified by the first 12 characters of the TCGA barcode
# (e.g. TCGA-FR-A726), so the counts are taken on that prefix instead.
patient_ids = primary_samples['samples.submitter_id'].str.slice(0, 12)
sample_counts = patient_ids.value_counts()
num_with_multiple = (sample_counts > 1).sum()
print(f"{num_with_multiple} patients have multiple primary tumor samples.")

Total number of samples: 396
0 patients have multiple primary tumor samples.


Filter MAF file for these primary tumors:

In [6]:
# Sample-level submitter IDs of the primary tumors.
primary_barcodes = primary_samples['samples.submitter_id']

# Truncate every MAF barcode to its first 16 characters so it can be matched
# against the sample IDs. This adds a column to `maf` itself.
maf['sample_barcode'] = maf['Tumor_Sample_Barcode'].str[:16]

# Retain only the mutations whose truncated barcode is a primary tumor sample.
is_primary = maf['sample_barcode'].isin(primary_barcodes)
skcm_maf = maf[is_primary]
print(f"SKCM primary tumor mutations: {len(skcm_maf)}")

SKCM primary tumor mutations: 79214


Mutation types and frequencies:

In [7]:
# Tally each variant classification once and derive the percentage from that
# same tally, instead of scanning the column a second time with
# value_counts(normalize=True).
# NOTE(review): this summarises the full `maf` table, not the filtered
# `skcm_maf` -- confirm that the pan-cohort overview is what is intended here.
variant_counts = maf['Variant_Classification'].value_counts()
mutation_summary = pd.DataFrame({
    'Count': variant_counts,
    'Percent': (variant_counts / variant_counts.sum() * 100).round(2),
})
mutation_summary

Unnamed: 0_level_0,Count,Percent
Variant_Classification,Unnamed: 1_level_1,Unnamed: 2_level_1
Missense_Mutation,1921979,53.37
Silent,782687,21.74
3'UTR,282636,7.85
Nonsense_Mutation,157232,4.37
Intron,108104,3.0
Frame_Shift_Del,87013,2.42
5'UTR,81323,2.26
Splice_Site,50617,1.41
RNA,49540,1.38
Frame_Shift_Ins,27128,0.75


Non-synonymous mutations only:

In [8]:
# Variant classifications that this analysis treats as non-synonymous.
non_synonymous = [
    'Missense_Mutation',
    'Nonsense_Mutation',
    'Frame_Shift_Del',
    'Frame_Shift_Ins',
    'Splice_Site',
    'Translation_Start_Site',
    'In_Frame_Del',
    'In_Frame_Ins',
    'Nonstop_Mutation',
]

# Boolean mask over the SKCM mutations, then the matching subset.
keep_rows = skcm_maf['Variant_Classification'].isin(non_synonymous)
ns_maf = skcm_maf.loc[keep_rows]
print(f"Non-synonymous mutations: {len(ns_maf)}")

Non-synonymous mutations: 47273


Tumor mutational burden (TMB) per sample:

In [10]:
# Divisor applied to the raw mutation count to obtain TMB.
# NOTE(review): presumably the size of the sequenced exome in megabases --
# confirm the value and record its source.
EXOME_SIZE_MB = 38

# One row per tumor barcode with its number of non-synonymous mutations,
# followed by the derived TMB and a three-way equal-frequency binning.
# NOTE(review): the 'sample_barcode' column produced here holds the full
# Tumor_Sample_Barcode, whereas maf['sample_barcode'] holds only its first
# 16 characters -- check before joining the two on this name.
tmb_counts = (
    ns_maf['Tumor_Sample_Barcode']
    .value_counts()
    .rename_axis('sample_barcode')
    .reset_index(name='mutation_count')
    .assign(
        TMB=lambda counts: counts['mutation_count'] / EXOME_SIZE_MB,
        TMB_bin=lambda counts: pd.qcut(
            counts['TMB'], q=3, labels=['Low', 'Medium', 'High']
        ),
    )
)

In [11]:
# Preview the first rows of the per-barcode TMB table.
tmb_counts.head()

Unnamed: 0,sample_barcode,mutation_count,TMB,TMB_bin
0,TCGA-FR-A726-01A-11D-A32N-08,5776,152.0,High
1,TCGA-BF-A1Q0-01A-21D-A19A-08,2592,68.210526,High
2,TCGA-GN-A26C-01A-11D-A19A-08,2412,63.473684,High
3,TCGA-YG-AA3N-01A-11D-A38G-08,1644,43.263158,High
4,TCGA-BF-A1PZ-01A-11D-A19A-08,1594,41.947368,High


# Gene expression based T-cell classification

In [12]:
# Preview the first rows of the sheet loaded from the Excel workbook.
bag.head()

Unnamed: 0,Table S1. The 29 Fges developed for the TME classification platform. Related to Figure 1.,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,Gene,Gene signature,Description,PMID
1,ANGPT1,Angiogenesis,"ANGPT1 functions as a TIE2 agonist, which indu...",20651738
2,ANGPT2,Angiogenesis,ANGPT2 normally functions as an ANGPT1antagoni...,20651738
3,CDH5,Angiogenesis,Vascular endothelial (VE)-cadherin is specific...,16256984
4,CXCL5,Angiogenesis,"CXCL5, a member of the CXC chemokine family, h...",21356384


# Pan-Cancer Atlas immune subtypes (C1-C6)