# Prereqs

In [1]:
import gzip
import os

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Show every row when a DataFrame/Series is displayed (None disables truncation).
pd.options.display.max_rows = None

# Import data and process

In [2]:
# Root folder holding the input files of this notebook. Set the DKS_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# local path is used. os.path.join(..., "") guarantees a trailing separator,
# which matters because file paths are built by plain string concatenation
# (data_dir + "file name").
data_dir = os.path.join(
    os.environ.get("DKS_DATA_DIR", "/Users/patrick/Documents/Data/DKS/"), ""
)

Import Mutation Annotation Format file:

In [3]:
# Location of the gzip-compressed MAF file inside the data folder.
maf_path = data_dir + "mc3.v0.2.8.PUBLIC.maf.gz"

# Decompress on the fly in text mode and parse as a tab-separated table.
# comment='#' makes the parser ignore everything from a '#' to the end of the
# line; low_memory=False parses the file in one pass instead of in chunks.
with gzip.open(maf_path, mode='rt') as maf_handle:
    maf = pd.read_csv(
        maf_handle,
        sep='\t',
        comment='#',
        low_memory=False,
    )

print("Total mutations loaded:", maf.shape[0])

Total mutations loaded: 3600963


Import clinical, sample, gene expression, and Bagaev annotation data:

In [4]:
# Paths of the remaining input tables, all relative to data_dir.
clinical_path = data_dir + "clinical.project-tcga-skcm.2025-05-02/clinical.tsv"
sample_path = data_dir + "biospecimen.project-tcga-skcm.2025-05-03/sample.tsv"
expr_path = data_dir + "TCGA-SKCM.star_fpkm.tsv"
bag_path = data_dir + "mmc2.xlsx"
gencode_map_path = data_dir + "gencode.v36.annotation.gtf.gene.probemap"

# Clinical and biospecimen tables (tab-separated).
clinical = pd.read_csv(clinical_path, sep='\t')
sample = pd.read_csv(sample_path, sep='\t')

# Expression matrix; the first column becomes the row index.
expr = pd.read_csv(expr_path, sep='\t', index_col=0)

# Annotation workbook: third sheet (0-based index 2), column names taken from
# the second row of that sheet (header=1).
bag = pd.read_excel(bag_path, sheet_name=2, header=1)

# Probe map table (tab-separated).
gencode_map = pd.read_csv(gencode_map_path, sep='\t')

Get primary tumor samples only:

In [5]:
# Keep only the biospecimen rows annotated as primary tumors.
primary_samples = sample[sample['samples.sample_type'] == 'Primary Tumor']
print("Total number of samples:", primary_samples.shape[0])

# Occurrences of each sample barcode; a value above 1 means a duplicated row.
sample_counts = primary_samples['samples.submitter_id'].value_counts()

# Count distinct primary tumor samples per patient. Counting the sample
# barcodes themselves (as sample_counts does) can only reveal duplicated rows,
# never a patient with several samples, so the patient-level key is derived
# from the barcode prefix instead.
# NOTE(review): assumes TCGA-style barcodes whose first 12 characters
# (TCGA-XX-XXXX) identify the patient - confirm against a case-level ID column
# if the table provides one.
patient_counts = (
    primary_samples['samples.submitter_id']
    .drop_duplicates()
    .str.slice(0, 12)
    .value_counts()
)
num_with_multiple = (patient_counts > 1).sum()
print(f"{num_with_multiple} patients have multiple primary tumor samples.")

Total number of samples: 396
0 patients have multiple primary tumor samples.


Filter MAF file for these primary tumors:

In [6]:
# Sample barcodes of the primary tumors selected above.
primary_barcodes = primary_samples['samples.submitter_id']

# Truncate the MAF tumor barcode to its first 16 characters so that it can be
# compared with the biospecimen sample barcodes.
maf['sample_barcode'] = maf['Tumor_Sample_Barcode'].str[:16]

# Retain the mutations whose truncated barcode belongs to a primary tumor.
is_primary_tumor = maf['sample_barcode'].isin(primary_barcodes)
skcm_maf = maf[is_primary_tumor]
print(f"SKCM primary tumor mutations: {len(skcm_maf)}")

SKCM primary tumor mutations: 79214


Mutation types and frequencies:

In [7]:
# Absolute and relative frequency of every variant classification.
# NOTE(review): this summarizes the full `maf` table, not the primary-tumor
# subset `skcm_maf` - confirm that this is intended.
variant_classes = maf['Variant_Classification']
mutation_summary = pd.DataFrame({
    'Count': variant_classes.value_counts(),
    'Percent': variant_classes.value_counts(normalize=True).mul(100).round(2),
})
mutation_summary

Unnamed: 0_level_0,Count,Percent
Variant_Classification,Unnamed: 1_level_1,Unnamed: 2_level_1
Missense_Mutation,1921979,53.37
Silent,782687,21.74
3'UTR,282636,7.85
Nonsense_Mutation,157232,4.37
Intron,108104,3.0
Frame_Shift_Del,87013,2.42
5'UTR,81323,2.26
Splice_Site,50617,1.41
RNA,49540,1.38
Frame_Shift_Ins,27128,0.75


Non-synonymous mutations only:

In [8]:
# Variant classifications treated as non-synonymous in this analysis.
non_synonymous = [
    'Missense_Mutation',
    'Nonsense_Mutation',
    'Frame_Shift_Del',
    'Frame_Shift_Ins',
    'Splice_Site',
    'Translation_Start_Site',
    'In_Frame_Del',
    'In_Frame_Ins',
    'Nonstop_Mutation',
]

# Restrict the primary-tumor mutations to those classifications.
is_non_synonymous = skcm_maf['Variant_Classification'].isin(non_synonymous)
ns_maf = skcm_maf[is_non_synonymous]
print(f"Non-synonymous mutations: {len(ns_maf)}")

Non-synonymous mutations: 47273


Tumor mutational burden (TMB) per sample:

In [9]:
# One row per tumor barcode with its number of non-synonymous mutations.
# Barcodes with no mutation in ns_maf do not appear (value_counts only lists
# observed values). The 'sample_barcode' column holds the full
# Tumor_Sample_Barcode here, not the 16-character form.
# NOTE(review): the divisor 38 is presumably the exome size in Mb used to turn
# a count into mutations per Mb - TODO confirm.
tmb_counts = (
    ns_maf['Tumor_Sample_Barcode']
    .value_counts()
    .rename_axis('sample_barcode')
    .reset_index(name='mutation_count')
    .assign(TMB=lambda counts: counts['mutation_count'] / 38)
    # Split the samples into three equally sized groups by TMB.
    .assign(
        TMB_bin=lambda counts: pd.qcut(
            counts['TMB'], q=3, labels=['Low', 'Medium', 'High']
        )
    )
)

In [10]:
# Preview the first five rows of the per-sample TMB table.
tmb_counts.head(n=5)

Unnamed: 0,sample_barcode,mutation_count,TMB,TMB_bin
0,TCGA-FR-A726-01A-11D-A32N-08,5776,152.0,High
1,TCGA-BF-A1Q0-01A-21D-A19A-08,2592,68.210526,High
2,TCGA-GN-A26C-01A-11D-A19A-08,2412,63.473684,High
3,TCGA-YG-AA3N-01A-11D-A38G-08,1644,43.263158,High
4,TCGA-BF-A1PZ-01A-11D-A19A-08,1594,41.947368,High


# Gene-expression-based T-cell classification

In [11]:
# Preview the first five rows of the gene-signature annotation table.
bag.head(n=5)

Unnamed: 0,Gene,Gene signature,Description,PMID
0,ANGPT1,Angiogenesis,"ANGPT1 functions as a TIE2 agonist, which indu...",20651738
1,ANGPT2,Angiogenesis,ANGPT2 normally functions as an ANGPT1antagoni...,20651738
2,CDH5,Angiogenesis,Vascular endothelial (VE)-cadherin is specific...,16256984
3,CXCL5,Angiogenesis,"CXCL5, a member of the CXC chemokine family, h...",21356384
4,CXCL8,Angiogenesis,IL8 is a proinflammatory CXC chemokine associa...,18980965


In [47]:
# Strip the trailing ".<version>" from the row labels of the expression matrix
# so they can be matched against the version-less IDs of the probe map.
# The pattern is anchored (r"\.\d+$") and identical to the one applied to
# gencode_map['id']: a greedy r"\..*" would also swallow any text following the
# version number and could collapse distinct rows onto the same label, leaving
# a non-unique index that the probe-map IDs no longer agree with.
expr.index = expr.index.str.replace(r"\.\d+$", "", regex=True)

In [59]:
# Add a version-less copy of the probe-map IDs: a trailing ".<digits>" is
# removed, everything else is kept unchanged.
versioned_ids = gencode_map['id']
gencode_map['Ensembl_ID'] = versioned_ids.str.replace(r"\.\d+$", "", regex=True)

In [64]:
def pathway_scores(pathway_name):
    """Return the expression rows of the genes belonging to one gene signature.

    Reads the notebook-level tables `bag` (columns 'Gene', 'Gene signature'),
    `gencode_map` (columns 'gene', 'Ensembl_ID') and `expr` (rows labelled by
    Ensembl ID).

    Parameters
    ----------
    pathway_name : str
        Value of the 'Gene signature' column in `bag` to select.

    Returns
    -------
    pandas.DataFrame
        The rows of `expr` whose label is among the Ensembl IDs mapped to the
        signature's genes. Empty when nothing matches. Despite the function
        name, no score is computed.
    """
    # Gene symbols annotated with the requested signature.
    in_signature = bag['Gene signature'] == pathway_name
    signature_genes = bag.loc[in_signature, 'Gene'].unique()

    # Translate the symbols to Ensembl IDs through the probe map.
    symbol_found = gencode_map['gene'].isin(signature_genes)
    ensembl_ids = gencode_map.loc[symbol_found, 'Ensembl_ID'].unique()

    # Keep only the IDs that are actually present in the expression matrix.
    shared_ids = expr.index.intersection(ensembl_ids)
    return expr.loc[shared_ids]

In [65]:
# Expression rows of the genes in the 'Angiogenesis' signature.
pathway_scores(pathway_name='Angiogenesis')

Unnamed: 0_level_0,TCGA-ER-A199-06A,TCGA-EE-A3J5-06A,TCGA-EE-A2ME-06A,TCGA-BF-A5ES-01A,TCGA-D3-A8GE-06A,TCGA-EE-A2MG-06A,TCGA-EE-A2MR-06A,TCGA-DA-A1HY-06A,TCGA-FR-A44A-06A,TCGA-RP-A6K9-06A,...,TCGA-ER-A2NH-06A,TCGA-FS-A1ZS-06A,TCGA-HR-A2OH-06A,TCGA-ER-A19T-06A,TCGA-D3-A3C8-06A,TCGA-ER-A2NC-06A,TCGA-ER-A2NG-06A,TCGA-ER-A2NB-01A,TCGA-FS-A4F0-06A,TCGA-GN-A262-06A
Ensembl_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000091879,2.674732,0.958286,1.920179,0.208767,1.354339,2.15254,0.814837,2.379344,0.949946,0.84559,...,2.416299,0.988848,1.482693,1.886745,0.794353,3.299582,0.285698,0.337197,0.457227,1.184344
ENSG00000102755,1.782115,1.830702,2.48117,0.410558,3.143916,2.321784,1.596172,3.104504,1.420186,2.328003,...,2.705514,1.532716,1.540424,2.186184,1.346191,3.264897,1.315276,0.478609,1.012783,0.691534
ENSG00000110799,4.035131,3.107085,6.223399,2.555473,4.317413,5.412504,3.496437,2.940637,2.691958,5.253029,...,4.49446,3.584578,3.320773,4.77643,3.303664,5.066149,2.849639,2.752278,2.645056,1.746614
ENSG00000112715,1.738119,1.146003,1.378623,0.703721,2.316117,1.750178,1.716025,4.065779,1.81258,1.59321,...,3.172071,1.291191,2.16784,2.186849,1.776778,2.947517,1.465922,1.534858,2.301236,3.295414
ENSG00000119630,1.551295,3.541998,3.423713,0.982583,4.019097,3.426936,1.962327,3.574961,3.316913,2.054189,...,4.496335,0.972987,3.454571,1.678342,1.302816,3.624347,0.858936,3.239474,0.587941,5.214222
ENSG00000120156,1.12002,1.047887,2.771083,0.491699,2.506424,2.590458,1.384216,0.857583,0.425244,2.453913,...,1.416137,3.852928,0.370276,0.722991,1.12651,1.090989,0.626766,0.262674,0.547351,0.078063
ENSG00000128052,2.263966,2.471578,3.292369,0.926531,3.510481,3.345297,2.431543,2.627187,1.556748,3.435682,...,2.735002,1.694301,1.690641,2.274798,2.225491,3.422421,2.37089,0.5065,2.074677,1.00209
ENSG00000145431,1.000937,0.92873,1.754845,0.418999,1.473267,1.692918,3.27368,4.977317,1.192005,0.784839,...,1.615558,4.806968,0.618802,1.976364,0.958806,3.825938,0.853277,0.098285,0.478195,1.113167
ENSG00000150630,2.159306,1.35558,2.652555,0.96547,2.182692,2.941088,4.02618,2.140681,1.369103,2.660974,...,2.140615,3.722193,1.151534,1.033229,3.911883,2.524866,2.08658,0.864572,1.537942,0.454386
ENSG00000154188,0.171079,0.079566,0.822852,0.004753,0.398131,1.042434,0.289362,3.543211,0.752406,0.316841,...,1.172808,3.531045,0.113567,0.102322,1.457542,4.777157,0.096127,0.054918,2.345368,0.020058


# Pan-Cancer Atlas immune subtypes (C1–C6)