# Format I-PREDICT for annotation with Molecular Oncology Almanac
We'll read the excel file from the supplement directly and export formatted files to be annotated by the Molecular Oncology Almanac. Formatted files will be placed in the folder `formatted_variants/`. 

## Initial formatting
We read the variants, as is, from the Excel spreadsheet and prepare them for annotation with Molecular Oncology Almanac. We annotate the alteration type of point variants based on sequence variant nomenclature (https://varnomen.hgvs.org/). 

The variants will be formatted mostly in Python and exported to `variants-to-format.raw.txt`. That file is then copied and renamed to `variants-to-format.txt`, and the remainder of the formatting is completed manually in the copy. 

We focused on variants called from `Tissue NGS` in 2019-Sicklick. 

In [1]:
import os

import pandas as pd

# Location of the supplementary workbook, relative to this notebook.
handle = 'paper/2019-Sicklick-supplement.xlsx'

# The column labels sit on the sheet row with 0-based index 2 (header=2).
supplement_table = pd.read_excel(
    handle,
    sheet_name='Supplementary Table 2',
    header=2,
)

# Keep only the first 83 rows of the sheet.
# NOTE(review): presumably these are the per-patient rows and anything below
# is footnote text -- confirm against the workbook.
df = supplement_table.iloc[:83]

In [2]:
# Column labels of the two genome-wide markers in the supplement table.
tmb = 'Tumor Mutational Burden (TMB)'
msi = 'Microsatellite Status (MSI)'

# Tally rows per TMB category (rows with a missing value are not counted).
df[tmb].value_counts()

Low             33
Intermediate    17
High             5
Name: Tumor Mutational Burden (TMB), dtype: int64

In [3]:
# List the column labels read from the supplement sheet.
df.columns

Index(['Study ID', 'Tissue NGS', 'ctDNA NGS', 'Total Genomic Alterations',
       'PD-L1 (+) IHC ', 'Tumor Mutational Burden (TMB)',
       'Microsatellite Status (MSI)', 'Other Markers Targeted (IHC)',
       'Matched Drug(s) / Agent(s) Administered',
       'Drug Match to Genomic Alteration',
       'Drug(s) / Agent(s) Not Matched to Genomic Alteration for Patients with No Match',
       'References'],
      dtype='object')

In [4]:
# Preview the first two rows of the supplement table.
df.head(2)

Unnamed: 0,Study ID,Tissue NGS,ctDNA NGS,Total Genomic Alterations,PD-L1 (+) IHC,Tumor Mutational Burden (TMB),Microsatellite Status (MSI),Other Markers Targeted (IHC),Matched Drug(s) / Agent(s) Administered,Drug Match to Genomic Alteration,Drug(s) / Agent(s) Not Matched to Genomic Alteration for Patients with No Match,References
0,2,"HGF amplification, MET amplification",,2.0,Yes,Low,Stable,,crizotinib,crizotinib for HGF amplification and MET ampli...,,1
1,5,"CDK4 amplification, MDM2 amplification, WT1 tr...",,3.0,,Low,Stable,,palbociclib,palbociclib for CDK4 amplification,,2


In [5]:
# Build one row per reported alteration: split each 'Tissue NGS' cell on
# ', ', stack the pieces into long format, and keep the originating row label
# of `df` (column `main_index`) so each alteration can be traced back to its
# patient later on.
tmp = (
    df['Tissue NGS']
    .str.split(', ', expand=True)
    .stack(0)
    .reset_index()
    .drop('level_1', axis=1)
)
tmp.columns = ['main_index', 'feature_full']

# Drop a single leading space left behind when a comma was followed by two
# spaces. This is done with a vectorized string operation: indexing the first
# character of every value raises IndexError on an empty token (for example
# when a cell ends with a trailing ', '), whereas this leaves it untouched.
tmp['feature_full'] = tmp['feature_full'].str.replace(r'^ ', '', regex=True)

# The first whitespace-delimited token of the description is used as the
# feature name.
tmp['feature'] = tmp['feature_full'].str.split(' ').str[0]

In [6]:
def _label_matches(table, patterns, feature_type, alteration_type,
                   alteration_template=None, token=-1, lowercase=True):
    """Label the rows of `table` whose `feature_full` contains any pattern.

    Parameters
    ----------
    table : DataFrame with a string column `feature_full`; modified in place.
    patterns : list of str, matched as literal substrings (``regex=False``),
        so characters such as ``*`` need no escaping.
    feature_type, alteration_type : values written to the columns of the same
        name for every matching row.
    alteration_template : optional format string; when given, the `alteration`
        column of matching rows is set to the template filled with one
        space-delimited token of `feature_full`.
    token : position of that token (``-1`` = last, ``0`` = first).
    lowercase : match against the lowercased text when True, against the text
        as written when False.

    Returns
    -------
    The index of the matching rows.
    """
    text = table['feature_full']
    if lowercase:
        text = text.str.lower()

    mask = text.str.contains(patterns[0], regex=False)
    for pattern in patterns[1:]:
        mask = mask | text.str.contains(pattern, regex=False)
    matched = table[mask].index

    table.loc[matched, 'feature_type'] = feature_type
    table.loc[matched, 'alteration_type'] = alteration_type
    if alteration_template is not None:
        table.loc[matched, 'alteration'] = table.loc[matched, 'feature_full'].apply(
            lambda x: alteration_template.format(x.split(' ')[token])
        )
    return matched


# The rules run in this order and a later rule overwrites the labels written
# by an earlier one for any row that matches both.
# NOTE(review): these are plain substring tests on the whole description, so
# short patterns such as 'amp', 'del' and 'ins' also match descriptions whose
# gene symbol happens to contain those letters, and '*' matches any
# description containing an asterisk. Check the raw export during the manual
# formatting step.
amplification = _label_matches(
    tmp, ['amplification', 'duplication', 'amp'],
    'Copy Number', 'Amplification',
)

# 'loss' is matched case-sensitively, unlike the other rules.
deletion = _label_matches(
    tmp, ['loss'],
    'Copy Number', 'Deletion',
    lowercase=False,
)

# For fusions the alteration is the first token of the description.
fusion = _label_matches(
    tmp, ['rearrangement', 'fusion'],
    'Rearrangement', 'Fusion',
    alteration_template='{}', token=0,
)

# For the remaining rules the alteration is 'p.' + the last token.
var_del = _label_matches(
    tmp, ['del'],
    'Somatic Variant', 'In_Frame_Del',
    alteration_template='p.{}',
)

var_ins = _label_matches(
    tmp, ['ins'],
    'Somatic Variant', 'In_Frame_Ins',
    alteration_template='p.{}',
)

# A literal asterisk; matched with regex=False so no escape is required.
var_nonsense = _label_matches(
    tmp, ['*'],
    'Somatic Variant', 'Nonsense_Mutation',
    alteration_template='p.{}',
)

var_splice = _label_matches(
    tmp, ['splice site'],
    'Somatic Variant', 'Splice_Site',
    alteration_template='p.{}',
)

In [7]:
# Tally the automatically assigned alteration types; rows that matched no rule
# have no value in this column and are not counted.
tmp['alteration_type'].value_counts()

Amplification        160
Nonsense_Mutation     95
Deletion              19
Splice_Site            7
Fusion                 7
In_Frame_Del           3
In_Frame_Ins           2
Name: alteration_type, dtype: int64

In [8]:
# Export the labeled features as a tab-separated file without the pandas
# index; this raw file is the starting point for the manual formatting step.
tmp.to_csv('variants-to-format.raw.txt', sep='\t', index=False)

## Split files to be annotated with the Molecular Oncology Almanac
`variants-to-format.txt` will be read and split into one file per patient, written to the folder `formatted_variants/`.

In [9]:
# Translation from the alteration_type labels used while formatting to the
# shorter labels written to the per-patient files.
# NOTE(review): presumably the target labels are the vocabulary expected by
# Molecular Oncology Almanac -- confirm against its documentation.
classification_map = {
    'Missense_Mutation': 'Missense',
    'Nonsense_Mutation': 'Nonsense',
    'Nonstop_Mutation': 'Nonstop',
    'Splice_Site': 'Splice Site',
    'Frame_Shift_Ins': 'Frameshift',
    'Frame_Shift_Del': 'Frameshift',
    'In_Frame_Ins': 'Insertion',
    'In_Frame_Del': 'Deletion',
}

# Read the manually completed copy of the raw export.
variants = pd.read_csv('variants-to-format.txt', sep='\t')

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form acts on an intermediate object,
# triggers a warning in recent pandas and leaves `variants` unchanged when
# Copy-on-Write is enabled. Labels absent from the map are kept as they are.
variants['alteration_type'] = variants['alteration_type'].replace(classification_map)

In [10]:
# Tally rows per feature type in the manually formatted file.
variants['feature_type'].value_counts()

Somatic Variant             281
Copy Number                 195
Rearrangement                13
IHC                          12
Mutational Burden             5
Microsatellite Stability      1
Name: feature_type, dtype: int64

In [11]:
# Expose the row label of the supplement table as a `main_index` column, the
# key stored with every alteration when 'Tissue NGS' was split. The guard keeps
# this cell re-runnable: resetting the index a second time would add a second
# `main_index` column and break the merge below.
if 'main_index' not in df.columns:
    df = df.reset_index().rename(columns={'index': 'main_index'})

# Attach each alteration's Study ID via an explicit key, then expose it as
# `patient_id` and drop the helper key.
combined = (
    pd.merge(
        variants,
        df.loc[:, ['main_index', 'Study ID']],
        on='main_index',
        how='left',
    )
    .drop('main_index', axis=1)
    .rename(columns={'Study ID': 'patient_id'})
)

# Cast ids to strings so every id has the same type; the rows are then ordered
# by that string value.
combined['patient_id'] = combined['patient_id'].astype(str)
combined = combined.sort_values(['patient_id'])
combined.head()

Unnamed: 0,feature_full,feature,feature_type,alteration_type,alteration,patient_id
398,SMAD4 loss,SMAD4,Copy Number,Deletion,,101
75,CCND1 amplification,CCND1,Copy Number,Amplification,,101
120,CDKN2A/B loss,CDKN2A,Copy Number,Deletion,,101
121,CDKN2A/B loss,CDKN2B,Copy Number,Deletion,,101
6,AKT2 amplification,AKT2,Copy Number,Amplification,,101


In [12]:
# Create the output folder if it is missing; to_csv does not create parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs('formatted_variants', exist_ok=True)

# Write one tab-separated file per patient, named after the patient id. The
# free-text description column is dropped from the exported files.
for label, group in combined.groupby('patient_id'):
    outname = 'formatted_variants/{}.formatted_variants.txt'.format(label)
    group.drop('feature_full', axis=1).to_csv(outname, sep='\t', index=False)

In [13]:
# Display the combined table (the rendered output is truncated by pandas).
combined

Unnamed: 0,feature_full,feature,feature_type,alteration_type,alteration,patient_id
398,SMAD4 loss,SMAD4,Copy Number,Deletion,,101
75,CCND1 amplification,CCND1,Copy Number,Amplification,,101
120,CDKN2A/B loss,CDKN2A,Copy Number,Deletion,,101
121,CDKN2A/B loss,CDKN2B,Copy Number,Deletion,,101
6,AKT2 amplification,AKT2,Copy Number,Amplification,,101
...,...,...,...,...,...,...
271,KRAS G12D,KRAS,Somatic Variant,Missense,p.G12D,A045
135,CDKN2A/B loss,CDKN2B,Copy Number,Deletion,,A045
134,CDKN2A/B loss,CDKN2A,Copy Number,Deletion,,A045
472,TP53 V197G,TP53,Somatic Variant,Missense,p.V197G,A045
