# Format molecular features of cell lines for use by Molecular Oncology Almanac
We format somatic variants, copy number alterations, and fusions for use by the Molecular Oncology Almanac to test for preclinical efficacy and for matchmaking. For each data type, we will create a table that contains at most five columns:
- `model_id`: The cell line sample name
- `feature_type`: The feature type (e.g. Somatic Variant, Copy Number, Rearrangement)
- `feature`: The gene of interest
- `alteration_type`: Variant classification or copy number direction
- `alteration`: Protein change or fusion partner

Additionally, we will import GDSC sensitivity data to create an "about" table that contains the following for each cell line studied:
- booleans for each data type 
- additional names of the cell line 
- cell line ontology

We use cell lines that meet the following criteria:
- map to a CCLE name
- have all four data types available
- are not a blood cancer
- are not subject to genetic drift
- are not contaminated by fibroblasts
- for evaluation, they must be sensitive to at least one therapy and there must be at least one other cell line that shares therapeutic sensitivity

In all, we are left with 377 cell lines for evaluation and 452 for production.

In [1]:
import pandas as pd

# Model annotation tables: the GDSC model list and the CCLE 2019 clinical sample sheet.
gdsc_model_info = pd.read_csv('../source/gdsc/model_list_20200204.csv')
ccle_model_info = pd.read_csv('../source/ccle-2019/data_clinical_sample.txt', sep='\t', comment='#')

# old_CCLE_ID -> new_CCLE_ID, built from the first 45 rows of the
# 'Cell line name changes' sheet.
# NOTE(review): presumably those 45 rows are the fibroblast-related renames -- confirm against the sheet.
name_changes = pd.read_excel('../source/ccle-2019/41586_2019_1186_MOESM4_ESM.xlsx',
                             sheet_name='Cell line name changes')
fibroblast_maps = (name_changes
                   .iloc[:45, :]
                   .set_index('old_CCLE_ID')['new_CCLE_ID']
                   .to_dict())

summary = pd.read_csv('cell-line-names.formatted.txt', sep='\t')

# A cell line is flagged when either of its CCLE names appears on either side
# (old or new name) of the rename map.
fibroblast_names = list(fibroblast_maps.keys()) + list(fibroblast_maps.values())
is_fibroblast = (summary['ccle_name'].isin(fibroblast_names) |
                 summary['alt_ccle'].isin(fibroblast_names))
summary['fibroblast'] = is_fibroblast.astype(int)

# Samples whose CCLE-vs-GDSC somatic correlation is below 0.75 are flagged as
# drifted; a missing correlation is filled with 0 and therefore also flagged.
drift = pd.read_excel('../source/ccle-2019/41586_2019_1186_MOESM6_ESM.xlsx', sheet_name='CCLE_GDSC_comparison')
somatic_correlation = drift['r_somatic_CCLE_HC_vs_GDSC_WES'].fillna(0)
drift_samples = drift.loc[somatic_correlation.lt(0.75), 'CCLE_ID']
is_drifted = summary['ccle_name'].isin(drift_samples) | summary['alt_ccle'].isin(drift_samples)
summary['drift'] = is_drifted.astype(int)

# alt_ccle -> ccle_name, for rows that have an alternative CCLE name.
ccle_alt_map = (summary
                .dropna(subset=['alt_ccle'])
                .set_index('alt_ccle')['ccle_name']
                .to_dict())

In [2]:
# Tally of cell lines by fibroblast flag (1 = flagged, 0 = not flagged).
summary.loc[:, 'fibroblast'].value_counts()

0    1853
1      45
Name: fibroblast, dtype: int64

In [3]:
# Tally of cell lines by drift flag (1 = flagged, 0 = not flagged).
summary.loc[:, 'drift'].value_counts()

0    1833
1      65
Name: drift, dtype: int64

## Somatic Variants

In [4]:
# Build the somatic-variant table (feature, alteration_type, alteration, model_id)
# from the CCLE mutation file and write it to cell-lines.somatic-variants.txt.
# Also leaves `somatic_samples` (all sample names in the file) and `silent`
# (rows classified as 'Silent') in the namespace.
handle = '../source/ccle-2019/data_mutations_extended.txt'
df = pd.read_csv(handle, sep='\t', comment='#', low_memory=False)

# Harmonise sample names. The result is assigned back to the column instead of
# calling `replace(..., inplace=True)` on a column selection: that chained
# in-place form warns on pandas 2.x and leaves `df` unmodified under
# Copy-on-Write.
df['Tumor_Sample_Barcode'] = (df['Tumor_Sample_Barcode']
                              .replace(fibroblast_maps)
                              .replace(ccle_alt_map))

# Sample list is taken before any variant filtering, so samples that only
# carry filtered-out variants are still included.
somatic_samples = df['Tumor_Sample_Barcode'].drop_duplicates().sort_values().tolist()

# Drop rows whose chromosome label is exactly 'M'.
df = df[~df['Chromosome'].eq('M')]

# Variant classifications that are kept, and the label each is written out as.
classification_map = {
    'Missense_Mutation': 'Missense',
    'Frame_Shift_Del': 'Frameshift',
    'Frame_Shift_Ins': 'Frameshift',
    'Nonsense_Mutation': 'Nonsense',
    'Splice_Site': 'Splice Site',
    'In_Frame_Ins': 'Insertion',
    'In_Frame_Del': 'Deletion',
    'Nonstop_Mutation': 'Nonstop'
}

idx_coding = df['Variant_Classification'].isin(list(classification_map.keys()))

# `rename` returns a new frame, so `silent` is never a view that is mutated in place.
silent = (df[df['Variant_Classification'].eq('Silent')]
          .rename(columns={'Tumor_Sample_Barcode': 'model_id'}))

# Explicit copy so the column assignment below does not write into a slice of
# the unfiltered frame (SettingWithCopyWarning).
df = df[idx_coding].copy()
df['Variant_Classification'] = df['Variant_Classification'].replace(classification_map)

somatic = (df
           .loc[:, ['Tumor_Sample_Barcode', 'Hugo_Symbol', 'Variant_Classification', 'Protein_Change']]
           .rename(columns={'Hugo_Symbol': 'feature',
                            'Variant_Classification': 'alteration_type',
                            'Protein_Change': 'alteration',
                            'Tumor_Sample_Barcode': 'model_id'})
           .loc[:, ['feature', 'alteration_type', 'alteration', 'model_id']]
           .sort_values(['model_id', 'feature', 'alteration_type', 'alteration'])
           .drop_duplicates()
           .reset_index(drop=True)
          )

somatic.to_csv('cell-lines.somatic-variants.txt', sep='\t', index=False)
somatic.head()

Unnamed: 0,feature,alteration_type,alteration,model_id
0,ABCA10,Missense,p.S609L,127399_SOFT_TISSUE
1,ABCA2,Splice Site,,127399_SOFT_TISSUE
2,ABCA5,Missense,p.E245K,127399_SOFT_TISSUE
3,ABCA9,Missense,p.M259I,127399_SOFT_TISSUE
4,ABCC10,Frameshift,p.G1345fs,127399_SOFT_TISSUE


In [5]:
# 1 when either CCLE name of the cell line appears among the mutation-file samples.
has_somatic = (summary['ccle_name'].isin(somatic_samples) |
               summary['alt_ccle'].isin(somatic_samples))
summary['somatic_variants'] = has_somatic.astype(int)
summary.loc[:, 'somatic_variants'].value_counts()

1    1570
0     328
Name: somatic_variants, dtype: int64

In [6]:
# Distribution of the number of retained variant rows per cell line.
variants_per_model = somatic['model_id'].value_counts()
variants_per_model.describe()

count     1570.000000
mean       509.947134
std        996.413210
min         14.000000
25%        184.000000
50%        265.000000
75%        416.000000
max      18962.000000
Name: model_id, dtype: float64

In [7]:
# Variant rows per sample for a few well-known genes.
# NOTE: the numerator counts variant rows, not distinct cell lines, so a cell
# line with several variants in one gene contributes more than once.
n_somatic_samples = len(somatic_samples)
for gene in ('EGFR', 'CDKN2A', 'BRAF'):
    n_gene_rows = int(somatic['feature'].eq(gene).sum())
    print(n_gene_rows / n_somatic_samples)

0.08726114649681528
0.1
0.13184713375796178


## Copy Number

In [8]:
# Build the copy-number table (feature, alteration_type, alteration, model_id)
# from the CCLE gene-by-sample CNA matrix and write it to
# cell-lines.copy-numbers.txt. Also leaves `copy_number_samples` (all sample
# columns of the matrix) in the namespace.
handle = '../source/ccle-2019/data_CNA.txt'
df = pd.read_csv(handle, sep='\t', low_memory=False)
df = (df
      .set_index('Hugo_Symbol')
      .rename(columns=fibroblast_maps)
      .rename(columns=ccle_alt_map)
     )

# Every remaining column is treated as a sample.
copy_number_samples = df.columns.tolist()

# Wide (gene x sample) -> long (gene, sample, value); the value column is named 0.
df = df.stack().to_frame().reset_index()
df[0] = df[0].astype(float)

# Keep only calls with |value| >= 2. The explicit copy avoids assigning into a
# slice of the unfiltered frame below (SettingWithCopyWarning).
df = df[df[0].abs().ge(2)].copy()

# Only the +/-2 levels can reach this point, so the mapping lists just those
# two; entries for +/-1 would be unreachable after the filter above.
df['mapped'] = df[0].replace({2.0: 'Amplification',
                              -2.0: 'Deletion'})
df = df.rename(columns =
          {'Hugo_Symbol': 'feature',
           'level_1': 'model_id',
           'mapped': 'alteration_type'
          })
# Copy-number events carry no alteration detail; the column is kept (empty)
# so the table has the same columns as the somatic-variant table.
df['alteration'] = ''

copy_number = (df
               .loc[:, ['feature', 'alteration_type', 'alteration', 'model_id']]
               .sort_values(['model_id', 'feature', 'alteration_type', 'alteration'])
               .drop_duplicates()
               .reset_index(drop=True)
     )

copy_number.to_csv('cell-lines.copy-numbers.txt', sep='\t', index=False)

In [9]:
# 1 when either CCLE name of the cell line is a column of the CNA matrix.
has_copy_number = (summary['ccle_name'].isin(copy_number_samples) |
                   summary['alt_ccle'].isin(copy_number_samples))
summary['copy_number_alterations'] = has_copy_number.astype(int)
summary.loc[:, 'copy_number_alterations'].value_counts()

1    1030
0     868
Name: copy_number_alterations, dtype: int64

In [10]:
# Copy-number rows per sample for a few well-known genes.
n_copy_number_samples = len(copy_number_samples)
for gene in ('EGFR', 'CDKN2A', 'BRAF'):
    n_gene_rows = int(copy_number['feature'].eq(gene).sum())
    print(n_gene_rows / n_copy_number_samples)

0.1378640776699029
0.43398058252427185
0.0970873786407767


In [11]:
# Distribution of the number of retained copy-number rows per cell line.
events_per_model = copy_number['model_id'].value_counts()
events_per_model.describe()

count    1030.000000
mean     1592.000000
std      1211.231688
min        99.000000
25%       664.000000
50%      1309.500000
75%      2301.000000
max      7463.000000
Name: model_id, dtype: float64

In [12]:
# Number of rows per copy-number direction.
copy_number.loc[:, 'alteration_type'].value_counts()

Deletion         948011
Amplification    691749
Name: alteration_type, dtype: int64

## Fusion

In [13]:
# Build the fusion table (feature = 3' gene, partner = 5' gene, model_id) from
# the GDSC fusion calls and write it to cell-lines.fusions.txt.
handle = '../source/gdsc/fusions_20191101.csv'
df = pd.read_csv(handle)

# All model ids present in the fusion file, sorted.
fusion_samples = df['model_id'].drop_duplicates().sort_values().tolist()

gene_columns = {'gene_symbol_3prime': 'feature', 'gene_symbol_5prime': 'partner'}
fusion_key = ['model_id', 'feature', 'partner']

df = df.rename(columns=gene_columns)
df = df.loc[:, ['feature', 'partner', 'model_id']].drop_duplicates()

fusions = df.sort_values(fusion_key)
fusions = fusions.drop_duplicates(fusion_key).reset_index(drop=True)

fusions.to_csv('cell-lines.fusions.txt', sep='\t', index=False)
fusions.head()

Unnamed: 0,feature,partner,model_id
0,EIF5A,HMGN2,SIDM00003
1,MYMK,RRBP1,SIDM00003
2,PRAME,MAPK1,SIDM00003
3,SHISA8,LARGE1,SIDM00003
4,SORD,FAM91A1,SIDM00003


In [14]:
# 1 when the cell line's Sanger model id appears in the fusion file.
has_fusions = summary['sanger'].isin(fusion_samples)
summary['fusions'] = has_fusions.astype(int)
summary.loc[:, 'fusions'].value_counts()

0    974
1    924
Name: fusions, dtype: int64

In [15]:
# Number of distinct model ids in the fusion file.
n_fusion_samples = len(fusion_samples)
n_fusion_samples

924

## Therapeutic sensitivity

In [16]:
# Combine the GDSC1 and GDSC2 fitted dose-response tables into one table with
# a single row per (model, therapy) and write it to sanger.gdsc.txt.
gdsc1 = pd.read_excel('../source/gdsc/GDSC1_fitted_dose_response_25Feb20.xlsx')
gdsc2 = pd.read_excel('../source/gdsc/GDSC2_fitted_dose_response_25Feb20.xlsx')

# GDSC1 rows are kept only for drugs that do not appear in GDSC2.
in_gdsc2 = gdsc1['DRUG_NAME'].isin(gdsc2['DRUG_NAME'])
gdsc = pd.concat([gdsc1[~in_gdsc2], gdsc2], ignore_index=True)

# Sorted, de-duplicated Sanger model ids with any dose-response record (kept as a Series).
gdsc_samples = gdsc['SANGER_MODEL_ID'].drop_duplicates().sort_values()

kept_columns = ['DATASET', 'SANGER_MODEL_ID', 'DRUG_NAME', 'LN_IC50', 'AUC', 'Z_SCORE']
renamed_columns = {
    'SANGER_MODEL_ID': 'model_id',
    'DRUG_NAME': 'therapy_name',
    'LN_IC50': 'ln_ic50',
    'Z_SCORE': 'z_score',
}
gdsc = gdsc.loc[:, kept_columns].rename(columns=renamed_columns)

# When a (model, therapy) pair has several records, keep the one with the lowest z-score.
gdsc = gdsc.sort_values('z_score', ascending=True)
gdsc = gdsc.drop_duplicates(['model_id', 'therapy_name'], keep='first')

gdsc.to_csv('sanger.gdsc.txt', sep='\t', index=False)
gdsc.head()

Unnamed: 0,DATASET,model_id,therapy_name,ln_ic50,AUC,z_score
36547,GDSC1,SIDM01057,GW-2580,-1.850518,0.211278,-10.269077
196675,GDSC1,SIDM00486,AZD6094,-6.142786,0.053566,-9.829463
162790,GDSC1,SIDM00277,Motesanib,-6.043722,0.039769,-9.375108
197469,GDSC1,SIDM00538,AZD6094,-5.539672,0.098723,-9.190171
78848,GDSC1,SIDM00437,Linifanib,-6.533443,0.061135,-9.158231


In [17]:
# 1 when the cell line's Sanger model id has at least one dose-response record.
has_gdsc = summary['sanger'].isin(gdsc_samples)
summary['gdsc'] = has_gdsc.astype(int)
summary.loc[:, 'gdsc'].value_counts()

1    988
0    910
Name: gdsc, dtype: int64

In [18]:
# Number of distinct Sanger model ids with dose-response data.
n_gdsc_samples = len(gdsc_samples)
n_gdsc_samples

988

## Summary

In [20]:
# Assemble the per-cell-line "about" table: data-type availability, CCLE and
# Sanger annotations, number of therapies the line is sensitive to, silent
# variant counts, and the `use` / `use_evaluate` inclusion flags. The result is
# written to cell-lines.summary.txt.

# A therapy counts towards `n_therapies_sensitive` when its z-score is at or
# below this cutoff.
sensitivity_z_cutoff = -2

output = summary.copy(deep=True).reset_index(drop=True)
output['n_dtypes'] = output.loc[:, ['somatic_variants', 'copy_number_alterations', 'fusions', 'gdsc']].sum(axis=1)
output = output.merge(ccle_model_info
                      .rename(columns={'SAMPLE_ID': 'ccle_name'})
                      .loc[:, ['ccle_name', 'PRIMARY_SITE', 'TUMOR_TYPE', 'TYPE_REFINED']],
                      on='ccle_name', how='left')

output = output.merge(gdsc_model_info
                      .rename(columns={'model_id': 'sanger',
                                       'cancer_type': 'sanger_cancer_type',
                                       'cancer_type_detail': 'sanger_cancer_type_detail'})
                      .loc[:, ['sanger', 'sanger_cancer_type', 'sanger_cancer_type_detail']],
                      on='sanger', how='left')

# Therapies per Sanger model at or below the cutoff. Every model id present in
# `output` gets a value (0 when nothing passes); rows without a Sanger id stay
# missing after the left merge. The id list is de-duplicated so the merge can
# never multiply rows.
sanger_ids = output['sanger'].dropna().unique()
n_sensitive = (gdsc[gdsc['z_score'].le(sensitivity_z_cutoff)]
               .groupby('model_id')
               .size()
               .reindex(sanger_ids, fill_value=0)
               .rename_axis('sanger')
               .rename('n_therapies_sensitive')
               .reset_index()
              )
output = output.merge(n_sensitive, on='sanger', how='left')

# Silent variants per cell line. The axis and the values are named explicitly
# because the column names produced by `value_counts().reset_index()` differ
# between pandas 1.x and 2.x.
silent_counts = (silent['model_id']
                 .value_counts()
                 .rename_axis('ccle_name')
                 .rename('silent_var_counts')
                 .reset_index()
                )
output = output.merge(silent_counts, on='ccle_name', how='left')

# Blood-cancer mask. `na=False` makes a missing annotation a plain non-match;
# without it `str.contains` yields NaN, and an `|` whose left operand is NaN
# evaluates to False, which can hide a match found in the other column.
idx_blood = (output['PRIMARY_SITE'].str.contains('Haematopoietic', na=False) |
             output['ccle_name'].str.contains('HAEMATOPOIETIC', na=False) |
             output['sanger_cancer_type'].str.contains('Leukemia', na=False) |
             output['sanger_cancer_type'].str.contains('Other Blood Carcinomas', na=False) |
             output['sanger_cancer_type'].str.contains('Myeloma', na=False)
            )
idx_drift = output['drift'].fillna(0).eq(1)
idx_fibroblast = output['fibroblast'].fillna(0).eq(1)
idx_reject = (idx_blood | idx_fibroblast | idx_drift)
idx_sensitive_one_therapy = output['n_therapies_sensitive'].fillna(0).ge(1)
idx_4_dtypes = output['n_dtypes'].eq(4)

# `use`: all four data types and not rejected.
# `use_evaluate`: additionally sensitive to at least one therapy.
output['use_evaluate'] = idx_sensitive_one_therapy & idx_4_dtypes & ~idx_reject
output['use'] = idx_4_dtypes & ~idx_reject

for column in ['use_evaluate', 'use']:
    print(column)
    print(output[column].value_counts())
    print('')

output.to_csv('cell-lines.summary.txt', sep='\t', index=False)

use_evaluate
False    1519
True      379
Name: use_evaluate, dtype: int64

use
False    1446
True      452
Name: use, dtype: int64



In [21]:
# Sanity check: every cell line marked for use should have a CCLE name.
used_ccle_names = output.loc[output['use'].eq(1), 'ccle_name']
used_ccle_names.isna().value_counts()

False    452
Name: ccle_name, dtype: int64

In [22]:
# Preview the first rows of the per-cell-line flag table.
summary.head(5)

Unnamed: 0,broad,ccle_name,sanger,alt_ccle,alt_broad,fibroblast,drift,somatic_variants,copy_number_alterations,fusions,gdsc
0,ACH-000001,NIHOVCAR3_OVARY,SIDM00105,,,0,0,1,1,1,1
1,ACH-000002,HL60_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00829,,,0,0,1,1,1,1
2,ACH-000003,CACO2_LARGE_INTESTINE,SIDM00891,,,0,0,1,0,0,0
3,ACH-000004,HEL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00594,,,0,0,1,1,1,1
4,ACH-000005,HEL9217_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,SIDM00593,,,0,0,1,1,0,0
