# Generate cell line dictionary
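We create a dictionary of cell line records for use with the Molecular Oncology Almanac. The dictionary is keyed by the Broad / DepMap designated name of each cell line (the `broad` column of the summary table), and each record contains the keys listed below. Specifically, this dictionary is used for quick lookups when generating the report. It is saved and read with pickle.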

- `ccle_name`: string, the CCLE designated name of the cell line
- `broad_name`: string, the Broad / DepMap designated name of the cell line
- `sanger_name`: string, the Sanger Institute designated name of the cell line
- `tumor_type`: string, the tumor ontology from CCLE
- `events`: string, a sorted, comma-separated list of clinically relevant events labeled either as putative actionable or investigate actionability by MOAlmanac
- `all somatic variants`: string, a sorted, comma-separated list of all somatic variants observed in the cell line whose gene appears in either MOAlmanac, Cancer Hotspots, or CGC
- `all copy number alterations`: string, a sorted, comma-separated list of all copy number alterations observed in the cell line whose gene appears in either MOAlmanac, Cancer Hotspots, or CGC
- `all fusions`: string, a sorted, comma-separated list of all fusions observed in the cell line whose gene appears in either MOAlmanac, Cancer Hotspots, or CGC
- `sensitive to`: string, a sorted, comma-separated list of all therapy names that have a z-score of less than or equal to -2 for the cell line
- `zscores`: pandas dataframe, this is a subset of the `gdsc` file that contains model id, zscore, AUC, ln_ic50, and therapy name. 

In [1]:
import glob as glob
import pandas as pd
import pickle

In [2]:
# Annotated molecular feature tables, one tab-delimited file per data type.
variants = pd.read_csv('annotated/cell-lines.somatic-variants.annotated.txt', sep='\t')
cnas = pd.read_csv('annotated/cell-lines.copy-numbers.annotated.txt', sep='\t')
fusions = pd.read_csv('annotated/cell-lines.fusions.annotated.txt', sep='\t')

# Drug response table: make the numeric columns floats, then order rows from
# the lowest z-score to the highest.
gdsc = pd.read_csv('formatted/sanger.gdsc.txt', sep='\t')
gdsc = gdsc.astype({'ln_ic50': float, 'AUC': float, 'z_score': float})
gdsc = gdsc.sort_values('z_score', ascending=True)

# Cell line summary: map the textual flags 'True' / 'False' onto 1 / 0 and keep
# only the rows whose `use` flag equals 1.
summary = pd.read_csv('formatted/cell-lines.summary.txt', sep='\t')
summary['use'] = summary['use'].replace('True', 1).replace('False', 0)
summary = summary.loc[summary['use'] == 1]

# Notebook display: list the available columns and preview the first rows.
print(summary.columns)
summary.head()

Index(['broad', 'ccle_name', 'sanger', 'alt_ccle', 'alt_broad', 'fibroblast',
       'drift', 'somatic_variants', 'copy_number_alterations', 'fusions',
       'gdsc', 'n_dtypes', 'PRIMARY_SITE', 'TUMOR_TYPE', 'TYPE_REFINED',
       'sanger_cancer_type', 'sanger_cancer_type_detail',
       'n_therapies_sensitive', 'silent_var_counts', 'use_evaluate', 'use'],
      dtype='object')


Unnamed: 0,broad,ccle_name,sanger,alt_ccle,alt_broad,fibroblast,drift,somatic_variants,copy_number_alterations,fusions,...,n_dtypes,PRIMARY_SITE,TUMOR_TYPE,TYPE_REFINED,sanger_cancer_type,sanger_cancer_type_detail,n_therapies_sensitive,silent_var_counts,use_evaluate,use
0,ACH-000001,NIHOVCAR3_OVARY,SIDM00105,,,0,0,1,1,1,...,4,Ovary,ovary,ovary,Ovarian Carcinoma,High Grade Ovarian Serous Adenocarcinoma,3.0,81.0,True,True
6,ACH-000007,LS513_LARGE_INTESTINE,SIDM00677,,,0,0,1,1,1,...,4,Large_Intestine,colorectal,colorectal,Colorectal Carcinoma,Cecum Adenocarcinoma,1.0,113.0,True,True
7,ACH-000008,A101D_SKIN,SIDM00801,,,0,0,1,1,1,...,4,Skin,melanoma,melanoma,Melanoma,Melanoma,10.0,105.0,True,True
8,ACH-000009,C2BBE1_LARGE_INTESTINE,SIDM01233,,,0,0,1,1,1,...,4,Large_Intestine,colorectal,colorectal,Colorectal Carcinoma,Colon Adenocarcinoma,4.0,105.0,True,True
11,ACH-000012,HCC827_LUNG,SIDM01067,,,0,0,1,1,1,...,4,Lung,lung_NSC,lung_NSC,Non-Small Cell Lung Carcinoma,Lung Adenocarcinoma,11.0,56.0,True,True


In [3]:
def display_feature_variants(dataframe):
    """Build sorted display strings for the somatic variants in ``dataframe``.

    Each row contributes at most one string, chosen by the first flag below
    that equals 1:

    - ``feature_match_4``: ``"<feature> <alteration>"``
    - ``feature_match_3``: ``"<feature> <alteration_type>"``
    - ``feature_match_2``: ``"<feature> somatic variant"``

    Rows with none of these flags set to 1 are skipped.

    Args:
        dataframe: pandas DataFrame with the columns ``feature``,
            ``alteration``, ``alteration_type``, ``feature_match_2``,
            ``feature_match_3`` and ``feature_match_4``. It is not modified.

    Returns:
        list of str: the display strings in ascending order; empty when no
        row qualifies.
    """
    strings = []
    # Iterate rows directly instead of resetting the index in place: callers
    # usually pass a filtered slice, and mutating it would change their frame.
    for _, series in dataframe.iterrows():
        if series['feature_match_4'] == 1:
            strings.append(' '.join([series['feature'], series['alteration']]))
        elif series['feature_match_3'] == 1:
            strings.append(' '.join([series['feature'], series['alteration_type']]))
        elif series['feature_match_2'] == 1:
            strings.append(' '.join([series['feature'], 'somatic variant']))
    return sorted(strings)

def display_feature_variants_all(dataframe):
    """Return sorted ``"<feature> <alteration>"`` strings for every row.

    Rows for which the combined string is missing (for example because
    ``alteration`` is null) are dropped.

    Args:
        dataframe: pandas DataFrame with ``feature`` and ``alteration``
            columns. It is not modified: no index reset and no helper column
            is written back to it.

    Returns:
        list of str: the display strings in ascending order.
    """
    # Build the label as a standalone Series rather than a new 'display'
    # column, so the caller's frame keeps its original columns and index.
    display = dataframe['feature'] + ' ' + dataframe['alteration']
    return display.dropna().sort_values().tolist()


def display_feature_cn(dataframe):
    """Return sorted ``"<feature> <alteration_type>"`` strings for every row.

    Rows for which the combined string is missing are dropped.

    Args:
        dataframe: pandas DataFrame with ``feature`` and ``alteration_type``
            columns. It is not modified.

    Returns:
        list of str: the display strings in ascending order.
    """
    # No in-place index reset: the label Series does not depend on the index,
    # and callers pass filtered slices that should stay untouched.
    display = dataframe['feature'] + ' ' + dataframe['alteration_type']
    return display.sort_values().dropna().tolist()

def display_feature_fusion(dataframe):
    """Build sorted display strings for the fusions in ``dataframe``.

    A row whose ``feature_match_4`` flag is 1 is rendered as
    ``"<feature>--<partner>"``. Otherwise, a row whose ``feature_match_2``
    flag is 1 is rendered as ``"<which_match> Fusion"``. All other rows are
    skipped.

    Args:
        dataframe: pandas DataFrame with the columns ``feature``, ``partner``,
            ``which_match``, ``feature_match_2`` and ``feature_match_4``.

    Returns:
        list of str: the display strings in ascending order.
    """
    labels = []
    for row_label in dataframe.index:
        row = dataframe.loc[row_label, :]
        if int(row.loc['feature_match_4']) == 1:
            gene = row.loc['feature']
            partner = row.loc['partner']
            labels.append(gene + '--' + partner)
            continue
        if int(row.loc['feature_match_2']) == 1:
            matched_gene = row.loc['which_match']
            labels.append(matched_gene + ' Fusion')
    labels.sort()
    return labels

def display_feature_fusion_all(dataframe):
    """Return sorted ``"<feature>--<partner>"`` strings for every row.

    Rows for which the combined string is missing (for example because
    ``partner`` is null) are dropped.

    Args:
        dataframe: pandas DataFrame with ``feature`` and ``partner`` columns.
            It is not modified: no helper column is written back to it.

    Returns:
        list of str: the display strings in ascending order.
    """
    # Keep the label in a standalone Series instead of adding a 'display'
    # column to the caller's frame.
    display = dataframe['feature'] + '--' + dataframe['partner']
    return display.sort_values().dropna().tolist()

def subset_dataframe(dataframe):
    """Keep the rows flagged by at least one of three indicator columns.

    A row is kept when ``feature_match_1``, ``cancerhotspots_bin`` or
    ``cgc_bin`` equals 1 after conversion to int.

    Args:
        dataframe: pandas DataFrame containing the three indicator columns.

    Returns:
        pandas DataFrame: the selected rows with a fresh 0..n-1 index.
    """
    flag_columns = ['feature_match_1', 'cancerhotspots_bin', 'cgc_bin']
    flagged = dataframe[flag_columns].astype(int).eq(1).any(axis=1)
    return dataframe.loc[flagged].reset_index(drop=True)

In [4]:
# Lookup table of per-cell-line records, filled in by the loop below.
dictionary = {}

# Cast every feature_match_* flag to int in all three feature tables. The
# column names are discovered on `variants` and applied to `cnas` and
# `fusions` as well, so all three tables must carry the same flag columns.
for column in variants.columns[variants.columns.str.contains('feature_match')]:
    for table in (variants, cnas, fusions):
        table[column] = table[column].astype(int)

for label, group in summary.groupby('ccle_name'):
    # Identifiers are read from the first summary row of the group.
    series = group.iloc[0]
    ccle_name = series['ccle_name']
    broad = series['broad']
    sanger = series['sanger']
    tumor_type = series['TUMOR_TYPE']

    # Somatic variants and copy number alterations are selected by CCLE name.
    label_variants = variants.loc[variants['model_id'] == ccle_name].reset_index(drop=True)
    strings_variants = display_feature_variants(label_variants.loc[label_variants['feature_match_2'] == 1])
    strings_variants_all = display_feature_variants_all(subset_dataframe(label_variants))

    label_cns = cnas.loc[cnas['model_id'] == ccle_name].reset_index(drop=True)
    strings_cns = display_feature_cn(label_cns.loc[label_cns['feature_match_2'] == 1])
    strings_cns_all = display_feature_cn(subset_dataframe(label_cns))

    # Fusions and drug response are selected by the Sanger identifier instead.
    label_fusions = fusions.loc[fusions['model_id'] == sanger].reset_index(drop=True)
    strings_fusions = display_feature_fusion(label_fusions.loc[label_fusions['feature_match_2'] == 1])
    strings_fusions_all = display_feature_fusion_all(subset_dataframe(label_fusions))

    # Events from all three data types, merged into one sorted list.
    clinically_relevant_events = sorted(strings_variants + strings_cns + strings_fusions)

    label_gdsc = gdsc.loc[gdsc['model_id'] == sanger].reset_index(drop=True)
    label_gdsc['ccle_name'] = label
    # Therapies whose z-score for this cell line is at most -2.
    sensitive_to = label_gdsc.loc[label_gdsc['z_score'] <= -2, 'therapy_name'].sort_values().tolist()

    # List-valued fields are stored as comma-separated strings; `zscores`
    # keeps the per-cell-line drug response table itself.
    subdictionary = {
        'ccle_name': label,
        'broad_name': broad,
        'sanger_name': sanger,
        'tumor_type': tumor_type,
        'events': ', '.join(clinically_relevant_events),
        'all somatic variants': ', '.join(strings_variants_all),
        'all copy number alterations': ', '.join(strings_cns_all),
        'all fusions': ', '.join(strings_fusions_all),
        'sensitive to': ', '.join(sensitive_to),
        'zscores': label_gdsc,
    }

    # Records are keyed by the `broad` identifier, not by the CCLE name used
    # for grouping.
    dictionary[broad] = subdictionary

In [5]:
import pickle

# Serialize the lookup dictionary to disk. The context manager closes the file
# handle even if pickling raises, which the manual open()/close() pair did not.
with open('cell-lines.pkl', 'wb') as output:
    pickle.dump(dictionary, output)