# Create a mapping between therapies catalogued in the Molecular Oncology Almanac and those profiled in GDSC
We read the fitted dose response curves from GDSC1 and GDSC2 and create a dictionary keyed by almanac therapy name, with each entry holding the therapy type and a list of matching GDSC therapies.

In summary,
- The Molecular Oncology Almanac contains 137 therapies. 
- GDSC2 tested 192 therapies and GDSC1 tested 345, 257 of which do not appear in GDSC2.
- We mapped almanac therapies to GDSC, with GDSC2 measurements taking priority. 
- In all, 59 therapies were mapped to GDSC therapies

In [1]:
import glob
import pandas as pd
import json

# Unique, sorted drug names screened in GDSC1. Only DRUG_NAME is used below,
# so only that column is parsed from the workbook.
gdsc1 = pd.read_excel(
    'source/gdsc/GDSC1_fitted_dose_response_25Feb20.xlsx',
    usecols=['DRUG_NAME'],
)
gdsc1 = gdsc1['DRUG_NAME'].drop_duplicates().sort_values()

# Unique, sorted drug names screened in GDSC2.
gdsc2 = pd.read_excel(
    'source/gdsc/GDSC2_fitted_dose_response_25Feb20.xlsx',
    usecols=['DRUG_NAME'],
)
gdsc2 = gdsc2['DRUG_NAME'].drop_duplicates().sort_values()

# GDSC2 takes priority: keep only the GDSC1 drugs that GDSC2 did not test.
gdsc1 = gdsc1[~gdsc1.isin(gdsc2)]

# NOTE(review): absolute, machine-specific path; adjust it to the local
# checkout of moalmanac-db before running.
handles = glob.glob('/Users/brendan/Github/moalmanac-paper/analyses/knowledge-bases/moalmanac-db/content/*')

columns = ['therapy_name', 'therapy_type']

# Stack the therapy columns from every almanac content file into one table.
almanac = pd.concat(
    [pd.read_csv(handle, sep='\t')[columns] for handle in handles],
    ignore_index=True,
)

# Drop rows without a therapy, order by name, and keep one row per distinct
# (therapy_name, therapy_type) pair.
almanac = almanac.dropna(subset=['therapy_name']).sort_values('therapy_name')
almanac = almanac.drop_duplicates()
almanac.shape

(137, 2)

In [2]:
# Preview the de-duplicated almanac therapy table (therapy_name, therapy_type).
almanac

Unnamed: 0,therapy_name,therapy_type
198,5-Fluorouracil,Chemotherapy
514,AMG 510,Targeted therapy
405,AZD3759,Targeted therapy
784,AZD8186,Targeted therapy
259,Abiraterone,Hormone therapy
...,...,...
685,VX-680,Targeted therapy
180,Vandetanib,Targeted therapy
206,Veliparib,Targeted therapy
723,Vemurafenib,Targeted therapy


In [3]:
# One entry per almanac therapy: its therapy type plus an initially empty list
# of matching GDSC drug names, filled in by the mapping cells below.
dictionary = {
    name: {'type': kind, 'gdsc': []}
    for name, kind in zip(almanac['therapy_name'], almanac['therapy_type'])
}

## Map GDSC1
Earlier in the notebook we subset GDSC1 for therapies that do not appear in GDSC2. We map any almanac therapies to these drugs.

In [4]:
# Almanac therapies whose names exactly match a GDSC1-only drug name.
almanac.loc[almanac['therapy_name'].isin(gdsc1), 'therapy_name'].sort_values()

137       Alectinib
312       Bleomycin
246       Bosutinib
181    Cabozantinib
507       Cetuximab
131        Imatinib
693    Lenalidomide
310      Omipalisib
44        Pazopanib
156       Ponatinib
454     Quizartinib
336       Rucaparib
539         SU11274
790       Sunitinib
206       Veliparib
Name: therapy_name, dtype: object

In [5]:
# Exact-name matches against the GDSC1-only drugs map to themselves.
gdsc1_matches = almanac.loc[almanac['therapy_name'].isin(gdsc1), 'therapy_name']
for therapy in sorted(gdsc1_matches):
    dictionary[therapy]['gdsc'] = [therapy]

## Map GDSC2
Likewise, we map any almanac therapies to GDSC2 therapies

In [6]:
# Almanac therapies whose names exactly match a GDSC2 drug name.
almanac.loc[almanac['therapy_name'].isin(gdsc2), 'therapy_name'].sort_values()

198    5-Fluorouracil
405           AZD3759
784           AZD8186
417          Afatinib
587         Alpelisib
161        Bortezomib
429         Cisplatin
145        Crizotinib
303        Dabrafenib
226         Dasatinib
394         Erlotinib
395         Gefitinib
146               JQ1
743         Lapatinib
250           MK-2206
219         Nilotinib
13          Niraparib
15           Olaparib
411       Osimertinib
149       Palbociclib
600        Pictilisib
474       Ruxolitinib
573       Selumetinib
189         Sorafenib
22        Talazoparib
781         Tamoxifen
536        Trametinib
Name: therapy_name, dtype: object

In [7]:
# Sorted, unique DRUG_NAME values read from the GDSC2 workbook.
gdsc2

19581     5-Fluorouracil
105110            ABT737
107352          AGI-5198
70082           AGI-6780
126888           AMG-319
               ...      
64214            Wnt-C59
38410             XAV939
35075           YK-4-279
14067           ZM447439
84651        Zoledronate
Name: DRUG_NAME, Length: 192, dtype: object

In [8]:
# Exact-name matches against GDSC2 map to themselves.
gdsc2_matches = almanac.loc[almanac['therapy_name'].isin(gdsc2), 'therapy_name']
for therapy in sorted(gdsc2_matches):
    dictionary[therapy]['gdsc'] = [therapy]

# Hand-assigned matches for almanac therapies whose names differ from the
# GDSC drug name they are mapped to.
manual_matches = {
    'AMG 510': 'KRAS (G12C) Inhibitor-12',
    'nutlin-3': 'Nutlin-3a (-)',
    'Vemurafenib': 'PLX-4720',
}
for almanac_name, gdsc_name in manual_matches.items():
    dictionary[almanac_name]['gdsc'] = [gdsc_name]

## Combination therapies
Some therapies catalogued in the Molecular Oncology Almanac are cocktails; we catalogue them here.

In [9]:
# Combination therapies are written as "A + B"; match the plus sign literally
# (regex=False) instead of relying on an escaped regular expression.
almanac['therapy_name'][almanac['therapy_name'].str.contains('+', regex=False)]

188                           Azacitidine + Panobinostat
346                               Bevacizumab + Olaparib
506                              Buparlisib + Trametinib
751               Capecitabine + Trastuzumab + Tucatinib
767            Carbogen and nicotinamide  + radiotherapy
298                              Cetuximab + Encorafenib
331                              Cetuximab + Vemurafenib
754    Chemotherapy + Hyaluronidase-zzxf + Pertuzumab...
730                         Chemotherapy + Pembrolizumab
750                           Chemotherapy + Trastuzumab
304                            Cobimetinib + Vemurafenib
293                              Dabrafenib + Trametinib
755    Docetaxel + Hyaluronidase-zzxf + Pertuzumab + ...
401                               Durvalumab + Gefitinib
400                             Durvalumab + Osimertinib
520                         FGFR1 inhibitor + Trametinib
521                        FGFR1 inhibitor + Trametinib 
322                            

In [10]:
# Hand-curated GDSC drug names for almanac combination therapies, applied in
# the order listed.
# NOTE(review): the combination listing above also appears to show
# 'FGFR1 inhibitor + Trametinib ' with a trailing space as a separate entry;
# that variant is not mapped here. Confirm whether it should be.
combination_matches = {
    'Buparlisib + Trametinib': ['Buparlisib', 'Trametinib'],
    'Cetuximab + Vemurafenib': ['PLX-4720', 'Cetuximab'],
    'Cobimetinib + Vemurafenib': ['PLX-4720'],
    'Dabrafenib + Trametinib': ['Dabrafenib', 'Trametinib'],
    'Durvalumab + Osimertinib': ['Osimertinib'],
    'FGFR1 inhibitor + Trametinib': ['Trametinib'],
    'GANT61 + Obatoclax': ['Obatoclax Mesylate'],
    'Ipilimumab + Vemurafenib': ['PLX-4720'],
    'Lapatinib + Trastuzumab': ['Lapatinib'],
    'Letrozole + Tamoxifen': ['Tamoxifen'],
    'Neratinib + Vemurafenib': ['PLX-4720'],
    'Selumetinib + Vemurafenib': ['PLX-4720', 'Selumetinib'],
    'Palbociclib + Trametinib': ['Palbociclib', 'Trametinib'],
    'Trametinib + Vemurafenib': ['PLX-4720', 'Trametinib'],
}
for combination, gdsc_names in combination_matches.items():
    dictionary[combination]['gdsc'] = gdsc_names

In [27]:
# Almanac therapies that ended up with at least one GDSC match, in dictionary
# order; print each name and report how many there are.
catalogued_therapies = [
    name for name, entry in dictionary.items() if entry['gdsc']
]
for name in catalogued_therapies:
    print(name)
count = len(catalogued_therapies)
count

5-Fluorouracil
AMG 510
AZD3759
AZD8186
Afatinib
Alectinib
Alpelisib
Bleomycin
Bortezomib
Bosutinib
Buparlisib + Trametinib
Cabozantinib
Cetuximab
Cetuximab + Vemurafenib
Cisplatin
Cobimetinib + Vemurafenib
Crizotinib
Dabrafenib
Dabrafenib + Trametinib
Dasatinib
Durvalumab + Osimertinib
Erlotinib
FGFR1 inhibitor + Trametinib
GANT61 + Obatoclax
Gefitinib
Imatinib
Ipilimumab + Vemurafenib
JQ1
Lapatinib
Lapatinib + Trastuzumab
Lenalidomide
Letrozole + Tamoxifen
MK-2206
Neratinib + Vemurafenib
Nilotinib
Niraparib
Olaparib
Omipalisib
Osimertinib
Palbociclib
Palbociclib + Trametinib
Pazopanib
Pictilisib
Ponatinib
Quizartinib
Rucaparib
Ruxolitinib
SU11274
Selumetinib
Selumetinib + Vemurafenib
Sorafenib
Sunitinib
Talazoparib
Tamoxifen
Trametinib
Trametinib + Vemurafenib
Veliparib
Vemurafenib
nutlin-3


59

## Export

In [12]:
def write(handle, dictionary):
    """Write `dictionary` to the file at `handle` as JSON.

    Keys are sorted and the output is indented by four spaces. An existing
    file at `handle` is overwritten.
    """
    with open(handle, 'w') as json_handle:
        serialized = json.dumps(dictionary, sort_keys=True, indent=4)
        json_handle.write(serialized)

# Persist the almanac-to-GDSC mapping; the relative path resolves against the
# current working directory.
output_handle = 'almanac-gdsc-mappings.json'
write(output_handle, dictionary)

## Count

In [36]:
columns = ['therapy_name', 'therapy_type', 'therapy_sensitivity', 'therapy_resistance']

# Reload the almanac content files with the sensitivity and resistance columns.
# Rows are not de-duplicated here, so every almanac row that names a therapy
# is kept for counting.
almanac = pd.concat(
    [pd.read_csv(handle, sep='\t')[columns] for handle in handles],
    ignore_index=True,
)
almanac = almanac.dropna(subset=['therapy_name']).sort_values('therapy_name')

In [34]:
# Count almanac rows whose therapy was mapped to GDSC and that have a
# non-null therapy_sensitivity value (True) versus all other rows (False).
is_catalogued = almanac['therapy_name'].isin(catalogued_therapies)
has_sensitivity = almanac['therapy_sensitivity'].notnull()
(is_catalogued & has_sensitivity).value_counts()

False    336
True     274
dtype: int64