In [3]:
import pandas as pd
import glob as glob
import json

def read_json(handle):
    """Load and return the parsed contents of a JSON file.

    Parameters
    ----------
    handle : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (dict, list, ...).
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's default
    # text encoding, which differs between operating systems and locales.
    with open(handle, encoding='utf-8') as f:
        return json.load(f)

## Therapies

In [4]:
# Load the two external snapshots used throughout this section: the CIViC
# evidence items (parsed JSON) and the OncoKB association table (TSV).
civic = read_json('sources/evidence_items.civic.2021-02-04.json')
oncokb = pd.read_csv(
    'sources/oncokb_biomarker_drug_associations.2021-02-04.expanded.tsv',
    sep='\t',
)

In [6]:
# Collect the name of every drug attached to a CIViC evidence item.
# Names are lower-cased BEFORE de-duplicating so that spellings differing only
# in case (e.g. 'Imatinib' / 'imatinib') collapse to a single entry;
# de-duplicating first would leave repeated labels in this list and, further
# down, in the index built from it.
civic_drugs = sorted({
    drug['name'].lower()
    for item in civic
    for drug in item['drugs']
})

In [7]:
# Gather the `therapy_name` column from every file in the MOAlmanac content
# directory and stack the values into a single Series.
handles = glob.glob('../moalmanac-db/content/*')
therapy_frames = [
    pd.read_csv(handle, sep='\t', usecols=['therapy_name'])
    for handle in handles
]
almanac = pd.concat(therapy_frames).loc[:, 'therapy_name']

# Entries joined with ' + ' are split so that each component is listed on its
# own; the result is the sorted set of distinct lower-cased names.
therapy_entries = almanac.str.lower().dropna().tolist()
almanac_drugs = sorted({
    component
    for entry in therapy_entries
    for component in entry.split(' + ')
})

In [8]:
# Distinct, lower-cased entries of the OncoKB drug column, in sorted order.
drug_column = oncokb['Drugs (for therapeutic implications only)']
drug_entries = (
    drug_column
    .dropna()
    .drop_duplicates()
    .sort_values()
    .str.lower()
    .tolist()
)

# Break ' + '-joined entries into their components, then keep each component
# once. The strings are already lower case at this point.
oncokb_drugs_split = [
    component
    for entry in drug_entries
    for component in entry.split(' + ')
]
oncokb_drugs = sorted(set(oncokb_drugs_split))

In [9]:
# One pandas Index per source, plus the union of all three, which becomes the
# row index of the comparison table.
idx_civic, idx_oncokb, idx_almanac = (
    pd.Index(names) for names in (civic_drugs, oncokb_drugs, almanac_drugs)
)
idx_all = idx_civic.union(idx_oncokb).union(idx_almanac)

In [10]:
# Presence/absence table: one row per drug, one 0/1 column per source.
df = pd.DataFrame(0, columns=['CIViC', 'MOAlmanac', 'OncoKB'], index=idx_all)
df.loc[idx_civic, 'CIViC'] = 1
df.loc[idx_almanac, 'MOAlmanac'] = 1
df.loc[idx_oncokb, 'OncoKB'] = 1

# Print the number of drugs per source. Summing the boolean mask gives 0 for a
# source with no entries, whereas `value_counts()[True]` raises a KeyError
# when no True value is present.
for column in df.columns:
    print(column, df[column].eq(1).sum())

df.to_csv('drug-comparison.txt', sep='\t', index_label='drug')

CIViC 459
MOAlmanac 125
OncoKB 90


## Genes or features

In [12]:
# Rebind `civic` to the variants export and keep the distinct `entrez_name`
# values in sorted order.
civic = read_json('sources/variants.civic.2021-02-04.json')
civic_genes = sorted({variant['entrez_name'] for variant in civic})

In [13]:
# Locate the MOAlmanac content files relative to this notebook, like the other
# cells that read this directory, instead of through an absolute path that
# only exists on one machine.
content_dir = '../moalmanac-db/content'

# Feature types whose table is read through a single `gene` column.
# Rearrangements are handled separately below because that table is read
# through `gene1` / `gene2` instead.
single_gene_types = [
    'silencing',
    'germline_variant',
    'knockdown',
    'somatic_variant',
    'copy_number',
]
handles = [
    '{}/{}.tsv'.format(content_dir, feature_type)
    for feature_type in single_gene_types
]

almanac = pd.concat([pd.read_csv(handle, sep='\t', usecols=['gene']) for handle in handles]).loc[:, 'gene']
fusions = pd.read_csv('{}/rearrangement.tsv'.format(content_dir), sep='\t', usecols=['gene1', 'gene2'])

# Both rearrangement partners count as genes covered by MOAlmanac.
almanac = pd.concat([almanac, fusions['gene1'], fusions['gene2']], ignore_index=True)
almanac_genes = almanac.dropna().drop_duplicates().sort_values().tolist()

In [14]:
# Distinct values of the OncoKB `Gene` column in sorted order, leaving out
# rows whose value is 'Other Biomarkers'.
is_gene_row = oncokb['Gene'].ne('Other Biomarkers')
oncokb_genes = oncokb.loc[is_gene_row, 'Gene'].sort_values().unique().tolist()

In [15]:
# One Index per source and their union, which indexes the comparison table.
idx_civic = pd.Index(civic_genes)
idx_oncokb = pd.Index(oncokb_genes)
idx_almanac = pd.Index(almanac_genes)
idx_all = idx_civic.union(idx_oncokb).union(idx_almanac)

# Presence/absence table: one row per gene, one 0/1 column per source.
df = pd.DataFrame(0, columns=['CIViC', 'MOAlmanac', 'OncoKB'], index=idx_all)
df.loc[idx_civic, 'CIViC'] = 1
df.loc[idx_almanac, 'MOAlmanac'] = 1
df.loc[idx_oncokb, 'OncoKB'] = 1

# Print the number of genes per source. Summing the boolean mask gives 0 for a
# source with no entries, whereas `value_counts()[True]` raises a KeyError
# when no True value is present.
for column in df.columns:
    print(column, df[column].eq(1).sum())

df.to_csv('gene-comparison.txt', sep='\t', index_label='gene')

CIViC 440
MOAlmanac 139
OncoKB 155


## PubMed IDs
We compare the union and intersection of PMIDs cited by the Molecular Oncology Almanac (MOAlmanac), CIViC, and OncoKB. Not every source in MOAlmanac is associated with a PMID, as we directly cite FDA approvals and guidelines for some references. Likewise, OncoKB and CIViC also cite conference abstracts, which do not always have PMIDs.

To avoid directly hosting the databases used, the raw OncoKB and CIViC files are not included in the GitHub repository, but I do have a dataframe committed, `pmids.txt`. `pmid-comparison.png` is then produced with `pmid-comparison.R`.

In [21]:
# Sorted, de-duplicated integer citation ids of the CIViC sources whose
# `source_type` is 'PubMed'.
civic_pmids = (
    pd.read_csv('sources/pmid.civic.2021-02-04.txt', sep='\t')
    .loc[lambda frame: frame['source_type'] == 'PubMed', 'citation_id']
    .drop_duplicates()
    .astype(int)
    .sort_values()
    .tolist()
)

In [23]:
# Only the `pmid` column is needed, so request just that column from each
# MOAlmanac content file, as the other cells reading this directory do with
# `usecols`.
handles = glob.glob('../moalmanac-db/content/*')
almanac = pd.concat(
    [pd.read_csv(handle, sep='\t', usecols=['pmid']) for handle in handles],
    ignore_index=True,
)

# Rows without a PMID are dropped before the integer cast, which would fail
# on missing values.
almanac_pmids = almanac['pmid'].dropna().astype(int).drop_duplicates().sort_values().tolist()

In [43]:
oncokb = pd.read_csv('sources/oncokb_biomarker_drug_associations.2021-02-04.expanded.tsv', sep='\t')

# Each `PMIDs` cell is treated as a comma-separated list: missing cells become
# the empty string, spaces are removed, and the text is split on ','.
series = (
    oncokb
    .loc[:, 'PMIDs']
    .fillna('')
    .astype(str)
    .str.replace(' ', '', regex=False)
    .str.split(',')
)

# Flatten to the sorted set of distinct PMID strings. Empty tokens (from
# missing cells or stray commas) are filtered out here; calling
# `list.remove('')` afterwards would raise ValueError whenever no empty token
# happened to be present.
oncokb_pmids = sorted({
    pmid
    for pmid_list in series
    for pmid in pmid_list
    if pmid
})

In [44]:
def _pmid_index(pmids):
    """Return `pmids` as a pandas Index of ints, ignoring embedded spaces.

    Parameters
    ----------
    pmids : list
        PMIDs given as ints or as strings of digits.

    Returns
    -------
    pandas.Index
        The same PMIDs converted to integers.
    """
    # `.str.replace` edits substrings. Plain `Series.replace(' ', '')` only
    # swaps values that are exactly ' ', so it never removed embedded spaces.
    cleaned = pd.Series(pmids).astype(str).str.replace(' ', '', regex=False)
    return pd.Index(cleaned.astype(int).tolist())


idx_civic = _pmid_index(civic_pmids)
idx_moalmanac = _pmid_index(almanac_pmids)
idx_oncokb = _pmid_index(oncokb_pmids)
idx_all = idx_civic.union(idx_moalmanac).union(idx_oncokb)

# Presence/absence table: one row per PMID, one 0/1 column per source.
df = pd.DataFrame(0, columns=['CIViC', 'MOAlmanac', 'OncoKB'], index=idx_all)
df.loc[idx_civic, 'CIViC'] = 1
df.loc[idx_moalmanac, 'MOAlmanac'] = 1
df.loc[idx_oncokb, 'OncoKB'] = 1

# Print the number of PMIDs per source. Summing the boolean mask gives 0 for a
# source with no entries, whereas `value_counts()[True]` raises a KeyError
# when no True value is present.
for column in df.columns:
    print(column, df[column].eq(1).sum())

df.to_csv('pmid-comparison.txt', sep='\t', index_label='pmid')

CIViC 2785
MOAlmanac 146
OncoKB 896
