## Filtering repoDB to find successful and failed drug repurposing attempts

Data from http://apps.chiragjpgroup.org/repoDB/

Definitions of the clinical-trial terms used in the data are available in the ClinicalTrials.gov glossary:
https://clinicaltrials.gov/ct2/about-studies/glossary

In [1]:
import collections
import re

import pandas as pd

In [2]:
# Read the repoDB export from the working directory.
df = pd.read_csv('repoDB.csv')

# Split on the `status` column and attach a binary label:
# rows whose status is exactly "Approved" get label 1, all other rows get label 0.
success_df = df[df['status'] == 'Approved'].assign(label=1)
fail_df = df[df['status'] != 'Approved'].assign(label=0)

# Preview the first two rows of the unsplit table.
df.head(2)

Unnamed: 0,drug_name,drug_id,ind_name,ind_id,NCT,status,phase,DetailedStatus
0,Lepirudin,DB00001,Heparin-induced thrombocytopenia with thrombosis,C0272275,,Approved,,
1,Cetuximab,DB00002,Squamous cell carcinoma of mouth,C0585362,,Approved,,


### Imperfect manual curation of functional failures

Many clinical trials are not completed for administrative reasons, such as a failure to recruit participants. These trials should be filtered out, because the actual treatment effect remains unknown.

In [3]:
# Lowercase fragments that, when found anywhere in the lowercased
# `DetailedStatus` text, mark the termination reason as administrative
# rather than as a failure of the treatment itself.
_ADMIN_FRAGMENTS = (
    'complete',
    'accru',  # accrual and accrue
    'enrol',
    'fund',
    'recruit',
    'availab',
    'p.i.',
    'insurance',
    'competitor',
    'business',
    'supply',
    'access',
    'eligib',
    'admin',
    'license',
)

# "pi" is matched as a whole word only. A plain substring test also hits
# unrelated words such as "opioid", "therapies", "rapid" or "respiratory",
# which silently discarded those reasons.
# NOTE(review): "PI" presumably stands for principal investigator - confirm.
_PI_WORD = re.compile(r'\bpi\b')

# Statuses excluded only on an exact, case-sensitive match.
_ADMIN_EXACT = ('withdrawn', 'New study written', 'drug now on market')


def _is_functional_failure(detailed_status):
    """Return True when a `DetailedStatus` value looks like a functional failure.

    A value counts as a functional failure when it is not missing, is not one
    of the exact statuses in `_ADMIN_EXACT`, contains none of the fragments in
    `_ADMIN_FRAGMENTS` (case-insensitive), and does not contain "pi" as a
    standalone word (case-insensitive).

    Parameters
    ----------
    detailed_status : object
        One cell of the `DetailedStatus` column; may be NaN.

    Returns
    -------
    bool
    """
    if pd.isna(detailed_status):
        return False
    if any(detailed_status == status for status in _ADMIN_EXACT):
        return False
    text = str(detailed_status).lower()
    if any(fragment in text for fragment in _ADMIN_FRAGMENTS):
        return False
    return _PI_WORD.search(text) is None


# Adds the boolean flag as a new column on `fail_df`; later cells read it.
fail_df['functional_failure'] = fail_df['DetailedStatus'].apply(_is_functional_failure)

In [4]:
# Count each distinct DetailedStatus among rows flagged as functional
# failures, listed from most to least frequent.
collections.Counter(
    fail_df.loc[fail_df['functional_failure'].eq(True), 'DetailedStatus']
).most_common()

[('Temporarily stopped for assessment', 50),
 ('Major revisions needed in study', 48),
 ('Financial Sponsor requested termination', 36),
 ('This study will not be written up.', 35),
 ('Discontinued development of G3139 (oblimersen)', 20),
 ('due to safety concerns and lack of efficacy', 16),
 ('Unacceptable morbidity & mortality', 12),
 ('Increased rate of bacterial infections', 10),
 ('Sponsor withdrew support; Study did not progress to Phase II.', 9),
 ('Trial was terminated to allow sponsors to evaluate the future development of the drug program',
  9),
 ('Sponsor withdrew support; Study did not progress to Phase II (Phase I registration    NCT00895960)',
  9),
 ('The study was not activated.', 8),
 ('See termination reason in detailed description.', 8),
 ('Extreme toxicity of Pertuzumab and Erlotinib combination', 8),
 ('The study was stopped due to the inability to determine an acceptable dose with the potential    for further study',
  8),
 ('Data from the C08 study and Avant stu

In [5]:
# Stack approved and non-approved rows into one table with a fresh index.
repo_df = pd.concat([success_df, fail_df], ignore_index=True, sort=False)

# Cross-reference table fetched from a pinned commit of dhimmel/disease-ontology;
# only the rows whose `resource` is "UMLS" are used for the join.
xref_df = pd.read_table(
    'https://raw.githubusercontent.com/dhimmel/disease-ontology/'
    '75050ea2d4f60e745d3f3578ae03560a2cc0e444/data/xrefs-prop-slim.tsv')
umls_xref_df = xref_df[xref_df['resource'] == 'UMLS']

# Default (inner) merge: rows whose `ind_id` has no matching `resource_id`
# are dropped. The two join-helper columns are removed afterwards.
full_df = repo_df.merge(umls_xref_df, left_on='ind_id', right_on='resource_id')
full_df = full_df.drop(columns=['resource', 'resource_id'])

# Keep the rows that succeeded, plus the failures that the (very simplified)
# heuristics flagged as having a functional reason.
pred_df = full_df[full_df['label'].eq(1) | full_df['functional_failure'].eq(True)]

pred_df.to_csv('repoDB_filtered.tsv', sep='\t', index=False)