In [1]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from io import StringIO
import requests
import os

The code in this notebook is based on the following notebook:

https://github.com/flatkinson/EU-ToxRisk_Tox21/blob/master/0_Tox21_assays.ipynb

In [2]:
# PubChem PUG REST endpoint that returns the assay IDs (AIDs) of the 'tox21' source.
aids_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/sourceall/tox21/aids/JSON'
# Per-assay URL templates; fill the placeholder with .format(aid=<AID>).
tox21_summary_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/summary/JSON"
tox21_description_url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/assay/aid/{aid}/description/JSON'
# Per-assay data-table download (the query string asks for actvty=all).
tox21_data_table = 'https://pubchem.ncbi.nlm.nih.gov/assay/pcget.cgi?query=download&record_type=datatable&actvty=all&aid={aid}'
# Local data directory. The default is the original author's machine-specific
# path; set the TOX21_DATA_DIR environment variable to point somewhere else
# without editing the notebook.
data_dir = os.environ.get(
    'TOX21_DATA_DIR',
    r"C:\Users\hy929891\OneDrive - University of Cambridge\Projects\Imaging\data",
)

In [3]:
# Get list of AIDs associated with Tox21...
# The timeout stops a stalled connection from hanging the kernel, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a confusing JSON/KeyError.
aids_response = requests.get(aids_url, timeout=120)
aids_response.raise_for_status()
aids = aids_response.json()['IdentifierList']['AID']
len(aids)

224

In [4]:
def f(aid):
    """Fetch the PubChem summary of one assay and return its key fields.

    Parameters
    ----------
    aid : int
        PubChem assay ID, substituted into ``tox21_summary_url``.

    Returns
    -------
    tuple
        ``(aid, assay_name, method, target, gene_id, protocol)``.
        ``target`` and ``gene_id`` are both ``''`` when the first target
        entry is absent or lacks either field. ``protocol`` is the first
        protocol entry when the assay name ends with ``': Summary'`` and
        ``''`` otherwise.

    Raises
    ------
    requests.HTTPError
        If PubChem answers with an error status.
    """
    response = requests.get(tox21_summary_url.format(aid=aid), timeout=120)
    response.raise_for_status()
    assay = response.json()['AssaySummaries']['AssaySummary'][0]
    assay_name, method = assay['Name'], assay['Method']
    try:
        first_target = assay['Target'][0]
        gene_id, target = first_target['GI'], first_target['Name']
    except (KeyError, IndexError, TypeError):
        # Only "no usable target" is tolerated here; anything else
        # (including KeyboardInterrupt) must propagate.
        gene_id, target = '', ''
    if assay_name.endswith(': Summary'):
        protocol = assay['Protocol'][0]
    else:
        protocol = ''
    return aid, assay_name, method, target, gene_id, protocol
# One record per Tox21 AID, fetched from PubChem with a progress bar.
assay_records = [f(aid) for aid in tqdm(aids)]
assay_columns = ['AID', 'assay_name', 'method', 'target', 'gene_id', 'protocol']
tox21_assays_df = pd.DataFrame(assay_records, columns=assay_columns)

# Keep only the assays whose method is 'summary'.
is_summary = tox21_assays_df['method'] == 'summary'
tox21_summary_assays_df = tox21_assays_df[is_summary]
print(len(tox21_summary_assays_df))
tox21_summary_assays_df.head()

HBox(children=(IntProgress(value=0, max=224), HTML(value='')))


59


Unnamed: 0,AID,assay_name,method,target,gene_id,protocol
4,1347038,Thyrotropin-releasing hormone receptor (TRHR) ...,summary,Thyrotropin-releasing hormone receptor,464921.0,"Please refer to other AIDs, 1346877, 720680, 7..."
5,1347037,Caspase-3/7 induction in CHO-K1 cells by small...,summary,,,"Please refer to other AIDs,1346978 and 1346979..."
6,1347036,Progesterone receptor (PR) small molecule agon...,summary,progesterone receptor [Homo sapiens],82393684.0,"Please refer to other AIDs 1346784, 1346799, 7..."
7,1347035,"TGF-beta/Smad small molecule agonists, qHTS as...",summary,,,"Please refer to other AIDs 1346859, 1346829, 7..."
8,1347034,Caspase-3/7 induction in HepG2 cells by small ...,summary,caspase-3,16516817.0,"Please refer to other AIDs, 1346978 and 134698..."


In [5]:
# Create the output directory first so the write also works on a fresh checkout.
os.makedirs('scratch', exist_ok=True)
tox21_summary_assays_df.to_csv('scratch/tox21_summary_assays.csv')

In [6]:
# Lookup table whose first CSV column becomes the index; its 'std_inchi_key'
# column is later mapped onto PUBCHEM_CID values.
tox21_inchikey = pd.read_csv(
    'tox21_inchikey.csv',
    index_col=0,
)
# Header-less file, so columns are numbered 0, 1, ...; column 1 is the one
# later matched against 'std_inchi_key'.
janssen_inchikey = pd.read_csv(
    'janssen_inchikey.csv',
    header=None,
)

In [22]:
# Accumulators, filled once per summary assay.
original_agg_data = []      # outcome counts over all de-duplicated compounds (one Series per AID)
janssen_overlap_data = []   # same counts, restricted to compounds found in janssen_inchikey[1]
assay_all = {'aid': [], 'value': [], 'cid': []}  # long-format (AID, outcome, CID) records
i = 0  # NOTE(review): not used in this cell; kept in case another cell reads it.
for _, assay in tqdm(tox21_summary_assays_df.iterrows(), total=len(tox21_summary_assays_df)):
    aid = assay['AID']
    # Fail fast on HTTP errors instead of parsing an error page as CSV.
    response = requests.get(tox21_data_table.format(aid=aid), timeout=300)
    response.raise_for_status()
    res = response.text
    # NOTE(review): .iloc[5:] presumably skips the descriptive rows at the top
    # of the PubChem data table -- confirm the count is the same for every AID.
    # .copy() makes the slice an independent frame so the column assignment
    # below cannot trigger SettingWithCopyWarning.
    act_df = pd.read_csv(StringIO(res)).iloc[5:].copy()
    act_df['std_inchi_key'] = act_df['PUBCHEM_CID'].map(tox21_inchikey['std_inchi_key'])

    # Record every row that has a usable CID. Iterating the two columns
    # directly is much cheaper than DataFrame.iterrows().
    for raw_cid, outcome in zip(act_df['PUBCHEM_CID'], act_df['PUBCHEM_ACTIVITY_OUTCOME']):
        try:
            # cid can be NaN; then remove it.
            cid = int(raw_cid)
        except (TypeError, ValueError, OverflowError):
            continue
        assay_all['cid'].append(cid)
        assay_all['aid'].append(aid)
        assay_all['value'].append(outcome)

    # One row per structure. NOTE(review): rows whose CID has no InChIKey all
    # share a NaN key and therefore collapse into a single row here -- confirm
    # that this is intended.
    act_df = act_df.drop_duplicates('std_inchi_key')

    # size() counts rows per outcome directly, rather than counting the
    # non-null cells of whichever column happens to come first.
    s = act_df.groupby('PUBCHEM_ACTIVITY_OUTCOME').size()
    s.name = aid
    original_agg_data.append(s)

    in_janssen = act_df['std_inchi_key'].isin(janssen_inchikey[1])
    s = act_df[in_janssen].groupby('PUBCHEM_ACTIVITY_OUTCOME').size()
    s.name = aid
    janssen_overlap_data.append(s)

# sort=True pins the outcome labels to sorted order on every pandas version
# and removes the FutureWarning about the changing default.
janssen_overlap_df = pd.concat(janssen_overlap_data, axis=1, sort=True).T
janssen_overlap_df.columns = [x + '_Janssen' for x in janssen_overlap_df.columns]
original_agg_df = pd.concat(original_agg_data, axis=1, sort=True).T
original_agg_df.columns = [x + '_original' for x in original_agg_df.columns]
assay_all_df = pd.DataFrame(assay_all)

HBox(children=(IntProgress(value=0, max=59), HTML(value='')))




of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [23]:
# Create the output directory first so the write also works on a fresh checkout.
os.makedirs('scratch', exist_ok=True)
assay_all_df.to_csv('scratch/assay_all.csv')

# Put the Janssen-overlap and full-set outcome counts side by side (both are
# indexed by AID), then attach the assay metadata by AID.
outcome_counts_df = pd.concat([janssen_overlap_df, original_agg_df], axis=1)
summary_df = outcome_counts_df.join(tox21_summary_assays_df.set_index('AID'))
summary_df.to_csv('janssen_overlap_summary.csv')
summary_df.head()

Unnamed: 0,Active_Janssen,Inactive_Janssen,Inconclusive_Janssen,Active_original,Inactive_original,Inconclusive_original,assay_name,method,target,gene_id,protocol
1347038,4.0,381.0,11.0,102,6962,200,Thyrotropin-releasing hormone receptor (TRHR) ...,summary,Thyrotropin-releasing hormone receptor,464921.0,"Please refer to other AIDs, 1346877, 720680, 7..."
1347037,19.0,363.0,14.0,180,6943,143,Caspase-3/7 induction in CHO-K1 cells by small...,summary,,,"Please refer to other AIDs,1346978 and 1346979..."
1347036,16.0,363.0,16.0,116,6929,221,Progesterone receptor (PR) small molecule agon...,summary,progesterone receptor [Homo sapiens],82393684.0,"Please refer to other AIDs 1346784, 1346799, 7..."
1347035,,392.0,4.0,6,7163,97,"TGF-beta/Smad small molecule agonists, qHTS as...",summary,,,"Please refer to other AIDs 1346859, 1346829, 7..."
1347034,36.0,335.0,25.0,327,6612,326,Caspase-3/7 induction in HepG2 cells by small ...,summary,caspase-3,16516817.0,"Please refer to other AIDs, 1346978 and 134698..."
