In [1]:
import os
import pandas as pd
import numpy as np
import sys
sys.path.append('../Chemistry')
from standardise import *

In [2]:
# Directory that holds the input files read by this notebook (the IDR0088
# annotation CSV and the Tox21 files). Set the IMAGING_DATA_DIR environment
# variable to point at your own copy of the data; when it is unset, the
# original author's local path is used as the fallback.
folder = os.environ.get(
    'IMAGING_DATA_DIR',
    r'C:\Users\hy929891\OneDrive - University of Cambridge\Projects\Imaging\data',
)

In [3]:
# Load the compound annotation table of the Janssen screen (IDR0088, screen A).
jdf = pd.read_csv(os.path.join(folder, 'idr0088-screenA-annotation.csv'))
# Distinct PubChem CIDs present in the annotation. Series.unique() keeps NaN
# as a value of its own, so rows without a CID are dropped first; otherwise a
# missing CID would be counted as one extra "compound with cid".
jcids = jdf['Compound PubChem CID'].dropna().unique()

## Overlap with Tox21

Tox21 data can be downloaded here:  
https://tripod.nih.gov/tox21/assays/

In [4]:
# Load the Tox21 10K library description (tab-separated).
tox21 = pd.read_csv(os.path.join(folder, 'tox21_10k_library_info.tsv'), sep='\t')
# Distinct PubChem CIDs in the library. NaN is removed before unique() so that
# entries without a CID are not counted as a compound "with cid".
tcids = tox21['PUBCHEM_CID'].dropna().unique()

In [5]:
# Report the size of each CID set and of their intersection, one line each.
print(
    'Number of Janssen compounds with cids: {}'.format(len(jcids)),
    'Number of tox21 compounds with cids: {}'.format(len(tcids)),
    'Number of overlap compounds by cid: {}'.format(len(np.intersect1d(jcids, tcids))),
    sep='\n',
)

Number of Janssen compounds with cids: 1176
Number of tox21 compounds with cids: 8748
Number of overlap compounds by cid: 334


### Use Mitochondrial Toxicity (MT) as an example

The Tox21 data from Tripod is malformed. For example, in tox21-mitotox-p1.aggregrated.txt every line ends with "`\t\t\n`", which means there are two trailing blank fields, whereas there should be only one blank field (PURITY). As a result, pandas cannot read the file directly with the columns correctly aligned, so the next cell uses a workaround: read the file as it is, then drop the surplus column and shift the column names.

In [43]:
# Every data row carries one more field than the header, so pandas takes the
# first field as the row index and each header name ends up labelling the
# field to its right. The steps below undo that shift.
mt = pd.read_csv(
    os.path.join(folder, 'tox21-mitotox-p1.aggregrated.txt'),
    sep='\t',
)
# Keep the header as read, before any column is removed.
columns = mt.columns.copy()
# The column currently labelled 'PURITY' is the surplus trailing field.
mt = mt.drop(columns='PURITY')
# Shift the names one position left: the first header name belongs to the index.
mt.columns = columns[1:]
mt = mt.rename_axis(index='SAMPLE_ID')
# Keep only the records that have a PubChem CID.
mt = mt.dropna(subset=['PUBCHEM_CID'])

In [7]:
# Count the rows of `mt` whose SAMPLE_DATA_TYPE is 'activity'.
# NOTE(review): this is a row count, so a compound that occurs in several rows
# is counted once per row — confirm that this is the intended figure.
print("Number of compounds in this endpoint: {}".format(len(mt[mt['SAMPLE_DATA_TYPE'] == 'activity'])))

Number of compounds in this enpdoint: 10496


### Overview of the categories in MT

In [44]:
# Keep the 'activity' rows only, then count the non-null entries of every
# column within each ASSAY_OUTCOME category.
(
    mt.loc[mt['SAMPLE_DATA_TYPE'].eq('activity')]
    .groupby('ASSAY_OUTCOME')
    .count()
)

Unnamed: 0_level_0,PROTOCOL_NAME,SAMPLE_DATA_TYPE,CHANNEL_OUTCOME,AC50,EFFICACY,REPRODUCIBILITY,CURVE_RANK,FLAG,CAS,PUBCHEM_CID,PUBCHEM_SID,PURITY_RATING,PURITY_RATING_4M,SAMPLE_NAME,SMILES,TOX21_ID,PURITY
ASSAY_OUTCOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
active agonist,296,296,296,296,296,296,296,45,296,296,296,281,252,296,296,296,0
active antagonist,1232,1232,1232,1232,1232,1232,1232,244,1232,1232,1232,1219,1085,1232,1232,1232,0
inactive,6728,6728,6728,0,6728,6728,6728,619,6728,6728,6728,6546,5178,6728,6728,6728,0
inconclusive,994,994,994,537,994,994,994,132,994,994,994,967,797,994,994,994,0
inconclusive agonist,369,369,369,195,369,369,369,40,369,369,369,354,291,369,369,369,0
inconclusive agonist (cytotoxic),6,6,6,5,6,6,6,1,6,6,6,6,3,6,6,6,0
inconclusive antagonist,349,349,349,260,349,349,349,66,349,349,349,345,297,349,349,349,0
inconclusive antagonist (cytotoxic),373,373,373,367,373,373,373,52,373,373,373,357,319,373,373,373,0


### Overview of overlap with Janssen in terms of PUBCHEM_CID

In [45]:
# Same per-outcome count, restricted to the 'activity' rows whose PubChem CID
# also occurs among the Janssen CIDs.
(
    mt.loc[mt['SAMPLE_DATA_TYPE'].eq('activity') & mt['PUBCHEM_CID'].isin(jcids)]
    .groupby('ASSAY_OUTCOME')
    .count()
)

Unnamed: 0_level_0,PROTOCOL_NAME,SAMPLE_DATA_TYPE,CHANNEL_OUTCOME,AC50,EFFICACY,REPRODUCIBILITY,CURVE_RANK,FLAG,CAS,PUBCHEM_CID,PUBCHEM_SID,PURITY_RATING,PURITY_RATING_4M,SAMPLE_NAME,SMILES,TOX21_ID,PURITY
ASSAY_OUTCOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
active agonist,29,29,29,29,29,29,29,3,29,29,29,27,26,29,29,29,0
active antagonist,54,54,54,54,54,54,54,15,54,54,54,54,53,54,54,54,0
inactive,292,292,292,0,292,292,292,26,292,292,292,290,262,292,292,292,0
inconclusive,54,54,54,27,54,54,54,7,54,54,54,54,52,54,54,54,0
inconclusive agonist,21,21,21,15,21,21,21,1,21,21,21,20,19,21,21,21,0
inconclusive antagonist,23,23,23,18,23,23,23,5,23,23,23,23,21,23,23,23,0
inconclusive antagonist (cytotoxic),22,22,22,22,22,22,22,2,22,22,22,22,21,22,22,22,0


### Overview of overlap with Janssen in terms of InChIKey

In [10]:
# Re-binds `tox21` (until here the library table) to a lookup table read from
# the current working directory; the first CSV column becomes the index.
# NOTE(review): the index presumably holds PubChem CIDs, because the table is
# later used to map PUBCHEM_CID values — confirm.
tox21 = pd.read_csv(filepath_or_buffer='tox21_inchikey.csv', index_col=0)

In [16]:
# Distinct SMILES strings of the Janssen compounds.
smis = pd.unique(jdf['Compound SMILES'])
# The series is indexed by the SMILES itself, so every result stays traceable
# to the string it came from.
# NOTE(review): read_standardise presumably parses and standardises one SMILES
# into a molecule object — confirm against the standardise module.
mols = pd.Series(data=smis, index=smis).apply(read_standardise)
# NOTE(review): `Chem` is not imported explicitly in this notebook; it
# presumably arrives through `from standardise import *` — confirm.
inchi_keys = mols.apply(Chem.MolToInchiKey)



In [46]:
# Attach an InChIKey to every record by looking its PubChem CID up in the
# index of `tox21`; CIDs without an entry there get NaN.
mt['std_inchi_key'] = mt.loc[:, 'PUBCHEM_CID'].map(tox21.loc[:, 'std_inchi_key'])

In [47]:
# Per-outcome count of the 'activity' rows whose InChIKey matches one of the
# InChIKeys derived from the Janssen SMILES.
(
    mt.loc[mt['SAMPLE_DATA_TYPE'].eq('activity') & mt['std_inchi_key'].isin(inchi_keys.values)]
    .groupby('ASSAY_OUTCOME')
    .count()
)

Unnamed: 0_level_0,PROTOCOL_NAME,SAMPLE_DATA_TYPE,CHANNEL_OUTCOME,AC50,EFFICACY,REPRODUCIBILITY,CURVE_RANK,FLAG,CAS,PUBCHEM_CID,PUBCHEM_SID,PURITY_RATING,PURITY_RATING_4M,SAMPLE_NAME,SMILES,TOX21_ID,PURITY,std_inchi_key
ASSAY_OUTCOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
active agonist,51,51,51,51,51,51,51,5,51,51,51,49,45,51,51,51,0,51
active antagonist,72,72,72,72,72,72,72,22,72,72,72,72,70,72,72,72,0,72
inactive,413,413,413,0,413,413,413,40,413,413,413,411,363,413,413,413,0,413
inconclusive,77,77,77,38,77,77,77,13,77,77,77,77,75,77,77,77,0,77
inconclusive agonist,38,38,38,28,38,38,38,2,38,38,38,37,35,38,38,38,0,38
inconclusive antagonist,27,27,27,20,27,27,27,7,27,27,27,27,25,27,27,27,0,27
inconclusive antagonist (cytotoxic),25,25,25,25,25,25,25,2,25,25,25,25,24,25,25,25,0,25
