In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
folder = r'C:\Users\hy929891\OneDrive - University of Cambridge\Projects\Imaging\data'

In [63]:
jdf = pd.read_csv(os.path.join(folder, 'idr0088-screenA-annotation.csv'))
jcids = jdf['Compound PubChem CID'].unique()

## Overlap with Tox21

Tox21 data can be downloaded here:  
https://tripod.nih.gov/tox21/assays/

In [64]:
tox21 = pd.read_csv(os.path.join(folder, 'tox21_10k_library_info.tsv'), sep='\t')
tcids = tox21['PUBCHEM_CID'].unique()

In [76]:
print('Number of Janssen compounds with cids: {}'.format(len(jcids)))
print('Number of tox21 compounds with cids: {}'.format(len(tcids)))
print('Number of overlap compounds by cid: {}'.format(len(np.intersect1d(jcids, tcids))))

Number of Janssen compounds with cids: 1176
Number of tox21 compounds with cids: 8748
Number of overlap compounds by cid: 334


### Use Mitochondrial Toxicity (MT) as a example

The tox21 data from tripod is problematic. For example: in tox21-mitotox-p1.aggregrated.txt, each line is ended with "`\t\t\n`", which means there are two blank filed, however there should be only one blank field, which is PURITY. So we cannot directly use pandas to read the file. I am going to do a tricky thing

In [47]:
mt = pd.read_csv(os.path.join(folder, 'tox21-mitotox-p1.aggregrated.txt'), sep='\t')
columns = mt.columns.copy()
del mt['PURITY']
mt.columns = columns[1:]
mt.index.name = 'SAMPLE_ID'

In [77]:
print("Number of compounds in this enpdoint: {}".format(len(mt[mt['SAMPLE_DATA_TYPE']=='activity'])))

Number of compounds in this enpdoint: 10496


### Overview of the categories in MT

In [73]:
mt[mt['SAMPLE_DATA_TYPE']=='activity'].groupby('ASSAY_OUTCOME').count()

Unnamed: 0_level_0,PROTOCOL_NAME,SAMPLE_DATA_TYPE,CHANNEL_OUTCOME,AC50,EFFICACY,REPRODUCIBILITY,CURVE_RANK,FLAG,CAS,PUBCHEM_CID,PUBCHEM_SID,PURITY_RATING,PURITY_RATING_4M,SAMPLE_NAME,SMILES,TOX21_ID,PURITY
ASSAY_OUTCOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
active agonist,297,297,297,297,297,297,297,45,297,296,297,282,252,297,296,297,0
active antagonist,1263,1263,1263,1263,1263,1263,1263,245,1263,1232,1263,1250,1116,1263,1232,1263,0
inactive,6810,6810,6810,0,6810,6810,6810,624,6810,6728,6810,6625,5250,6810,6729,6810,0
inconclusive,1010,1010,1010,543,1010,1010,1010,133,1010,994,1010,982,810,1010,994,1010,0
inconclusive agonist,377,377,377,201,377,377,377,40,377,369,377,362,298,377,370,377,0
inconclusive agonist (cytotoxic),7,7,7,6,7,7,7,1,7,6,7,7,4,7,7,7,0
inconclusive antagonist,356,356,356,267,356,356,356,66,356,349,356,352,304,356,349,356,0
inconclusive antagonist (cytotoxic),376,376,376,370,376,376,376,52,376,373,376,360,322,376,373,376,0


### Overview of overlap with Janssen

In [74]:
mt[(mt['SAMPLE_DATA_TYPE']=='activity')&(mt['PUBCHEM_CID'].isin(jcids))].groupby('ASSAY_OUTCOME').count()

Unnamed: 0_level_0,PROTOCOL_NAME,SAMPLE_DATA_TYPE,CHANNEL_OUTCOME,AC50,EFFICACY,REPRODUCIBILITY,CURVE_RANK,FLAG,CAS,PUBCHEM_CID,PUBCHEM_SID,PURITY_RATING,PURITY_RATING_4M,SAMPLE_NAME,SMILES,TOX21_ID,PURITY
ASSAY_OUTCOME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
active agonist,30,30,30,30,30,30,30,3,30,29,30,28,26,30,29,30,0
active antagonist,85,85,85,85,85,85,85,16,85,54,85,85,84,85,54,85,0
inactive,374,374,374,0,374,374,374,31,374,292,374,369,334,374,293,374,0
inconclusive,70,70,70,33,70,70,70,8,70,54,70,69,65,70,54,70,0
inconclusive agonist,29,29,29,21,29,29,29,1,29,21,29,28,26,29,22,29,0
inconclusive agonist (cytotoxic),1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,0
inconclusive antagonist,30,30,30,25,30,30,30,5,30,23,30,30,28,30,23,30,0
inconclusive antagonist (cytotoxic),25,25,25,25,25,25,25,2,25,22,25,25,24,25,22,25,0
