# QSAR Analysis of Telomerase Inhibitors Part 1 - Data Acquisition

Import libraries

In [36]:
import os

import pandas as pd
from chembl_webresource_client.new_client import new_client

Search ChEMBL for targets matching "telomerase"

In [37]:
# Run a keyword search against the ChEMBL target endpoint and tabulate the
# hits so the target of interest can be picked from the resulting table.
target = new_client.target
target_query = target.search("telomerase")
targets = pd.DataFrame(target_query)
targets

Unnamed: 0,cross_references,organism,pref_name,score,species_group_flag,target_chembl_id,target_components,target_type,tax_id
0,"[{'xref_id': 'O14746', 'xref_name': None, 'xre...",Homo sapiens,Telomerase reverse transcriptase,21.0,False,CHEMBL2916,"[{'accession': 'O14746', 'component_descriptio...",SINGLE PROTEIN,9606
1,[],Rattus norvegicus,Telomerase reverse transcriptase,21.0,False,CHEMBL3108654,"[{'accession': 'Q673L6', 'component_descriptio...",SINGLE PROTEIN,10116
2,[],Homo sapiens,Prostaglandin E synthase 3,13.0,False,CHEMBL3341580,"[{'accession': 'Q15185', 'component_descriptio...",SINGLE PROTEIN,9606


Select and retrieve bioactivity data for Telomerase reverse transcriptase in Homo sapiens

In [38]:
# Pick the human telomerase reverse transcriptase entry by its attributes
# rather than by row position, so the choice does not depend on the order in
# which the search happens to return its hits. If no row matches, `.iloc[0]`
# raises IndexError instead of silently selecting a different target.
is_human_tert = (
    (targets.organism == "Homo sapiens")
    & (targets.pref_name == "Telomerase reverse transcriptase")
)
selected_target = targets.loc[is_human_tert, "target_chembl_id"].iloc[0]
selected_target

'CHEMBL2916'

Retrieve the bioactivity data that are reported as IC50 values

In [39]:
# Query the activity endpoint for the selected target, restrict the records
# to those whose standard type is IC50, and load them into a DataFrame.
activity = new_client.activity
target_activities = activity.filter(target_chembl_id=selected_target)
res = target_activities.filter(standard_type="IC50")
df = pd.DataFrame(res)
df

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,106658,[],CHEMBL813598,Inhibitory activity against telomerase extract...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,7.3
1,,106659,[],CHEMBL813598,Inhibitory activity against telomerase extract...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,100.0
2,,107796,[],CHEMBL813598,Inhibitory activity against telomerase extract...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,100.0
3,,107797,[],CHEMBL813598,Inhibitory activity against telomerase extract...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,9.5
4,,107798,[],CHEMBL813598,Inhibitory activity against telomerase extract...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,,20639333,[],CHEMBL4612658,Inhibition of telomerase in human PC3 cell-fre...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,9.1
894,,20639334,[],CHEMBL4612658,Inhibition of telomerase in human PC3 cell-fre...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,5.6
895,,23172268,[],CHEMBL4814656,Inhibition of telomerase in human SGC-7901 cel...,B,,,BAO_0000190,BAO_0000221,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,0.98
896,,23172269,[],CHEMBL4814656,Inhibition of telomerase in human SGC-7901 cel...,B,,,BAO_0000190,BAO_0000221,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,1.92


Save the result as csv

In [40]:
# This is the first write into `datasets/`; create the directory if it is
# missing so the notebook also runs from a fresh checkout.
os.makedirs('datasets', exist_ok=True)
# Persist the unfiltered download so later steps can be re-run offline.
df.to_csv('datasets/telomerase_01_raw.csv', index=False)

Drop rows with missing values and duplicate compounds

In [41]:
# Keep only records that have both a measured value and a structure, then
# keep the first record for each distinct SMILES string.
# The previous version filtered `df_2` with a mask computed on `df`, which
# only worked because pandas re-aligned the mask by index; `dropna` with
# `subset` applies both conditions to the same frame.
df_2 = (
    df
    .dropna(subset=['standard_value', 'canonical_smiles'])
    .drop_duplicates(subset=['canonical_smiles'])
)
df_2

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,106658,[],CHEMBL813598,Inhibitory activity against telomerase extract...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,7.3
1,,106659,[],CHEMBL813598,Inhibitory activity against telomerase extract...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,100.0
2,,107796,[],CHEMBL813598,Inhibitory activity against telomerase extract...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,100.0
3,,107797,[],CHEMBL813598,Inhibitory activity against telomerase extract...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,9.5
4,,107798,[],CHEMBL813598,Inhibitory activity against telomerase extract...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
892,,20639332,[],CHEMBL4612658,Inhibition of telomerase in human PC3 cell-fre...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,6.0
893,,20639333,[],CHEMBL4612658,Inhibition of telomerase in human PC3 cell-fre...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,9.1
894,,20639334,[],CHEMBL4612658,Inhibition of telomerase in human PC3 cell-fre...,B,,,BAO_0000190,BAO_0000219,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,5.6
895,,23172268,[],CHEMBL4814656,Inhibition of telomerase in human SGC-7901 cel...,B,,,BAO_0000190,BAO_0000221,...,Homo sapiens,Telomerase reverse transcriptase,9606,,,IC50,uM,UO_0000065,,0.98


Create a dataframe with only three columns (molecule_chembl_id, canonical_smiles, and standard_value)

In [42]:
# Narrow the table to the compound identifier, its structure, and the
# measured value; every other column is dropped from this point on.
columns = [
    'molecule_chembl_id',
    'canonical_smiles',
    'standard_value',
]
df_3 = df_2.loc[:, columns]
df_3

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value
0,CHEMBL314057,CCN(CC)CC.O=C(N[C@H](Cc1c[nH]c2ccccc12)C(=O)NC...,7300.0
1,CHEMBL266842,CC(C)(C)OC(=O)N[C@H](Cc1cc2ccccc2[nH]1)C(=O)N[...,100000.0
2,CHEMBL314847,CC(C)(C)OC(=O)N[C@H](Cc1c[nH]c2ccccc12)C(=O)NC...,100000.0
3,CHEMBL86984,CC(C)(C)OC(=O)N1CCC[C@@H]1C(=O)N[C@H](Cc1c[nH]...,9500.0
4,CHEMBL87554,CCN(CC)CC.N[C@H](Cc1cc2ccccc2[nH]1)C(=O)N[C@H]...,100000.0
...,...,...,...
892,CHEMBL4632724,Cc1cn([C@H]2C[C@H](n3cc(COC(=O)c4ccc(S(N)(=O)=...,6000.0
893,CHEMBL4637598,Cc1cn([C@H]2C[C@H](n3cc(CNC(=O)c4ccc(S(N)(=O)=...,9100.0
894,CHEMBL4634543,Cc1cn([C@H]2C[C@H](n3cc(C[Se]c4ccc(NC(=S)Nc5cc...,5600.0
895,CHEMBL4857212,O=C(OCCn1c([N+](=O)[O-])cnc1/C=C/c1ccc2ccccc2c...,980.0


Save new dataframe as csv

In [43]:
# Write the three-column table to disk without the DataFrame index.
df_3.to_csv(
    path_or_buf="datasets/telomerase_02_preprocessed.csv",
    index=False,
)

Label each compound as active (IC50 ≤ 1,000 nM), inactive (IC50 ≥ 10,000 nM), or intermediate (anything in between)

In [44]:
# Reload the preprocessed table from disk. Because the file was written
# without its index, the reloaded frame is numbered from 0 again.
df_4 = pd.read_csv(filepath_or_buffer="datasets/telomerase_02_preprocessed.csv")

In [45]:
# Class boundaries on standard_value (nM, judging by the 7.3 uM -> 7300.0
# conversion visible in the outputs above).
INACTIVE_MIN = 10000  # at or above this value -> "inactive"
ACTIVE_MAX = 1000     # at or below this value -> "active"

# Vectorised labelling: start with every compound as "intermediate", then
# overwrite the rows that reach either boundary. The two conditions cannot
# both hold, so the order of the two `mask` calls does not matter.
ic50_values = df_4.standard_value.astype(float)
bioactivity_labels = (
    pd.Series("intermediate", index=ic50_values.index)
    .mask(ic50_values >= INACTIVE_MIN, "inactive")
    .mask(ic50_values <= ACTIVE_MAX, "active")
)

# Downstream cells expect a plain list with one label per row of df_4.
bioactivity_threshold = bioactivity_labels.tolist()

In [46]:
# Wrap the labels in a Series named 'class' and attach it as a new column.
# Both objects are numbered from 0, so the side-by-side concatenation pairs
# each label with the row it was computed from.
bioactivity_class = pd.Series(data=bioactivity_threshold, name='class')
labelled_parts = [df_4, bioactivity_class]
df_5 = pd.concat(labelled_parts, axis="columns")
df_5

Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,class
0,CHEMBL314057,CCN(CC)CC.O=C(N[C@H](Cc1c[nH]c2ccccc12)C(=O)NC...,7300.0,intermediate
1,CHEMBL266842,CC(C)(C)OC(=O)N[C@H](Cc1cc2ccccc2[nH]1)C(=O)N[...,100000.0,inactive
2,CHEMBL314847,CC(C)(C)OC(=O)N[C@H](Cc1c[nH]c2ccccc12)C(=O)NC...,100000.0,inactive
3,CHEMBL86984,CC(C)(C)OC(=O)N1CCC[C@@H]1C(=O)N[C@H](Cc1c[nH]...,9500.0,intermediate
4,CHEMBL87554,CCN(CC)CC.N[C@H](Cc1cc2ccccc2[nH]1)C(=O)N[C@H]...,100000.0,inactive
...,...,...,...,...
660,CHEMBL4632724,Cc1cn([C@H]2C[C@H](n3cc(COC(=O)c4ccc(S(N)(=O)=...,6000.0,intermediate
661,CHEMBL4637598,Cc1cn([C@H]2C[C@H](n3cc(CNC(=O)c4ccc(S(N)(=O)=...,9100.0,intermediate
662,CHEMBL4634543,Cc1cn([C@H]2C[C@H](n3cc(C[Se]c4ccc(NC(=S)Nc5cc...,5600.0,intermediate
663,CHEMBL4857212,O=C(OCCn1c([N+](=O)[O-])cnc1/C=C/c1ccc2ccccc2c...,980.0,active


Save new dataframe as csv

In [47]:
# Write the labelled table to disk without the DataFrame index.
df_5.to_csv(
    path_or_buf="datasets/telomerase_03_curated.csv",
    index=False,
)