# Prediction of pharmacogenomic associations pipeline

This chemoinformatic and bioinformatic pipeline uses datasets to select conserved druggable modules between yeasts and Trypanosomes

----

## Import modules and functions

In [59]:
import pandas as pd
import os
import pipeline_functions
from importlib import reload # reload 
reload(pipeline_functions)

<module 'pipeline_functions' from '/big/lab/mercedesdg/yeast-repo/yeast_repo_pipeline/pipeline_functions.py'>

## Import datasets

In [60]:
#Data import
gdi = pd.read_csv("data/gene_drug_interaction",sep='\t')

#Data for genes
yeast_genes_data = pipeline_functions.read_dataframe_columns("data/yeast.genes.list",'\t',['species_code', 'gene_name', 'omcl'])
tryp_genes_data = pipeline_functions.read_dataframe_columns("data/tryp.genes.list",'\t',['species_code', 'gene_name', 'omcl'])
essential_gene_data = pipeline_functions.read_dataframe_columns('data/tbr.essentialOrthologs.list',',')
                                     
#Data for compounds
gdi_inchikey_data = pipeline_functions.smiles_to_inchikey(gdi,'smiles')
gdi_inchikey_data = gdi_inchikey_data[['orf','inchiKey']]                               
tested_compounds_data = pipeline_functions.read_dataframe_columns('data/tested_drugs_inchikey.csv',',')
available_compounds_data = pipeline_functions.read_dataframe_columns('data/drug_availability_inchikey.csv',',')


[15:38:42] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom # 0 Cl, 1, is greater than permitted
[15:38:43] Explicit valence for atom #

# Genes filtering

In [61]:
# Assign OMCL to S. cerevisiae genes
genes_data_yeast_intersection = pipeline_functions.data_intersection(gdi_inchikey_data,yeast_genes_data,'orf','gene_name',"OMCL in Yeast",'orf')
genes_data_yeast = genes_data_yeast_intersection[0]
genes_data_yeast = genes_data_yeast[['gene_name','omcl','inchiKey']]
genes_data_yeast = genes_data_yeast.drop_duplicates()
genes_data_yeast = genes_data_yeast.dropna()


In [62]:
# Select T. cruzi genes with shared OMCL
genes_data_yeast_tryp_intersection = pipeline_functions.data_intersection(genes_data_yeast ,tryp_genes_data,'omcl','omcl',"OMCL in Tryps", "omcl")
genes_data_yeast_tryp = genes_data_yeast_tryp_intersection[0]
genes_data_yeast_tryp = genes_data_yeast_tryp[['gene_name_y','omcl','inchiKey']]
genes_data_yeast_tryp = genes_data_yeast_tryp.drop_duplicates()
genes_data_yeast_tryp = genes_data_yeast_tryp.dropna()

In [63]:
# Select Essential genes
genes_data_yeast_tryp_essential_intersection = pipeline_functions.data_intersection(genes_data_yeast_tryp,essential_gene_data,'omcl','omcl',"Essentiality", "omcl")
genes_data_yeast_tryp_essential = genes_data_yeast_tryp_essential_intersection[0]

In [64]:
genes_data_yeast_tryp_essential

Unnamed: 0,omcl,gene_name_y,inchiKey
0,OG5_127262,YHR010W,BOFQWVMAQOTZIW-UHFFFAOYSA-N
1,OG5_127262,YHR010W,FODFUEDBIXOGNY-AKXQMUJXSA-N
2,OG5_127262,YHR010W,RTKIYFITIVXBLE-WKWSCTOISA-N
3,OG5_127262,YHR010W,ISIMKBKTIVYXIT-PQSRJMQZSA-N
4,OG5_127262,YHR010W,HKVAMNSJSFKALM-GKUWKFKPSA-N
...,...,...,...
128824,OG5_126984,YJL138C,FAPWRFPIFSIZLT-UHFFFAOYSA-M
128825,OG5_126984,YJL138C,ALYNCZNDIQEVRV-UHFFFAOYSA-N
128826,OG5_126984,YJL138C,PICXIOQBANWBIZ-UHFFFAOYSA-N
128827,OG5_126984,YJL138C,VQQVWGVXDIPORV-UHFFFAOYSA-N


In [65]:
# Set promiscuity threshold
umbral=200
genes_count = genes_data_yeast_tryp_essential.groupby('gene_name_y').nunique()
genes_count = genes_count.sort_values(by=['inchiKey'])
genes_count = genes_count[genes_count['inchiKey']<umbral]
genes_count = genes_count.unstack(level=-1).reset_index().drop('level_0', axis=1)
genes_count = genes_count.drop_duplicates()
gene_selection = pd.merge(left=gdi_inchikey_data,right=genes_count,how="inner",left_on=['orf'], right_on=['gene_name_y'])
gene_selection = gene_selection[['orf','inchiKey']]

In [66]:
#Create a df with deleted data
not_selected_genes = pd.concat([genes_data_yeast_intersection[1], genes_data_yeast_tryp_intersection[1], genes_data_yeast_tryp_essential_intersection[1]], ignore_index=False)

In [67]:
#See results for selected data
full_dataset_length = gdi_inchikey_data['orf'].nunique()
assign_omcl_length = genes_data_yeast['gene_name'].nunique()
select_t_cruzi_length = genes_data_yeast_tryp['gene_name_y'].nunique()
select_essential_length = genes_data_yeast_tryp_essential['gene_name_y'].nunique()
delete_promiscuous_length = gene_selection['orf'].nunique()

results_selected_genes = pd.DataFrame({
    'Filter': ['Full Dataset Length', 'Assign OMCL to S. cerevisiae genes',
                'Select T. cruzi genes with shared OMCL',
                'Select Essential genes', 'Delete Promiscuous genes'],
    'Count': [full_dataset_length, assign_omcl_length, select_t_cruzi_length,
              select_essential_length, delete_promiscuous_length]
})

print(results_selected_genes)

                                   Filter  Count
0                     Full Dataset Length   5811
1      Assign OMCL to S. cerevisiae genes   5008
2  Select T. cruzi genes with shared OMCL   1780
3                  Select Essential genes   1043
4                Delete Promiscuous genes    995


In [68]:
#See results for not selected data
count_not_selected_genes = not_selected_genes['Filter Name'].value_counts()
results_not_selected_genes = pd.DataFrame(count_not_selected_genes).reset_index()
results_not_selected_genes.columns = ['Filter', 'Count']
print(results_not_selected_genes)

          Filter  Count
0  OMCL in Tryps   2867
1   Essentiality    610
2  OMCL in Yeast    570


# Analysis of genes filter

In [70]:
results_analysis_yeast = pipeline_functions.data_intersection(gdi_inchikey_data,genes_data_yeast,'orf','gene_name','Merged data','orf')

In [71]:
results_analysis_yeast = results_analysis_yeast[0]
results_analysis_yeast = results_analysis_yeast[['gene_name','omcl','inchiKey_y']]
results_analysis_yeast = results_analysis_yeast.drop_duplicates()
results_analysis_yeast = results_analysis_yeast.dropna()

In [72]:
results_analysis_yeast_tryp = pipeline_functions.data_intersection(gdi_inchikey_data,genes_data_yeast_tryp,'orf','gene_name_y','Merged data','orf')

In [73]:
results_analysis_yeast_tryp = results_analysis_yeast_tryp[0]
results_analysis_yeast_tryp = results_analysis_yeast_tryp[['gene_name_y','omcl','inchiKey_y']]
results_analysis_yeast_tryp = results_analysis_yeast_tryp.drop_duplicates()
results_analysis_yeast_tryp = results_analysis_yeast_tryp.dropna()

In [74]:
print("All associations with ortholog")
print(len(results_analysis_yeast_tryp))
print("All genes with ortholog")
print(len(results_analysis_yeast_tryp['gene_name_y'].unique()))
print("All compounds with ortholog")
print(len(results_analysis_yeast_tryp['inchiKey_y'].unique()))

All associations with ortholog
88891
All genes with ortholog
1780
All compounds with ortholog
2422


In [75]:
results_analysis_yeast_tryp_essential = pipeline_functions.data_intersection(gdi_inchikey_data,genes_data_yeast_tryp_essential,'orf','gene_name_y','Merged data','orf')

In [76]:
results_analysis_yeast_tryp_essential = results_analysis_yeast_tryp_essential[0]
results_analysis_yeast_tryp_essential = results_analysis_yeast_tryp_essential[['gene_name_y','omcl','inchiKey_y']]
results_analysis_yeast_tryp_essential = results_analysis_yeast_tryp_essential.drop_duplicates()
results_analysis_yeast_tryp_essential = results_analysis_yeast_tryp_essential.dropna()

In [77]:
print("All essential associations")
print(len(results_analysis_yeast_tryp_essential))
print("All essential genes")
print(len(results_analysis_yeast_tryp_essential['gene_name_y'].unique()))
print("All essential compounds")
print(len(results_analysis_yeast_tryp_essential['inchiKey_y'].unique()))

All essential associations
53807
All essential genes
1043
All essential compounds
2188


In [78]:
results_analysis_yeast_tryp_essential_merged = pipeline_functions.data_intersection(gdi_inchikey_data,gene_selection,'orf','orf','Merged data','orf')

In [79]:
results_analysis_yeast_tryp_essential_merged = results_analysis_yeast_tryp_essential_merged[0]
results_analysis_yeast_tryp_essential_merged = results_analysis_yeast_tryp_essential_merged[['orf','inchiKey_y']]
results_analysis_yeast_tryp_essential_merged = results_analysis_yeast_tryp_essential_merged.drop_duplicates()
results_analysis_yeast_tryp_essential_merged = results_analysis_yeast_tryp_essential_merged.dropna()

In [80]:
print("All merged associations")
print(len(results_analysis_yeast_tryp_essential_merged))
print("All merged genes")
print(len(results_analysis_yeast_tryp_essential_merged['orf'].unique()))
print("All merged compounds")
print(len(results_analysis_yeast_tryp_essential_merged['inchiKey_y'].unique()))

All merged associations
40271
All merged genes
995
All merged compounds
1343


# Compounds filtering

In [81]:
# Select novel compounds
compounds_data_tested_intersection = pipeline_functions.data_not_in_intersection(gdi_inchikey_data,tested_compounds_data,'inchiKey','inchikey','Tested compound')
compounds_data_tested = compounds_data_tested_intersection[0]
compounds_data_tested = compounds_data_tested[['orf','inchiKey']]
compounds_data_tested = compounds_data_tested.drop_duplicates()
compounds_data_tested = compounds_data_tested.dropna()

In [82]:
# Select commercially available compounds
compounds_data_tested_available_intersection = pipeline_functions.data_intersection(gdi_inchikey_data,available_compounds_data,'inchiKey','inchikey','Commercially available','inchiKey')
compounds_data_tested_available = compounds_data_tested_available_intersection[0]
compounds_data_tested_available = compounds_data_tested_available[['orf','inchiKey','smiles']]
compounds_data_tested_available = compounds_data_tested_available.drop_duplicates()
compounds_data_tested_available = compounds_data_tested_available.dropna()

In [83]:
# Select drug like and lead like compounds
compounds_data_tested_available_druglike_function = pipeline_functions.drug_likness(compounds_data_tested_available,'smiles')
compounds_data_tested_available_druglike = compounds_data_tested_available_druglike_function[0]
compounds_data_tested_available_druglike = compounds_data_tested_available_druglike[['orf','inchiKey','smiles']]
compounds_data_tested_available_druglike = compounds_data_tested_available_druglike.drop_duplicates()
compounds_data_tested_available_druglike = compounds_data_tested_available_druglike.dropna()

In [84]:
# Set promiscuity threshold
umbral=20
drug_count = compounds_data_tested_available_druglike.groupby('inchiKey').nunique()
drug_count = drug_count.sort_values(by=['orf'])
drug_count = drug_count[drug_count['orf']<umbral]
drug_count = drug_count.unstack(level=-1).reset_index().drop('level_0', axis=1)
drug_count = drug_count.drop_duplicates()

drug_selection = pd.merge(left=gdi_inchikey_data,right=drug_count,how="inner",left_on=['inchiKey'], right_on=['inchiKey'])

In [85]:
#Create a df with deleted data
not_selected_compounds = pd.concat([compounds_data_tested_intersection[1], compounds_data_tested_available_intersection[1], compounds_data_tested_available_druglike_function[1]], ignore_index=False)

In [86]:
drug_selection.head(2)

Unnamed: 0,orf,inchiKey,0
0,YDR334W,ONWXNHPOAGOMTG-UHFFFAOYSA-N,14
1,YDR334W,ONWXNHPOAGOMTG-UHFFFAOYSA-N,1


In [87]:
#See results for selected data
full_dataset_length = gdi_inchikey_data['inchiKey'].nunique()
tested_length = compounds_data_tested['inchiKey'].nunique()
avaiable_length = compounds_data_tested_available['inchiKey'].nunique()
druglike_length = compounds_data_tested_available_druglike['inchiKey'].nunique()
delete_promiscuous_length = drug_selection['inchiKey'].nunique()

results_selected_compounds = pd.DataFrame({
    'Filter': ['Full Dataset Length', 'Delete tested compounds',
                'Select commercially available compounds',
                'Select druglike compounds', 'Delete Promiscuous compounds'],
    'Count': [full_dataset_length, tested_length, avaiable_length,
              druglike_length, delete_promiscuous_length]
})

print(results_selected_compounds)

                                    Filter  Count
0                      Full Dataset Length   2788
1                  Delete tested compounds   2739
2  Select commercially available compounds   1217
3                Select druglike compounds   1148
4             Delete Promiscuous compounds    767


In [88]:
#See results for not selected data
count_not_selected_compounds = not_selected_compounds['Filter Name'].value_counts()
results_not_selected_compounds = pd.DataFrame(count_not_selected_compounds).reset_index()
results_not_selected_compounds.columns = ['Filter', 'Count']
print(results_not_selected_compounds)

                   Filter  Count
0            Not Druglike  30123
1         Tested compound  24570
2  Commercially available   1571


# Analysis of compounds filter

In [89]:
results_analysis_tested = pipeline_functions.data_intersection(gdi_inchikey_data,compounds_data_tested,'inchiKey','inchiKey','Merged data','inchiKey')

In [90]:
results_analysis_tested = results_analysis_tested[0]
results_analysis_tested = results_analysis_tested[['orf_y','inchiKey']]
results_analysis_tested = results_analysis_tested.drop_duplicates()
results_analysis_tested = results_analysis_tested.dropna()

In [91]:
print("All novel associations")
print(len(results_analysis_tested))
print("All novel genes")
print(len(results_analysis_tested['orf_y'].unique()))
print("All novel compounds")
print(len(results_analysis_tested['inchiKey'].unique()))

All novel associations
247479
All novel genes
5808
All novel compounds
2739


In [92]:
results_analysis_tested_available = pipeline_functions.data_intersection(gdi_inchikey_data,compounds_data_tested_available,'inchiKey','inchiKey','Merged data','inchiKey')

In [93]:
results_analysis_tested_available = results_analysis_tested_available[0]
results_analysis_tested_available = results_analysis_tested_available[['orf_y','inchiKey']]
results_analysis_tested_available = results_analysis_tested_available.drop_duplicates()
results_analysis_tested_available = results_analysis_tested_available.dropna()

In [94]:
print("All available associations")
print(len(results_analysis_tested_available))
print("All available genes")
print(len(results_analysis_tested_available['orf_y'].unique()))
print("All available compounds")
print(len(results_analysis_tested_available['inchiKey'].unique()))

All available associations
52564
All available genes
5222
All available compounds
1217


In [95]:
compounds_data_tested_available_druglike = pipeline_functions.data_intersection(gdi_inchikey_data,compounds_data_tested_available_druglike,'inchiKey','inchiKey','Merged data','inchiKey')

In [96]:
compounds_data_tested_available_druglike = compounds_data_tested_available_druglike[0]
compounds_data_tested_available_druglike = compounds_data_tested_available_druglike[['orf_y','inchiKey']]
compounds_data_tested_available_druglike = compounds_data_tested_available_druglike.drop_duplicates()
compounds_data_tested_available_druglike = compounds_data_tested_available_druglike.dropna()

In [97]:
print("All druglike associations")
print(len(compounds_data_tested_available_druglike))
print("All druglike genes")
print(len(compounds_data_tested_available_druglike['orf_y'].unique()))
print("All druglike compounds")
print(len(compounds_data_tested_available_druglike['inchiKey'].unique()))

All druglike associations
46454
All druglike genes
5119
All druglike compounds
1148


# Data merge

This step is memory-intensive and can take a long time to complete.

Feel free to take a break or make a cup of coffee while it runs.

In [98]:
results = pipeline_functions.data_intersection(drug_selection,gene_selection,'inchiKey','inchiKey','Merged data','inchiKey')

In [99]:
results_selected = results[0]
results_selected = results_selected[['orf_x','inchiKey']]
results_selected = results_selected.rename(columns={'orf_x':'orf'}) 
results_selected = results_selected.drop_duplicates()
results_selected = results_selected.dropna()

In [100]:
print("All merged associations")
print(len(results_selected))
print("All merged genes")
print(len(results_selected['orf'].unique()))
print("All merged compounds")
print(len(results_selected['inchiKey'].unique()))

All merged associations
294
All merged genes
99
All merged compounds
211


In [101]:
results_selected.to_csv('pipeline_results/gdi_results.csv',index=False)

In [102]:
not_selected_data = pd.concat([results[1], not_selected_genes, not_selected_compounds], ignore_index=False)

In [103]:
not_selected_data = not_selected_data[['inchiKey','orf','Filter Name']]

In [104]:
#See results for not selected data
count_not_selected_data = not_selected_data['Filter Name'].value_counts()
results_not_selected_data = pd.DataFrame(count_not_selected_data).reset_index()
results_not_selected_data.columns = ['Filter', 'Count']
print(results_not_selected_data)

                   Filter  Count
0            Not Druglike  30123
1         Tested compound  24570
2           OMCL in Tryps   2867
3  Commercially available   1571
4            Essentiality    610
5           OMCL in Yeast    570
6             Merged data    556
