# PRIDE NLP
NLP NHS for drug information knowledge graph

## Imports

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import os
import gzip
import pylab as plt
import json
from nltk.corpus import gutenberg
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.manifold import TSNE
import collections
from string import punctuation
import requests
from bs4 import BeautifulSoup
from sklearn.preprocessing import OneHotEncoder
from pandas.io.json import json_normalize

Using TensorFlow backend.


## Data Preprocessing
Worked with files downloaded from:
http://ctdbase.org/downloads/;jsessionid=74BC65C36365CF4BCC193C670F9C670E#cd

In [36]:
#Chemical disease associations
#Fields:
#ChemicalName
#ChemicalID (MeSH identifier)
#CasRN (CAS Registry Number, if available)
#DiseaseName
#DiseaseID (MeSH or OMIM identifier)
#DirectEvidence ('|'-delimited list)
#InferenceGeneSymbol
#InferenceScore
#OmimIDs ('|'-delimited list)
#PubMedIDs ('|'-delimited list)

chemdisease = pd.read_csv('CTD_chemicals_diseases.csv.gz', compression='gzip',  skiprows=range(1, 29), 
                       names = ['ChemicalName',
                               'ChemicalID',
                               'CasRN',
                               'DiseaseName',
                               'DiseaseID',
                               'DirectEvidence',
                               'InferenceGeneSymbol',
                               'InferenceScore',
                               'OmimIDs',
                               'PubMedIDs'])
chemdisease = chemdisease[1:]
chemdisease.head()

Unnamed: 0,ChemicalName,ChemicalID,CasRN,DiseaseName,DiseaseID,DirectEvidence,InferenceGeneSymbol,InferenceScore,OmimIDs,PubMedIDs
1,06-Paris-LA-66 protocol,C046983,,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,therapeutic,,,,4519131
2,10074-G5,C534883,,Adenocarcinoma,MESH:D000230,,MYC,4.08,,26432044
3,10074-G5,C534883,,Adenocarcinoma of Lung,MESH:D000077192,,MYC,4.32,,26656844|27602772
4,10074-G5,C534883,,Alopecia,MESH:D000505,,AR,4.51,,15902657
5,10074-G5,C534883,,Androgen-Insensitivity Syndrome,MESH:D013734,,AR,6.86,300068|312300,1303262|8281139


In [37]:
#limit to only parkinson
parkinsongenesdf = chemdisease[chemdisease['DiseaseName'].str.contains('parkinson', flags=re.IGNORECASE, regex=True)].reset_index(drop=True)
#drop fields we aren't interested in
parkinsongenesdf.drop(['CasRN', 'DirectEvidence', 'InferenceGeneSymbol', 
                       'InferenceScore', 'OmimIDs','PubMedIDs'], axis = 1, inplace = True)
parkinsongenesdf.head()

Unnamed: 0,ChemicalName,ChemicalID,DiseaseName,DiseaseID
0,"10,11-dihydro-10-hydroxycarbamazepine",C039775,Parkinson Disease,MESH:D010300
1,"10,11-dihydro-10-hydroxycarbamazepine",C039775,"Parkinson Disease, Secondary",MESH:D010302
2,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,C476756,Parkinson Disease,MESH:D010300
3,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,C476756,Parkinson Disease,MESH:D010300
4,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,C476756,Parkinson Disease,MESH:D010300


In [5]:
#Chemical–GO enriched associations
#Fields:
#ChemicalName
#ChemicalID (MeSH identifier)
#CasRN (CAS Registry Number, if available)
#Ontology
#GOTermName
#GOTermID
#HighestGOLevel
#PValue
#CorrectedPValue
#TargetMatchQty
#TargetTotalQty
#BackgroundMatchQty
#BackgroundTotalQty

chemenrich = pd.read_csv('CTD_chemicals_diseases.csv.gz', compression='gzip',  skiprows=range(1, 29), 
                       names = ['ChemicalName',
                               'ChemicalID',
                               'CasRN',
                               'Ontology',
                               'GOTermName',
                               'GOTermID',
                               'HighestGOLevel',
                               'PValue',
                               'CorrectedPValue',
                               'TargetMatchQty',
                               'TargetTotalQty',
                               'BackgroundMatchQty',
                               'BackgroundTotalQty'])
chemenrich = chemenrich[1:]
chemenrich.head()

Unnamed: 0,ChemicalName,ChemicalID,CasRN,Ontology,GOTermName,GOTermID,HighestGOLevel,PValue,CorrectedPValue,TargetMatchQty,TargetTotalQty,BackgroundMatchQty,BackgroundTotalQty
1,06-Paris-LA-66 protocol,C046983,,Precursor Cell Lymphoblastic Leukemia-Lymphoma,MESH:D054198,therapeutic,,,,4519131,,,
2,10074-G5,C534883,,Adenocarcinoma,MESH:D000230,,MYC,4.08,,26432044,,,
3,10074-G5,C534883,,Adenocarcinoma of Lung,MESH:D000077192,,MYC,4.32,,26656844|27602772,,,
4,10074-G5,C534883,,Alopecia,MESH:D000505,,AR,4.51,,15902657,,,
5,10074-G5,C534883,,Androgen-Insensitivity Syndrome,MESH:D013734,,AR,6.86,300068|312300,1303262|8281139,,,


In [38]:
#limit to only parkinson
parkinsonendf = chemenrich[chemenrich['Ontology'].str.contains('parkinson', flags=re.IGNORECASE, regex=True)].reset_index(drop=True)
parkinsonendf.drop(['CasRN', 'GOTermID', 'HighestGOLevel', 'CorrectedPValue', 
                    'TargetTotalQty', 'BackgroundMatchQty','BackgroundTotalQty'], axis = 1, inplace = True)
parkinsonendf.head()

Unnamed: 0,ChemicalName,ChemicalID,Ontology,GOTermName,PValue,TargetMatchQty
0,"10,11-dihydro-10-hydroxycarbamazepine",C039775,Parkinson Disease,MESH:D010300,4.27,20558393
1,"10,11-dihydro-10-hydroxycarbamazepine",C039775,"Parkinson Disease, Secondary",MESH:D010302,4.56,26457621
2,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,C476756,Parkinson Disease,MESH:D010300,17.9,28215578
3,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,C476756,Parkinson Disease,MESH:D010300,17.9,21318773
4,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,C476756,Parkinson Disease,MESH:D010300,17.9,17188257


In [45]:
#Gene vocabulary
#GeneSymbol
#GeneName
#GeneID (NCBI Gene identifier)
#AltGeneIDs (alternative NCBI Gene identifiers; '|'-delimited list)
#Synonyms ('|'-delimited list)
#BioGRIDIDs ('|'-delimited list)
#PharmGKBIDs ('|'-delimited list)
#UniprotIDs ('|'-delimited list)

genesdf = pd.read_csv('CTD_genes.csv.gz', compression='gzip', skiprows=range(1, 29),
                      names = ['GeneSymbol',
                               'GeneName',
                               'GeneID',
                               'AltGeneIDs',
                               'Synonyms',
                               'BioGRIDIDs',
                               'PharmGKBIDs',
                               'UniprotIDs'])
genesdf = genesdf[1:]
genesdf.drop(['AltGeneIDs','Synonyms','BioGRIDIDs',
                           'PharmGKBIDs','UniprotIDs'], axis = 1, inplace = True)
genesdf.head()

Unnamed: 0,GeneSymbol,GeneName,GeneID
1,03B03F,"DNA segment, 03B03F (Research Genetics)",27777.0
2,03B03R,"DNA segment, 03B03R (Research Genetics)",27778.0
3,03.MMHAP34FRA.SEQ,"DNA segment, 03.MMHAP34FRA.seq",53288.0
4,064YA,,5658107.0
5,102G4T7,"DNA segment, 102g4T7",56573.0


In [43]:
#Chemical–gene interaction
#Fields:
#ChemicalName
#ChemicalID (MeSH identifier)
#CasRN (CAS Registry Number, if available)
#GeneSymbol
#GeneID (NCBI Gene identifier)
#GeneForms ('|'-delimited list)
#Organism (scientific name)
#OrganismID (NCBI Taxonomy identifier)
#Interaction
#InteractionActions ('|'-delimited list)
#PubMedIDs ('|'-delimited list)

chemgene = pd.read_csv('CTD_chem_gene_ixns.csv.gz', compression='gzip', skiprows=range(1, 29), 
                       names = ['ChemicalName',
                               'ChemicalID',
                               'CasRN',
                               'GeneSymbol',
                               'GeneID',
                               'GeneForms',
                               'Organism',
                               'OrganismID',
                               'Interaction',
                               'InteractionActions',
                               'PubMedIDs'])
chemgene = chemgene[1:]
chemgene.drop(['CasRN','PubMedIDs'], axis = 1, inplace = True)
chemgene.head()

Unnamed: 0,ChemicalName,ChemicalID,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions
1,10074-G5,C534883,AR,367.0,protein,Homo sapiens,9606.0,10074-G5 affects the reaction [MYC protein res...,affects^reaction|increases^expression
2,10074-G5,C534883,AR,367.0,protein,Homo sapiens,9606.0,10074-G5 inhibits the reaction [EPHB2 protein ...,decreases^reaction|increases^expression
3,10074-G5,C534883,AR,367.0,protein,Homo sapiens,9606.0,10074-G5 results in decreased expression of AR...,decreases^expression
4,10074-G5,C534883,AR,367.0,protein,Homo sapiens,9606.0,10074-G5 results in decreased expression of AR...,decreases^expression
5,10074-G5,C534883,EPHB2,2048.0,protein,Homo sapiens,9606.0,10074-G5 inhibits the reaction [EPHB2 protein ...,decreases^reaction|increases^expression


In [11]:
#Top Chemical–gene interaction types: 
# CTD curates chemical–gene and –protein interactions in vertebrates and invertebrates using this hierarchical vocabulary of interaction types
#Fields:
#TypeName
#Code
#Description
#ParentCode

geneinteractiontypes = pd.read_csv('CTD_chem_gene_ixn_types.csv')
geneinteractiontypes.head()

Unnamed: 0,# TypeName,Code,Description,ParentCode
0,abundance,abu,The abundance of a chemical (if chemical synth...,
1,activity,act,An elemental function of a molecule.,
2,binding,b,A molecular interaction.,
3,cotreatment,w,Involving the use of two or more chemicals sim...,
4,expression,exp,The expression of a gene product.,


## merge and create dfs

Merge and create dfs with information that may be useful for knowledge graph:
gene, chemical, protein

In [44]:
#merge parkinsongenesdf and chemgene for Parkinson's
chem_df_merged = parkinsongenesdf.merge(chemgene, on=['ChemicalID','ChemicalName'], how='left')
chem_df_merged.head()

Unnamed: 0,ChemicalName,ChemicalID,DiseaseName,DiseaseID,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions
0,"10,11-dihydro-10-hydroxycarbamazepine",C039775,Parkinson Disease,MESH:D010300,ABCB1,5243.0,protein,Homo sapiens,9606.0,ABCB1 protein results in increased transport o...,increases^transport
1,"10,11-dihydro-10-hydroxycarbamazepine",C039775,Parkinson Disease,MESH:D010300,CYP2C19,1557.0,protein,,,"10,11-dihydro-10-hydroxycarbamazepine results ...",decreases^activity
2,"10,11-dihydro-10-hydroxycarbamazepine",C039775,"Parkinson Disease, Secondary",MESH:D010302,ABCB1,5243.0,protein,Homo sapiens,9606.0,ABCB1 protein results in increased transport o...,increases^transport
3,"10,11-dihydro-10-hydroxycarbamazepine",C039775,"Parkinson Disease, Secondary",MESH:D010302,CYP2C19,1557.0,protein,,,"10,11-dihydro-10-hydroxycarbamazepine results ...",decreases^activity
4,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,C476756,Parkinson Disease,MESH:D010300,BAX,581.0,protein,Mus musculus,10090.0,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,decreases^reaction|increases^expression


In [47]:
#Add gene name field
chemgene_df_merged = chem_df_merged.merge(genesdf, on=['GeneID','GeneSymbol'], how='left')
chemgene_df_merged.head()

Unnamed: 0,ChemicalName,ChemicalID,DiseaseName,DiseaseID,GeneSymbol,GeneID,GeneForms,Organism,OrganismID,Interaction,InteractionActions,GeneName
0,"10,11-dihydro-10-hydroxycarbamazepine",C039775,Parkinson Disease,MESH:D010300,ABCB1,5243.0,protein,Homo sapiens,9606.0,ABCB1 protein results in increased transport o...,increases^transport,ATP binding cassette subfamily B member 1
1,"10,11-dihydro-10-hydroxycarbamazepine",C039775,Parkinson Disease,MESH:D010300,CYP2C19,1557.0,protein,,,"10,11-dihydro-10-hydroxycarbamazepine results ...",decreases^activity,cytochrome P450 family 2 subfamily C member 19
2,"10,11-dihydro-10-hydroxycarbamazepine",C039775,"Parkinson Disease, Secondary",MESH:D010302,ABCB1,5243.0,protein,Homo sapiens,9606.0,ABCB1 protein results in increased transport o...,increases^transport,ATP binding cassette subfamily B member 1
3,"10,11-dihydro-10-hydroxycarbamazepine",C039775,"Parkinson Disease, Secondary",MESH:D010302,CYP2C19,1557.0,protein,,,"10,11-dihydro-10-hydroxycarbamazepine results ...",decreases^activity,cytochrome P450 family 2 subfamily C member 19
4,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,C476756,Parkinson Disease,MESH:D010300,BAX,581.0,protein,Mus musculus,10090.0,10-(6'-ubiquinonyl)decyltriphenylphosphonium b...,decreases^reaction|increases^expression,"BCL2 associated X, apoptosis regulator"


## Knowledge graph

In [48]:
# https://towardsdatascience.com/creating-knowledge-graphs-from-resumes-and-traver-56016426f4fb
# https://networkx.org/documentation/stable/tutorial.html
# https://towardsdatascience.com/from-dataframe-to-network-graph-bbb35c8ab675

import networkx as nx
G = nx.Graph()

In [None]:
edge_dict = {}
edge_dict['Mathew'] = languages_mathew
# create a directed-graph from a dataframe
G=nx.from_dict_of_lists(edge_dict,create_using=nx.MultiDiGraph())
plt.figure(figsize=(12,12))
pos = nx.spring_layout(G)
nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos, node_size = 4500)
plt.show()

edge_dict = {}

edge_dict[names[0]] = languages1
edge_dict[names[1]] = languages2
edge_dict[names[2]] = languages2

G=nx.from_dict_of_lists(edge_dict,create_using=nx.MultiDiGraph())
plt.figure(figsize=(12,12))

pos = nx.circular_layout(G)
nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos, node_size = 4500, font_size=18)
plt.show()

In [None]:
G= nx.from_pandas_edgelist(chemgene_df_merged, "DiseaseName", "ChemicalName", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

In [None]:
from matplotlib.pyplot import figure
figure(figsize=(10, 8))
nx.draw_shell(G, with_labels=True)