Identify relationships between Prostate cancer associated SNPs and genes

In [1]:
# Load the required packages
from neo4j import GraphDatabase
import pandas as pd

In [None]:
# Genomic thresholds shared by the analysis.
# NOTE(review): meanings are inferred from the names only - confirm before relying on them.
delta_bp = 10 ** 6   # presumably a distance in base pairs
FC_cutoff = 1.5      # presumably a fold-change threshold
p_value = 1e-4       # presumably a significance threshold

In [103]:
# Open a Bolt connection to the Neo4j instance on localhost and start the
# session that the query cells further down reuse.
bolt_uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(bolt_uri)
session = driver.session()

In [3]:
# Read in the data of Farashi et al. that is used as a reference set
# (header = 1: the column names are taken from the second row of the sheet)
farashi = pd.read_excel("Input sets/Farashi/41568_2018_87_MOESM1_ESM-3.xls", header = 1)

# Remove the empty rows at the bottom: only rows holding both a SNP id and a
# gene are kept. The explicit copy makes the result an independent frame, so
# assigning to its columns afterwards does not raise SettingWithCopyWarning.
has_snp = farashi["SNP ID"].notnull()
has_gene = farashi["Target/assigned/e-Gene"].notnull()
farashi = farashi[has_snp & has_gene].copy()

In [4]:
# Replace fault-inducing postfixes for snp data.
# The cleaned ids are written back through an explicit assignment: calling
# replace(..., inplace = True) on the result of farashi["SNP ID"] is a chained
# in-place operation, which warns on recent pandas and silently leaves the
# frame untouched when Copy-on-Write is active.
cleaned_snp_ids = farashi["SNP ID"].str.strip().replace("(_A)|(_C)", "", regex = True)
farashi = farashi.assign(**{"SNP ID": cleaned_snp_ids})

# Remove entries without valid rs identifiers (ids that start with "chr").
# na = False treats a missing id as "does not start with chr", which keeps the
# mask strictly boolean; the copy keeps later column assignments warning-free.
starts_with_chr = farashi["SNP ID"].str.startswith("chr", na = False)
farashi = farashi[~starts_with_chr].copy()

Not all gene names appear to be correct, and some could therefore not be found in PathwayStudio. These were searched on https://www.ncbi.nlm.nih.gov/gene/ to identify their updated names.

In [5]:
# Create separate entries for cells with delimited genes: every ';'-separated
# name in "Target/assigned/e-Gene" gets its own row, with the single name
# stored in the new "gene" column
farashi = farashi.assign(gene=farashi["Target/assigned/e-Gene"].str.split(';')).explode('gene')

# Splitting can leave padding around the individual names (e.g. "A; B"),
# which would prevent exact matches against the replacement dictionary below
farashi["gene"] = farashi["gene"].str.strip()

# Some gene names are outdated/erroneous. These are manually mapped to correct genes as based on ncbi.org/gene
gene_replacement_dictionary = { "MSMB1" : "MSMB",
                                "MSMB2" : "MSMB",
                                "NCOA4-1" : "NCOA4P1",
                                "NCOA4-3" : "NCOA4P3",
                                "ANKRD5" : "ANKEF1",
                                "C6orf228" : "SMIM13",
                                "c-MYC, FoxA1 binding" : "MYC",
                                "HoxB13" : "HOXB13",
                                "LASS2" : "CERS2",
                                "C10orf32" : "BORCS7",
                                "LOC100505761" : "RPARP-AS1",
                                "LOC100505495" : "PCAT19",
                                "WDR52" : "CFAP44",
                                "HCG4P6" : "HCG4B",
                                "LOC285830" : "HLA-F-AS1",
                                "RAB7L1" : "RAB29",
                                "LOC284578" : "MFSD4A-AS1",
                                "AGAP7" : "AGAP7P",
                                "C2orf43" : "LDAH"}

# Replace whole-cell matches in the original column (unchanged behaviour) ...
farashi["Target/assigned/e-Gene"] = farashi["Target/assigned/e-Gene"].replace(gene_replacement_dictionary)
# ... and also in the per-gene column, so that names which only occur inside a
# delimited cell are corrected as well
farashi["gene"] = farashi["gene"].replace(gene_replacement_dictionary)

# Strip whitespaces
farashi["SNP ID"] = farashi["SNP ID"].str.strip()

In [6]:
# TODO: Show the number of genes and SNPs per study

In [None]:
# Identify negatives that lie within a certain range
import pyensembl

In [7]:
# Labels for entries that are based on a single eQTL study, or that are found
# in the exon/coding region. This cell only defines the label lists; no rows
# are removed here.
eQTL_studies = [
    "Dadaev T. et al. 2018",
    "Grisanzio, C. et al.  2012",
    "X. xu et al. 2014",
    "Thibodeau S.N.  et al. 2015",
]
coding_snps = [
    "Coding region",
    "exonic",
]

In [88]:
# Fetch every node whose name equals one of the target genes.
# The names travel as a query parameter instead of being pasted into the
# Cypher text as a Python list repr, so quotes or backslashes inside a gene
# name cannot break (or alter) the query.
node_gene_names = list(set(farashi["Target/assigned/e-Gene"]))
nodes = session.run("Match (n) WHERE n.name IN $gene_names RETURN n",
                    parameters={"gene_names": node_gene_names}).value()

In [9]:
# Get the SNP properties from dbsnp
import myvariant

# Each distinct SNP id is queried once; the ids are matched on the
# 'dbsnp.rsid' scope and only the 'dbsnp' field is requested
unique_snp_ids = list(set(farashi["SNP ID"]))
mv = myvariant.MyVariantInfo()
dbsnp = mv.querymany(unique_snp_ids, scopes='dbsnp.rsid', fields='dbsnp', returnall = True)

querying 1-1000...done.
querying 1001-1140...done.
Finished.
459 input query terms found dup hits:
	[('rs17882350', 2), ('rs13190729', 2), ('rs76837457', 2), ('rs73106451', 2), ('rs28749132', 2), ('rs
50 input query terms found no hit:
	['rs527768094', 'rs397887654', 'rs139078838', 'rs572943237', 'rs5875234', 'rs527588882', 'rs57334129


In [10]:
# Validate the mappings

In [12]:
# Extract the gene expression data

In [127]:
# Get the incoming and outgoing paths of the genes: every relationship, in
# either direction, between a target gene node and a Protein node.
# - The gene names are de-duplicated and passed as a query parameter rather
#   than concatenated into the Cypher text (no quoting problems, smaller query).
# - size() replaces length(): Neo4j 4+ only accepts length() on paths.
#   NOTE(review): assumes r.pmid is a list or a string - confirm against the graph.
direct_query = ("Match p = (n)-[r]-(x:Protein)"
                " WHERE n.name IN $gene_names"
                " RETURN startnode(r).name AS start,"
                " type(r) AS relationship,"
                " r.pmid AS pmid,"
                " size(r.pmid) AS npmid,"
                " endnode(r).name AS end")
direct_gene_names = list(set(farashi["Target/assigned/e-Gene"]))
direct = pd.DataFrame(session.run(direct_query,
                                  parameters={"gene_names": direct_gene_names}).data())

In [118]:
# Get the indirect outgoing paths: target gene -> Protein -> Protein.
# - The gene names are passed as a query parameter rather than concatenated
#   into the Cypher text, so special characters in a name cannot break the query.
# - size() replaces length(): Neo4j 4+ only accepts length() on paths.
#   NOTE(review): assumes the pmid properties are lists or strings - confirm.
outgoing_query = ("Match p = (n)-[r]->(x:Protein)-[r2]->(y:Protein)"
                  " WHERE n.name IN $gene_names"
                  " RETURN startnode(r).name AS start,"
                  " type(r) AS relationship1,"
                  " r.pmid AS pmid1,"
                  " size(r.pmid) AS npmid1,"
                  " endnode(r).name AS middle,"
                  " type(r2) AS relationship2,"
                  " r2.pmid AS pmid2,"
                  " size(r2.pmid) AS npmid2,"
                  " endnode(r2).name AS end")
outgoing_gene_names = list(set(farashi["Target/assigned/e-Gene"]))
outgoing = pd.DataFrame(session.run(outgoing_query,
                                    parameters={"gene_names": outgoing_gene_names}).data())

In [123]:
# Get the indirect incoming paths: Protein -> Protein -> target gene.
# - The result is stored as `incoming`; it was previously assigned to
#   `outgoing`, which overwrote the outgoing-path table with incoming paths.
# - The gene names are passed as a query parameter rather than concatenated
#   into the Cypher text, so special characters in a name cannot break the query.
# - size() replaces length(): Neo4j 4+ only accepts length() on paths.
#   NOTE(review): assumes the pmid properties are lists or strings - confirm.
incoming_query = ("Match p = (x:Protein)-[r]->(y:Protein)-[r2]->(n)"
                  " WHERE n.name IN $gene_names"
                  " RETURN startnode(r).name AS start,"
                  " type(r) AS relationship1,"
                  " r.pmid AS pmid1,"
                  " size(r.pmid) AS npmid1,"
                  " endnode(r).name AS middle,"
                  " type(r2) AS relationship2,"
                  " r2.pmid AS pmid2,"
                  " size(r2.pmid) AS npmid2,"
                  " endnode(r2).name AS end")
incoming_gene_names = list(set(farashi["Target/assigned/e-Gene"]))
incoming = pd.DataFrame(session.run(incoming_query,
                                    parameters={"gene_names": incoming_gene_names}).data())

In [None]:
## Get shapes
# Triangles query: counts the distinct paths gene - Protein - Protein - gene
# TODO: add NOT EXISTS clauses
# TODO: return the nodes so that unique combinations can be filtered
triangle_template = "Match p = (n)--(i:Protein)--(i2:Protein)--(n) WHERE n.name = '{}' RETURN COUNT(DISTINCT(p))"
triangles = triangle_template.format('KLRK1')

# Draft queries for further shapes; left commented out because the gene name
# operand between the two '+' signs still has to be filled in
#squares = "Match p = (n)--(i:Protein)--(i2:Protein)--(i3:Protein)--(n) WHERE n.name = '" +  + "' RETURN COUNT(p)"
#diagonalSquares = "Match p = (n)-[r]-(i:Protein)-[r2]-(i2:Protein)-[r3]-(i3:Protein)-[r4]-(n)-[r5]-(i2:Protein) WHERE n.name = '" +  + "' RETURN COUNT(p)"
#diagonalSquares2 = "Match p = (i3:Protein)-[r5]-(n)-[r]-(i:Protein)-[r2]-(i2:Protein)-[r3]-(i3:Protein)-[r4]-(i:Protein) WHERE n.name = '" +  + "' RETURN COUNT(p)"

# Submit the triangle query; the returned result object is kept in `test`
test = session.run(triangles)

In [None]:
# Calculate network statistics such as eigenvector centrality 