# Extract protein-protein interaction networks to be used in various network analysis tools

In [6]:
# Load the required packages
import numpy as np
import pandas as pd
from neo4j import GraphDatabase
# Show all columns and rows, and up to 2000 characters per cell, when a dataframe is displayed
for display_option, option_value in (('display.max_columns', None),
                                     ('display.max_rows', None),
                                     ('display.max_colwidth', 2000)):
    pd.set_option(display_option, option_value)

In [8]:
# Connect to the graph database over Bolt and open the session that is used to query it
bolt_uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(bolt_uri)
session = driver.session()

In [9]:
# Thresholds that are used further down to call differential expression and to filter the network
min_expression = 21    # minimal value of the "freq" column (samples with at least one read) to keep an edge
FC_cutoff = 1.5        # fold change threshold: overexpressed at >= FC_cutoff, underexpressed at <= 1/FC_cutoff
FDR_cutoff = 1e-4      # maximal FDR for a gene to be called differentially expressed

## Load the gene expression data which can be used to filter the network

In [4]:
# Extract the gene expression data: the design matrix (its "samplename" column names the sample
# columns of the read matrix) and the read counts per gene
cols = pd.read_csv("Input sets/Gene expression data/design_matrix_prostate_unpaired.txt", delimiter = "\t")
reads = pd.read_csv("Input sets/Gene expression data/expression_matrix_prostate_clean.txt", delimiter = "\t")
# Read in the differential expression calculations
dex = pd.read_csv("Input sets/Gene expression data/Galaxy37-[edgeR_DGE_on_2__design_matrix_prostate_unpaired.txt_-_differentially_expressed_genes].tabular.annotated.txt", 
                  delimiter = "\t", index_col = 0, names = ["ENSEMBL", "gene", "logFC", "logCPM", "LR", "pValue", "FDR"], header = 0)
# Strip the version postfix (a dot followed by digits) from the identifiers. The result is assigned
# back to the column: a chained in-place replace on dex["ENSEMBL"] does not update dex itself when
# pandas runs with copy-on-write. The pattern is a raw string so that "\." and "\d" are not
# treated as (invalid) Python escape sequences
dex["ENSEMBL"] = dex["ENSEMBL"].replace(r"(\.\d+)", "", regex = True)
# We know that the logFC is a 2 log. However, I prefer absolute values rather than logs
dex["FC"] = 2**dex["logFC"]

# Assign differential expression to the genes based on the previously set parameters
dex["Diff expression"] = "Unchanged"
significant = dex["FDR"] <= FDR_cutoff
dex.loc[significant & (dex["FC"] <= 1/FC_cutoff), "Diff expression"] = "Underexpressed"
dex.loc[significant & (dex["FC"] >= FC_cutoff), "Diff expression"] = "Overexpressed"

# Get the raw reads data
# total: sum over all numeric columns of a gene; freq: number of samples with at least one read
reads["total"] = reads.sum(axis = 1, numeric_only = True)
reads["freq"] = (reads[list(cols["samplename"])] > 0).sum(axis = 1)
reads["gene_ids"] = reads["gene_ids"].replace(r"(\.\d+)", "", regex = True)

# Outer merge: genes that occur in only one of the two tables are kept as well
reads = reads.merge(dex, left_on = "gene_ids", right_on = "ENSEMBL", how = "outer")

We also extract the location and other properties of the genes, to filter out the non-protein coding and mitochondrial genes.

In [5]:
# Initiate the connection to the database (first time you may have to run ensembl.download() and ensembl.index())
# To align with our reference set, we take an Ensembl version from 2019
# NOTE(review): the release pinned below is 92, which appears to date from 2018 rather than 2019;
# please confirm that this is the release the reference set was built with
from pyensembl import EnsemblRelease
ensembl = EnsemblRelease(92)

# Function to extract the data (and prevent superfluous queries)
def getEnsemblData(id):
    """Look up a gene identifier in the loaded Ensembl release.

    Returns a pandas Series with the entries "gene name", "chromosome",
    "gene start", "gene stop" and "protein_coding". When the lookup raises
    a ValueError, all five entries are None.
    """
    fields = ("gene name", "chromosome", "gene start", "gene stop", "protein_coding")
    try:
        gene = ensembl.gene_by_id(id)
        values = (gene.gene_name, gene.contig, gene.start, gene.end, gene.is_protein_coding)
    except ValueError:
        # The identifier could not be resolved: return an empty record with the same layout
        values = (None,) * len(fields)
    return pd.Series(dict(zip(fields, values)))

In [6]:
# Remove the postfixes (a dot followed by digits) of the gene identifiers. The result is assigned
# back to the column, because a chained in-place replace does not update reads under copy-on-write
reads["gene_ids"] = reads["gene_ids"].replace(r"\.\d+", "", regex = True)
# getEnsemblData returns one Series per gene; apply spreads its five entries over the five new columns
reads[["gene name", "chromosome", "gene start", "gene stop", "protein_coding"]] = reads["gene_ids"].apply(getEnsemblData)

In [7]:
# PathwayStudio primarily contains protein coding gene entries. All non-protein coding entries are therefore removed
is_protein_coding = reads["protein_coding"].eq(True)

# Drop the mitochondrial genes and the entries that could not be found (these have no chromosome)
has_chromosome = reads["chromosome"].notna()
is_mitochondrial = reads["chromosome"].eq("MT")

# One boolean selection followed by a copy, instead of an in-place drop on a filtered slice:
# the result is an independent dataframe that can safely be modified later on
reads = reads[is_protein_coding & has_chromosome & ~is_mitochondrial].copy()

# Only removes 3897 edges, so perhaps not worthwhile

In [10]:
# Extract all the protein-protein interaction data along with some other info
# size() is used to count the entries of r.pmid: Neo4j 4 and later no longer accept length() for
# anything but paths
ppi_query = ("MATCH (p:Protein)-[r]->(p2:Protein) "
             "RETURN DISTINCT p.`Ensembl ID` AS ENSEMBL1, type(r) AS predicate, r.pmid AS PMID, "
             "size(r.pmid) AS nPMID, p2.`Ensembl ID` AS ENSEMBL2")
PPI = pd.DataFrame(session.run(ppi_query).data())
# A protein can carry several Ensembl identifiers: give every combination its own row. The index
# is renumbered afterwards, because explode repeats the index label of the original row
PPI = PPI.explode("ENSEMBL1").explode("ENSEMBL2").reset_index(drop = True)
# Strip the version postfix (a dot followed by digits); assigned back rather than replaced in place,
# because a chained in-place replace does not update PPI under copy-on-write
for ensembl_column in ("ENSEMBL1", "ENSEMBL2"):
    PPI[ensembl_column] = PPI[ensembl_column].replace(r"\.\d+", "", regex = True)
# Only keep the interactions of which both genes are present in the (filtered) expression data
PPI = PPI[PPI["ENSEMBL1"].isin(reads["gene_ids"]) & PPI["ENSEMBL2"].isin(reads["gene_ids"])]
# The PMID column is left out of the comparison, only its count (nPMID) is used
PPI = PPI.drop_duplicates(['ENSEMBL1', 'predicate', 'nPMID', 'ENSEMBL2'], keep = "first")

NameError: name 'reads' is not defined

In [9]:
# Add the expression data ("freq") of both interaction partners to the PPI information.
# The expression table is renamed per side, so that the merge directly yields "freq 1" / "freq 2"
# and no helper identifier columns have to be dropped afterwards
expression = reads[["gene_ids", "freq"]]
for side in ("1", "2"):
    side_expression = expression.rename(columns = {"gene_ids" : "ENSEMBL" + side, "freq" : "freq " + side})
    PPI = PPI.merge(side_expression, on = "ENSEMBL" + side, how = "inner")

In [10]:
# Write away the data so that it can be reloaded quicker in the future
# The file is written relative to the working directory, like every other file in this notebook,
# so that it ends up at the location from which the pre-extracted data is loaded again
PPI.to_csv("Complete set of protein-protein interactions extracted from PathwayStudio.csv", index = False)

In [3]:
# Load the pre-extracted data (allows skipping the database extraction above)
ppi_csv = "Complete set of protein-protein interactions extracted from PathwayStudio.csv"
PPI = pd.read_csv(ppi_csv)

In [4]:
# Create undirected PPI interaction network, both unfiltered and filtered
# Create a dictionary that maps every ENSEMBL identifier onto an integer node id, starting at 1.
# The identifiers are sorted first so that the mapping is identical on every run: the iteration
# order of a set of strings (and with it the assigned ids) differs between Python sessions
ensembl_ids = sorted(set(PPI["ENSEMBL1"]).union(PPI["ENSEMBL2"]))
ensembl_dict = {ensembl_id : node_id for node_id, ensembl_id in enumerate(ensembl_ids, start = 1)}
# Every identifier is a key of the dictionary, so a plain lookup suffices. The result is assigned
# back to the column instead of using a chained in-place replace
PPI["ENSEMBL1"] = PPI["ENSEMBL1"].map(ensembl_dict)
PPI["ENSEMBL2"] = PPI["ENSEMBL2"].map(ensembl_dict)

# Keep the complete table (all columns, possibly several rows per protein pair) for later use
PPI_backup = PPI.copy()

# Collapse to one row per (ENSEMBL1, ENSEMBL2) pair with the summed number of PMIDs.
# The pairs are unique after grouping, so no further deduplication is needed
PPI = PPI.groupby(["ENSEMBL1", "ENSEMBL2"])["nPMID"].sum().to_frame().reset_index()

In [5]:
# Write away the mapping file for all graph files
import json

with open('ENSEMBL_mappings.json', 'w') as mapping_file:
    mapping_file.write(json.dumps(ensembl_dict))
# To load the mapping again:
#     with open('ENSEMBL_mappings.json') as mapping_file:
#         ensembl_dict = json.load(mapping_file)

In [5]:
# Create an undirected copy of the data: the two node ids of every edge are put in ascending
# order, so that A-B and B-A become the same row, after which every edge is kept once
edge_columns = ["ENSEMBL1", "ENSEMBL2"]
sorted_pairs = np.sort(PPI[edge_columns].to_numpy(), axis = 1)
undirected = pd.DataFrame(sorted_pairs, columns = edge_columns, index = PPI.index).drop_duplicates()

In [6]:
### Node2Vec
# Weighted edge list: the pairs with their summed PMID count, leaving out the pairs with a count of 0
has_pmid = PPI['nPMID'] != 0
Node2Vec = PPI.loc[has_pmid, ["ENSEMBL1", "ENSEMBL2", "nPMID"]]

# Both edge lists are space separated and written without header and without index column
edge_list_format = {"index" : False, "sep" : " ", "header" : False}
Node2Vec.to_csv("Node2vec/unfiltered protein protein interactions with weights.csv", **edge_list_format)
undirected.to_csv("Node2vec/unfiltered protein protein interactions without weights.csv", **edge_list_format)

In [None]:
### Struc2vec
# Space separated undirected edge list, without header and without index column
struc2vec_path = "Struc2vec/unfiltered protein protein interactions without weights.csv"
undirected.to_csv(struc2vec_path, sep = " ", header = False, index = False)

In [7]:
### DIAMOND & Network statistics
# Comma separated undirected edge list, without header and without index column
diamond_path = "unfiltered_protein_protein_interactions.csv"
undirected.to_csv(diamond_path, header = False, index = False)

In [19]:
# EVOKE needs the number of nodes and edges as first parameters, so these are calculated here
# and put in front of the edge list as an extra first row
n_nodes = len(set(undirected["ENSEMBL1"]).union(undirected["ENSEMBL2"]))
n_edges = len(undirected)
counts_row = pd.DataFrame({"ENSEMBL1" : [n_nodes], "ENSEMBL2" : [n_edges]})
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0 and later
# NOTE: undirected itself starts with the counts row from here on, so running this cell a second
# time would add a second counts row
undirected = pd.concat([counts_row, undirected])
undirected.to_csv("EVOKE/unfiltered_protein_protein_interactions_EVOKE.csv", index = False, header = False, sep = " ")

In [17]:
# Filter the networks: removes 24618 edges, i.e. 6.4% of the total number of edges. Perhaps not worthwhile
# PPI_backup can hold several rows for the same protein pair, while PPI has exactly one row per
# pair. The expression columns are therefore deduplicated before merging; otherwise the left merge
# would repeat the edge once for every matching row of PPI_backup
pair_expression = PPI_backup[["ENSEMBL1", "ENSEMBL2", "freq 1", "freq 2"]].drop_duplicates()
PPI = PPI.merge(pair_expression, on = ["ENSEMBL1", "ENSEMBL2"], how = "left")

# Keep the edges of which both genes reach the minimal expression
PPI = PPI[(PPI["freq 1"] >= min_expression) & (PPI["freq 2"] >= min_expression)]

# Undirected version: put the two node ids of every edge in ascending order and keep every edge once
edge_columns = ["ENSEMBL1", "ENSEMBL2"]
sorted_pairs = np.sort(PPI[edge_columns].to_numpy(), axis = 1)
undirected = pd.DataFrame(sorted_pairs, columns = edge_columns, index = PPI.index).drop_duplicates()

# Put the number of nodes and the number of edges in front of the edge list as an extra first row
# (pd.concat replaces DataFrame.append, which no longer exists in pandas 2.0 and later)
n_nodes = len(set(undirected["ENSEMBL1"]).union(undirected["ENSEMBL2"]))
counts_row = pd.DataFrame({"ENSEMBL1" : [n_nodes], "ENSEMBL2" : [len(undirected)]})
undirected = pd.concat([counts_row, undirected])

In [18]:
### EVOKE, DIAMOND & Network statistics
# NOTE(review): the filtered undirected table starts with the row holding the number of nodes and
# edges, and the file is comma separated. The unfiltered EVOKE file is space separated and the
# unfiltered DIAMOND file has no counts row - confirm that one shared file suits both tools
undirected.to_csv("filtered protein protein interactions.csv", index = False, header = False)

### Node2Vec
# Written in the same format as the unfiltered Node2vec edge lists: space separated and without a
# header row, so that both sets of files can be read in the same way
PPI[["ENSEMBL1", "ENSEMBL2", "nPMID"]].to_csv("Node2vec/filtered protein protein interactions with weights.csv", index = False, sep = " ", header = False)
PPI[["ENSEMBL1", "ENSEMBL2"]].to_csv("Node2vec/filtered protein protein interactions without weights.csv", index = False, sep = " ", header = False)

In [19]:
# Predicate data
## Without gene expression, unfiltered

## With gene expression, unfiltered

## Without gene expression, filtered

## With gene expression, filtered

In [20]:
# TODO rdf2vec features