# Create metapath-based features

In [1]:
# Load the required packages
import pandas as pd
# Show data frames in full: no limit on the number of displayed columns or rows,
# and up to 2000 characters per column
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", None),
    ("display.max_colwidth", 2000),
):
    pd.set_option(option_name, option_value)

In [5]:
# Analysis parameters

# When True, the differential-expression class of the interacting genes is made
# part of the path types; when False, path types are based on the predicate only
include_gene_expression = False

# A gene is called differentially expressed when FDR <= FDR_cutoff and its fold
# change is >= FC_cutoff (overexpressed) or <= 1/FC_cutoff (underexpressed)
FDR_cutoff = 1e-4
FC_cutoff = 1.5

# Cutoff for minimal expression (not read by the other cells shown in this notebook)
min_expression = 21

In [3]:
# Load the protein-protein interactions exported from PathwayStudio.
# The cells below read the columns "ENSEMBL1", "ENSEMBL2" and "predicate" from this table.
ppi_path = "/Users/vlietstraw/git/Post-GWAS/Complete set of protein-protein interactions extracted from PathwayStudio.csv"
PPI = pd.read_csv(ppi_path)

In [6]:
# Read in the differential expression calculations (tab-separated).
# header = 0 replaces the header row of the file by the column names given here.
# NOTE(review): index_col = 0 turns the first parsed column into the index, while the
# next statement needs an "ENSEMBL" column to be present -- confirm against the file layout.
dex = pd.read_csv("../Input sets/Gene expression data/Galaxy37-[edgeR_DGE_on_2__design_matrix_prostate_unpaired.txt_-_differentially_expressed_genes].tabular.annotated.txt",
                  delimiter = "\t", index_col = 0, names = ["ENSEMBL", "gene", "logFC", "logCPM", "LR", "pValue", "FDR"], header = 0)

# Remove every ".<digits>" fragment from the identifiers (presumably the Ensembl version suffix).
# The result is assigned back explicitly: an inplace replace on a selected column works on a
# temporary object when pandas Copy-on-Write is active and would leave dex unchanged.
# The raw string keeps the same pattern without relying on invalid string escapes.
dex["ENSEMBL"] = dex["ENSEMBL"].replace(r"(\.\d+)", "", regex = True)

# logFC is a log2 value; convert it to a linear fold change
dex["FC"] = 2**dex["logFC"]

# Assign differential expression to the genes based on the previously set parameters
if include_gene_expression:
    significant = dex["FDR"] <= FDR_cutoff
    dex["Diff expression"] = "Unchanged"
    dex.loc[significant & (dex["FC"] <= 1/FC_cutoff), "Diff expression"] = "Underexpressed"
    dex.loc[significant & (dex["FC"] >= FC_cutoff), "Diff expression"] = "Overexpressed"

In [7]:
# Create features based on direct paths: per gene, the number of distinct direct
# interaction partners for every type of path
if include_gene_expression:
    # Annotate both interaction partners with their expression class. Both merges are
    # inner joins, so interactions with a partner that is absent from dex are dropped.
    expression_classes = dex[["ENSEMBL", "Diff expression"]]
    PPI = PPI.merge(expression_classes, how="inner", left_on="ENSEMBL1", right_on="ENSEMBL")
    PPI = PPI.merge(expression_classes, how="inner", left_on="ENSEMBL2", right_on="ENSEMBL",
                    suffixes=(" start", " end"))

    # Create the types of paths
    PPI["direct_incoming"] = PPI["Diff expression start"] + "_" + PPI["predicate"]
    PPI["direct_outgoing"] = PPI["predicate"] + "_" + PPI["Diff expression end"]

    incoming_keys, incoming_partner = ["ENSEMBL end", "direct_incoming"], "ENSEMBL start"
    outgoing_keys, outgoing_partner = ["ENSEMBL start", "direct_outgoing"], "ENSEMBL end"
else:
    incoming_keys, incoming_partner = ["ENSEMBL2", "predicate"], "ENSEMBL1"
    outgoing_keys, outgoing_partner = ["ENSEMBL1", "predicate"], "ENSEMBL2"

# Count the frequencies of the types of paths: one row per gene, one column per type,
# and 0 for the types a gene does not have
direct_incoming = PPI.groupby(incoming_keys)[incoming_partner].nunique().unstack(level=1, fill_value=0)
direct_outgoing = PPI.groupby(outgoing_keys)[outgoing_partner].nunique().unstack(level=1, fill_value=0)

if not include_gene_expression:
    # Without expression classes both tables use the bare predicates as column names,
    # so the direction is appended to tell them apart
    direct_incoming.columns = [label + "_direct_incoming" for label in direct_incoming.columns]
    direct_outgoing.columns = [label + "_direct_outgoing" for label in direct_outgoing.columns]

In [None]:
# Start the combined feature table `f`, which the later cells read and extend.
# direct_incoming and direct_outgoing carry the gene identifier in their index, so they are
# joined on the index (outer: keep genes that occur in only one of the two tables). The
# identifier is then exposed as a "gene_ids" column, the column later cells read from `f`.
# NOTE(review): this definition was commented out while `f` is still used further down;
# confirm that `f` is not meant to come from another source.
f = direct_incoming.join(direct_outgoing, how="outer").rename_axis("gene_ids").reset_index()

In [None]:
# Create the indirect outgoing paths: two consecutive interactions
# (start gene -> intermediate gene -> end gene), counted per start gene.
# A single step is described by its expression-aware path type when gene expression is
# included, and by the bare predicate otherwise.
outgoing_step = "direct_outgoing" if include_gene_expression else "predicate"
outgoing_steps = PPI[["ENSEMBL1", "ENSEMBL2", outgoing_step]]

# Self-join: the end gene of the first step is the start gene of the second step.
# All columns occur on both sides, so every column gets a " start" or " end" suffix.
indirect_outgoing = outgoing_steps.merge(
    outgoing_steps, left_on="ENSEMBL2", right_on="ENSEMBL1", how="inner", suffixes=(" start", " end"))

# Remove circular paths and duplicates. The filtered frame is rebound to a new object
# instead of being modified in place, which avoids chained-assignment warnings.
indirect_outgoing = indirect_outgoing[
    indirect_outgoing["ENSEMBL1 start"] != indirect_outgoing["ENSEMBL2 end"]].drop_duplicates()

# Create the feature: the types of both steps joined by "_"; without gene expression the
# name is prefixed to keep it distinct from the direct features
outgoing_prefix = "" if include_gene_expression else "indirect_outgoing_"
indirect_outgoing = indirect_outgoing.assign(
    indirect_outgoing=outgoing_prefix
    + indirect_outgoing[f"{outgoing_step} start"] + "_" + indirect_outgoing[f"{outgoing_step} end"])

# Count the distinct end genes per start gene and path type (0 when a type is absent)
indirect_outgoing = indirect_outgoing.groupby(["ENSEMBL1 start", "indirect_outgoing"])["ENSEMBL2 end"].nunique().unstack(level=1, fill_value=0)

In [None]:
# Combine the direct and indirect outgoing features per gene. Both tables carry the gene
# identifier in their index rather than in a column, so they are aligned on the index;
# a plain merge would look for shared columns to use as key.
# how="outer" keeps genes that occur in only one of the two tables.
# NOTE(review): `outgoing` is not read by the other cells shown in this notebook, and the
# indirect outgoing features are not added to `f` -- confirm whether that is intended.
outgoing = direct_outgoing.join(indirect_outgoing, how="outer")

In [None]:
# Create the indirect incoming paths: two consecutive interactions
# (start gene -> intermediate gene -> end gene), counted per end gene.
# NOTE(review): this cell needs the feature table `f` with a "gene_ids" column to exist
# already; only paths that end in one of those genes are built.
incoming_step = "direct_incoming" if include_gene_expression else "predicate"
incoming_columns = ["ENSEMBL1", "ENSEMBL2", incoming_step]

# Second step: interactions that end in one of the genes of the feature table
second_step = PPI[PPI["ENSEMBL2"].isin(f["gene_ids"])]
# First step: interactions that end in a gene from which a second step starts
first_step = PPI[PPI["ENSEMBL2"].isin(second_step["ENSEMBL1"])]

# Chain the steps: the end gene of the first step is the start gene of the second step.
# All columns occur on both sides, so every column gets a " start" or " end" suffix.
indirect_incoming = first_step[incoming_columns].merge(
    second_step[incoming_columns], how="inner", left_on="ENSEMBL2", right_on="ENSEMBL1",
    suffixes=(" start", " end"))

# Remove circular paths and duplicates. The filtered frame is rebound to a new object
# instead of being modified in place, which avoids chained-assignment warnings.
indirect_incoming = indirect_incoming[
    indirect_incoming["ENSEMBL1 start"] != indirect_incoming["ENSEMBL2 end"]].drop_duplicates()

# Create the feature: the types of both steps joined by "_"; without gene expression the
# name is prefixed to keep it distinct from the direct features
incoming_prefix = "" if include_gene_expression else "indirect_incoming_"
indirect_incoming = indirect_incoming.assign(
    indirect_incoming=incoming_prefix
    + indirect_incoming[f"{incoming_step} start"] + "_" + indirect_incoming[f"{incoming_step} end"])

# Count the distinct start genes per end gene and path type (0 when a type is absent)
indirect_incoming = indirect_incoming.groupby(["ENSEMBL2 end", "indirect_incoming"])["ENSEMBL1 start"].nunique().unstack(level=1, fill_value=0)

In [None]:
# Add the indirect incoming features to the feature table.
# indirect_incoming carries the gene identifier in its index rather than in a column, so the
# index is first turned into a "gene_ids" column (the gene column of `f`) and the merge is
# given that column as explicit key. how="outer" keeps the genes of both tables; the
# resulting gaps are missing values.
f = f.merge(indirect_incoming.rename_axis("gene_ids").reset_index(), on="gene_ids", how="outer")

In [None]:
# Replace every missing value in the feature table by 0
f = f.fillna(value=0)

# Save the feature table as a CSV file
features_path = "/Users/vlietstraw/git/Post-GWAS/Predicates/PathwayStudio predicate features PPI.csv"
f.to_csv(features_path)