# Calculate the features for the network statistics for leave-SNP-out

In [1]:
# Load the required packages
import networkx as nx
import pandas as pd
import numpy as np
import json
from datetime import datetime
from itertools import product
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.utils import shuffle
import sklearn.metrics

In [2]:
# When True, the configuration table is narrowed to the "depict" candidate sets and four
# selected (reference set, algorithm) pairs, and the per-SNP predictions of each evaluated
# configuration are written to a CSV file
writeOut = True

In [3]:
# Load the mapping that is used further down to translate gene identifiers into graph node IDs.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on the platform default.
with open("/Users/vlietstraw/git/Post-GWAS/ENSEMBL_mappings.json", "r", encoding = "utf-8") as mapping_file:
    ensembl_dict = json.load(mapping_file)

In [4]:
# Build the interaction graph from the comma-separated edge list; node labels are parsed as integers
ppi_edge_list = "/Users/vlietstraw/git/Post-GWAS/unfiltered_protein_protein_interactions.csv"
G = nx.read_edgelist(ppi_edge_list, delimiter = ",", nodetype = int)

In [5]:
# First metric: map every node of the graph to its degree
degrees = {node: node_degree for node, node_degree in G.degree()}

In [6]:
# Calculate the fraction of a node's direct neighbours that are disease genes (second metric)
def calculate1N(nodeID, diseaseProteins, graph):
    """Return the fraction of a node's direct neighbours that are disease proteins.

    Parameters
    ----------
    nodeID : hashable
        Node whose direct neighbourhood is scored.
    diseaseProteins : iterable
        Node IDs of the disease proteins. ``nodeID`` itself is ignored if it is
        present. The argument is not modified.
    graph : graph-like
        Object for which ``graph[node]`` iterates over the neighbours of ``node``
        (for example a ``networkx.Graph``).

    Returns
    -------
    float or int
        ``len(neighbours & diseaseProteins) / len(neighbours)``, or ``-1`` when the
        node has no neighbours (the same "undefined" marker the other metrics use).
    """
    # Work on a copy so the caller's collection is never mutated
    other_disease_proteins = set(diseaseProteins) - {nodeID}
    neighbours = set(graph[nodeID])
    if not neighbours:
        return -1
    return len(neighbours & other_disease_proteins) / len(neighbours)

In [7]:
# Calculate the fraction of a node's indirect (distance-2) neighbours that are disease genes (third metric)
def calculate2N(nodeID, diseaseProteins, graph):
    """Return the fraction of a node's distance-2 neighbours that are disease proteins.

    Parameters
    ----------
    nodeID : hashable
        Node whose indirect neighbourhood is scored.
    diseaseProteins : iterable
        Node IDs of the disease proteins. ``nodeID`` itself is ignored if it is
        present. The argument is not modified.
    graph : networkx graph
        Graph in which the shortest paths are computed.

    Returns
    -------
    float or int
        Share of the nodes at shortest-path distance exactly 2 that are disease
        proteins, or ``-1`` when there is no node at distance 2.
    """
    # Work on a copy so the caller's collection is never mutated
    other_disease_proteins = set(diseaseProteins) - {nodeID}

    # Use the graph that was passed in, not the notebook-level global
    path_lengths = dict(nx.single_source_shortest_path_length(graph, source = nodeID, cutoff = 2))
    indirect_neighbours = {node for node, distance in path_lengths.items() if distance == 2}

    if not indirect_neighbours:
        return -1
    return len(indirect_neighbours & other_disease_proteins) / len(indirect_neighbours)

In [8]:
# Average distance to disease genes (fourth metric)
def getAverageDPDistance(nodeID, diseaseProteins, graph):
    """Return the mean shortest-path distance from a node to the disease proteins.

    Parameters
    ----------
    nodeID : hashable
        Node from which the distances are measured.
    diseaseProteins : iterable
        Node IDs of the disease proteins. ``nodeID`` itself is ignored if it is
        present. The argument is not modified.
    graph : networkx graph
        Graph in which the shortest paths are computed.

    Returns
    -------
    float or int
        Mean shortest-path length to the disease proteins. ``-1`` is returned when
        there are no disease proteins left to measure against, or when at least one
        of them cannot be reached (which makes the mean infinite).
    """
    # Work on a copy so the caller's collection is never mutated
    other_disease_proteins = set(diseaseProteins) - {nodeID}
    if not other_disease_proteins:
        # Nothing to average over; avoid dividing by zero
        return -1

    unreachable = float('inf')
    shortest_paths = dict(nx.single_source_shortest_path_length(graph, source = nodeID))
    distances = [shortest_paths.get(protein, unreachable) for protein in other_disease_proteins]
    mean_distance = sum(distances) / len(distances)

    if mean_distance == unreachable:
        return -1
    return mean_distance

In [9]:
# Positive topology coefficient (fifth metric)
def calculateTopologyCoeff(nodeID, diseaseProteins, graph):
    """Return the mean positive topology coefficient between a node and the disease proteins.

    For every disease protein that shares at least one neighbour with ``nodeID`` the
    coefficient is ``shared neighbours / min(neighbour counts of the two nodes)``.
    Disease proteins without a shared neighbour do not contribute to the mean.

    Parameters
    ----------
    nodeID : hashable
        Candidate node.
    diseaseProteins : iterable
        Node IDs of the disease proteins. ``nodeID`` itself is skipped, as in the
        other network metrics, because a node trivially shares all of its
        neighbours with itself. The argument is not modified.
    graph : graph-like
        Object for which ``graph[node]`` iterates over the neighbours of ``node``
        (for example a ``networkx.Graph``).

    Returns
    -------
    float or int
        Mean of the positive coefficients, or ``-1`` when no disease protein shares
        a neighbour with the candidate.
    """
    candidate_neighbours = set(graph[nodeID])
    coeffs = []

    for disease_protein in diseaseProteins:
        if disease_protein == nodeID:
            continue
        disease_neighbours = set(graph[disease_protein])
        shared = len(disease_neighbours & candidate_neighbours)
        # shared > 0 implies both neighbour sets are non-empty, so the divisor is never zero
        if shared > 0:
            coeffs.append(shared / min(len(disease_neighbours), len(candidate_neighbours)))

    if not coeffs:
        return -1
    return sum(coeffs) / len(coeffs)

In [10]:
# Every evaluated configuration is one (reference set, bp distance, algorithm) combination
refsets = ["Teslovich", "DeRycke", "farashi", "farashi p-value cutoff"]
all_bp_distances = [25, 50, 100, 500, 1000, 2000, "depict"]
ML_algorithms = ["LR", "SVM", "DT", "KNN1", "KNN3", "KNN5", "KNN7", "KNN9", "RF"]

# One row per configuration, ordered with the reference set varying slowest
configurations = [(refset, bp_distance, algorithm)
                  for refset in refsets
                  for bp_distance in all_bp_distances
                  for algorithm in ML_algorithms]
all_metrics = pd.DataFrame(configurations, columns = ["refset", "bp distance", "algorithm"])

In [11]:
if writeOut:
    # Only these (reference set, algorithm) pairs are kept, and only for the "depict" candidate sets
    exported_pairs = [("farashi", "RF"),
                      ("farashi p-value cutoff", "LR"),
                      ("DeRycke", "RF"),
                      ("Teslovich", "LR")]

    keep_row = pd.Series(False, index = all_metrics.index)
    for refset_name, algorithm_name in exported_pairs:
        keep_row = keep_row | ((all_metrics["algorithm"] == algorithm_name) & (all_metrics["refset"] == refset_name))

    all_metrics = all_metrics[(all_metrics["bp distance"] == "depict") & keep_row]

In [12]:
# Seed for the row shuffle and the tree-based models, so that a re-run reproduces the same metrics
RANDOM_SEED = 42

# (refset, bp distance) of the reference set / feature table that was built most recently
distance_history = None

for row_number, (am_index, am_values) in enumerate(all_metrics.iterrows(), start = 1):
    # Report the position within the (possibly filtered) table rather than the index label
    print("Predicting row " + str(row_number) + " of " + str(len(all_metrics)))

    refset = am_values["refset"]
    bp_distance = am_values["bp distance"]
    classifier = am_values["algorithm"]

    # Rebuild the reference set and the network features only when the reference set
    # or the distance cutoff differs from the previous row
    if distance_history != (refset, bp_distance):
        if refset == "farashi":
            ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Farashi/Farashi full 2000000 bp distance no pvalue filtering.csv")
        elif refset == "farashi p-value cutoff":
            ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Farashi/Farashi full 2000000 bp distance no pvalue filtering.csv")
            ref = ref[ref["GWAS/eQTL p-value¥"] <= float("5e-8")]
        elif refset == "DeRycke":
            ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/DeRycke/DeRycke reference set.csv", delimiter = ";")
            ref.columns = ["SNP ID", "chromosome", "location", "gene_ids", "gene name", "gene start", "gene stop", "Diff expression", "Class", "bp distance absolute", "bp distance", "Gene rank"]
        elif refset == "Teslovich":
            ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Teslovich/Teslovich reference set.csv")
            ref.columns = ["SNP ID", "chromosome", "location", "P", "gene_ids", "gene name", "gene start", "gene stop", "Class", "bp distance absolute", "bp distance", "Gene rank"]
        else:
            # Fail loudly instead of silently reusing the reference set of a previous row
            raise ValueError("Unknown reference set: " + str(refset))

        # Genes without a mapping get None and are dropped further down
        ref["nodeID"] = [ensembl_dict.get(x) for x in ref["gene_ids"]]

        ref = shuffle(ref, random_state = RANDOM_SEED)

        # Restrict the candidates, either by distance (given in kbp) or to the DEPICT candidates
        if bp_distance != "depict":
            max_bp_distance = bp_distance * 1000
            ref = ref[ref["bp distance absolute"] <= max_bp_distance]
        else:
            if refset == "farashi":
                depict = pd.read_csv("~/git/DEPICT/outcomes/Farashi complete 2nd round/farashi_no_pvalue_filtering_geneprioritization.txt", sep = "\t")
            elif refset == "farashi p-value cutoff":
                depict = pd.read_csv("~/git/DEPICT/outcomes/Farashi complete 2nd round/farashi_default_pvalue_filtering_geneprioritization.txt", sep = "\t")
            elif refset == "DeRycke":
                depict = pd.read_csv("~/git/DEPICT/outcomes/DeRycke/DeRycke_output_geneprioritization.txt", sep = "\t")
            elif refset == "Teslovich":
                depict = pd.read_csv("~/git/DEPICT/outcomes/Teslovich for paper Wytze/Teslovich_output_geneprioritization.txt", sep = "\t")
            depict["nodeID"] = [ensembl_dict.get(x) for x in depict["Ensembl gene ID"]]

            # A locus can list several SNPs separated by ";": give every SNP its own row
            depict["Locus"] = depict["Locus"].astype(str).apply(lambda x: x.split(";"))
            depict = depict.explode("Locus")

            snp_replacement_dict = {"rs113645266" : "rs6557271",
                            "rs150282463" : "rs13137700",
                            "rs67276543" : "rs34884832"}
            depict["Locus"] = depict["Locus"].replace(snp_replacement_dict)

            depict = depict[["Locus", "nodeID"]]
            depict.columns = ["SNP ID", "nodeID"]

            ref = ref.merge(depict, on = ["SNP ID", "nodeID"], how = "inner")

        # Drop all unmappable candidates
        ref = ref.dropna(subset = ["nodeID"])
        ref["nodeID"] = ref["nodeID"].astype(int)

        # Drop all SNPs which no longer have a positive case
        pos_counts = ref.groupby("SNP ID")["Class"].sum()
        ref = ref[~ref["SNP ID"].isin(pos_counts[pos_counts == 0].index)]

        # One row per candidate gene; a gene is positive when it is positive for at least one SNP
        f = ref.groupby("nodeID")["Class"].sum().clip(upper = 1).to_frame()

        # For every candidate: the positive genes that do not lie on a chromosome the candidate
        # itself occurs on. Computed once per candidate and shared by the four network metrics.
        node_ids = list(f.index)
        candidate_nodes = set(node_ids)
        is_positive = ref["Class"] == 1
        disease_sets = {}
        for x in node_ids:
            own_chromosomes = ref["chromosome"][ref["nodeID"] == x]
            off_chromosome_positives = ref["nodeID"][is_positive & ~ref["chromosome"].isin(own_chromosomes)]
            disease_sets[x] = candidate_nodes.intersection(off_chromosome_positives)

        # Every metric receives its own copy of the set, so none of them can affect another
        f["degree"] = [degrees[x] for x in node_ids]
        f["1N index"] = [calculate1N(x, set(disease_sets[x]), G) for x in node_ids]
        f["2N index"] = [calculate2N(x, set(disease_sets[x]), G) for x in node_ids]
        f["Average DP Distance"] = [getAverageDPDistance(x, set(disease_sets[x]), G) for x in node_ids]
        f["Topology coefficient"] = [calculateTopologyCoeff(x, set(disease_sets[x]), G) for x in node_ids]

        distance_history = (refset, bp_distance)

    # Leave-one-chromosome-out cross validation: the genes of one chromosome are predicted
    # by a model that is trained on the genes of all other chromosomes
    chromosomes = list(set(ref["chromosome"]))
    fold_outcomes = []

    for chrom in chromosomes:
        print("Predicting candidates for chromosome " + str(chrom))

        f_test = f[f.index.isin(ref["nodeID"][ref["chromosome"] == chrom])]
        f_train = f[f.index.isin(ref["nodeID"][ref["chromosome"] != chrom])]

        test_class = f_test["Class"]
        train_class = f_train["Class"]

        f_test = f_test.drop(columns = ["Class"])
        f_train = f_train.drop(columns = ["Class"])

        if classifier == "SVM":
            clf = SVR(gamma="auto")
        elif classifier == "DT":
            clf = DecisionTreeRegressor(random_state = RANDOM_SEED)
        elif classifier.startswith("KNN"):
            n_neighbors = int(classifier[3:])
            # Skip folds that have fewer training samples than neighbours
            if len(f_train) < n_neighbors:
                continue
            clf = KNeighborsRegressor(n_neighbors = n_neighbors)
        elif classifier == "LR":
            # NOTE: this silences all warnings for the rest of the session
            from warnings import filterwarnings
            filterwarnings('ignore')
            clf = LogisticRegression()
        elif classifier == "RF":
            clf = RandomForestRegressor(n_estimators = 1000, n_jobs = -1, max_features = "sqrt", max_depth = 5, random_state = RANDOM_SEED)
        else:
            # Fail loudly instead of silently reusing the model of a previous row
            raise ValueError("Unknown algorithm: " + str(classifier))

        clf.fit(np.array(f_train), np.array(train_class))

        # Predict on a plain array as well, matching the input type used for fitting
        if classifier == "LR":
            predicted = clf.predict_proba(np.array(f_test))[:,1]
        else:
            predicted = clf.predict(np.array(f_test))

        fold_outcomes.append(pd.DataFrame({"predicted" : predicted,
                                           "Class" : test_class,
                                           "chromosome" : chrom,
                                           "nodeID" : f_test.index}))

    outcomes2 = pd.concat(fold_outcomes) if fold_outcomes else pd.DataFrame()

    # Every fold was skipped: there is nothing to evaluate for this configuration
    if len(outcomes2) == 0:
        continue

    # The index is named "nodeID" as well, which would clash with the column of that name
    outcomes2.index.name = None
    outcomes2 = outcomes2.sort_values(["chromosome", "predicted"], ascending = False)
    outcomes2["For-chromosome rank"] = outcomes2.groupby("chromosome").cumcount() + 1

    # ROC-AUC per held-out chromosome, averaged over the chromosomes
    aucs = []
    for chrom in set(outcomes2["chromosome"]):
        on_chromosome = outcomes2["chromosome"] == chrom
        fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes2["Class"][on_chromosome], -outcomes2["For-chromosome rank"][on_chromosome], pos_label = 1)
        aucs.append(sklearn.metrics.auc(fpr, tpr))
    all_metrics.at[am_index, "ROC-AUC per chromosome"] = sum(aucs)/len(aucs)

    # Attach the predictions to a copy, so the cached reference set is left untouched
    # for the next algorithm
    scored = ref.merge(outcomes2[["nodeID", "predicted"]], on = "nodeID", how = "left")

    all_metrics.at[am_index, "Recall snps"] = len(set(scored["SNP ID"]))
    all_metrics.at[am_index, "Recall entries"] = sum(scored["Class"])
    all_metrics.at[am_index, "Recall genes"] = len(set(scored["nodeID"][scored["Class"] == 1]))

    # Rank the candidates within every SNP by descending prediction; equal predictions share a rank
    scored = scored.sort_values(["SNP ID", "predicted"], ascending = False)

    snp_ranks = []
    current_snp = None
    current_prediction = None
    counter = 0
    for snp_id, prediction in zip(scored["SNP ID"], scored["predicted"]):
        if current_snp != snp_id:
            current_snp = snp_id
            counter = 1
            current_prediction = prediction
        elif current_prediction != prediction:
            counter += 1
            current_prediction = prediction
        snp_ranks.append(counter)
    scored["For-SNP rank"] = snp_ranks

    if writeOut:
        # Copy before modifying, so the assignment does not act on a slice of `scored`
        ref_out = scored[["SNP ID", "nodeID", "predicted", "Class", "For-SNP rank"]].copy()
        ref_out["For-SNP rank"] = ref_out["For-SNP rank"].astype(int)
        ref_out.to_csv("/Users/vlietstraw/git/Post-GWAS/Network statistics/" + refset + " "+ classifier + " " + str(bp_distance) + " " + datetime.today().strftime("%d-%m-%Y") + ".csv", sep = ";", index = False)

    is_positive_row = scored["Class"] == 1

    # ROC-AUC over all SNP-gene pairs, as a percentage
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(scored["Class"], -scored["For-SNP rank"], pos_label = 1)
    all_metrics.at[am_index, "ROC-AUC overall (lco)"] = sklearn.metrics.auc(fpr, tpr) * 100

    # Calculate the ROC-AUC for every SNP and average the result
    aucs = []
    for snp in set(scored["SNP ID"]):
        for_snp = scored["SNP ID"] == snp
        snp_classes = set(scored["Class"][for_snp])
        if len(snp_classes) == 1:
            # A single class has no ROC curve; the class value itself is used as the score
            aucs.append(list(snp_classes)[0])
        else:
            fpr, tpr, thresholds = sklearn.metrics.roc_curve(scored["Class"][for_snp], -scored["For-SNP rank"][for_snp], pos_label = 1)
            aucs.append(sklearn.metrics.auc(fpr, tpr))
    all_metrics.at[am_index, "ROC-AUC - mean per snpl (lco)"] = sum(aucs)/len(aucs)

    # Hits@k: number of positive SNP-gene pairs that are ranked within the top k of their SNP
    for k in (1, 3, 5, 10):
        all_metrics.at[am_index, "Hits@" + str(k) + "(lco)"] = sum(scored["Class"][is_positive_row & (scored["For-SNP rank"] <= k)])

    all_metrics.at[am_index, "Mean rank (lco)"] = scored["For-SNP rank"][is_positive_row].mean()
    all_metrics.at[am_index, "Median rank (lco)"] = scored["For-SNP rank"][is_positive_row].quantile(0.5)

Predicting row 55 of 4
Predicting candidates for chromosome 1
Predicting candidates for chromosome 2
Predicting candidates for chromosome 3
Predicting candidates for chromosome 4
Predicting candidates for chromosome 5
Predicting candidates for chromosome 6
Predicting candidates for chromosome 7
Predicting candidates for chromosome 8
Predicting candidates for chromosome 9
Predicting candidates for chromosome 10
Predicting candidates for chromosome 11
Predicting candidates for chromosome 12
Predicting candidates for chromosome 15
Predicting candidates for chromosome 16
Predicting candidates for chromosome 17
Predicting candidates for chromosome 19
Predicting candidates for chromosome 20
Predicting candidates for chromosome 22
Predicting row 126 of 4
Predicting candidates for chromosome 1
Predicting candidates for chromosome 20
Predicting candidates for chromosome 17
Predicting candidates for chromosome 5
Predicting candidates for chromosome 12
Predicting candidates for chromosome 6
Predi

In [13]:
# Export the metrics table as a semicolon-separated file with decimal commas, stamped with today's date
run_date = datetime.today().strftime("%d-%m-%Y")
metrics_csv = "/Users/vlietstraw/git//Post-GWAS/Network statistics/Leave-chromosome-out all metrics " + run_date + ".csv"
all_metrics.to_csv(metrics_csv, sep = ";", decimal = ",", index = False)