# Calculate the features for the network statistics for leave-SNP-out

In [1]:
# Load the required packages
import networkx as nx
import pandas as pd
import json
from datetime import datetime
from itertools import product
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Lookup table used further down to translate the "gene_ids" of a reference
# set into the node IDs of the interaction graph.
with open("/Users/vlietstraw/git/Post-GWAS/ENSEMBL_mappings.json", "r") as mappings_file:
    ensembl_dict = json.loads(mappings_file.read())

In [3]:
# Build the interaction graph from the comma-separated edge list.
# `read_edgelist` opens a path in binary mode itself; node labels are parsed as integers.
G = nx.read_edgelist(
    "/Users/vlietstraw/git/Post-GWAS/unfiltered_protein_protein_interactions.csv",
    delimiter = ',',
    nodetype = int,
)

In [4]:
# First metric: the degree of every node, stored as {node: degree}
degrees = {node: degree for node, degree in G.degree()}

In [5]:
# Fraction of a node's direct neighbours that are disease proteins (second metric)
def calculate1N(nodeID, diseaseProteins, graph):
    """Return the share of the direct neighbours of `nodeID` that are disease proteins.

    Parameters
    ----------
    nodeID : hashable
        Node for which the metric is calculated. It must be a key of `graph`.
    diseaseProteins : iterable
        Nodes labelled as disease proteins. `nodeID` itself is never counted,
        and the collection passed in is left unmodified.
    graph : mapping
        Adjacency mapping; `graph[node]` iterates over the neighbours of `node`
        (for example a networkx graph).

    Returns
    -------
    float or int
        Number of disease-protein neighbours divided by the number of
        neighbours, or -1 when the node has no neighbours at all (the same
        "undefined" marker the other network metrics use).
    """
    # Work on a copy so the caller's collection is not changed as a side effect.
    other_disease_proteins = set(diseaseProteins) - {nodeID}
    neighbours = set(graph[nodeID])
    if not neighbours:
        return -1
    return len(neighbours & other_disease_proteins)/len(neighbours)

In [6]:
# Fraction of a node's indirect (distance 2) neighbours that are disease proteins (third metric)
def calculate2N(nodeID, diseaseProteins, graph):
    """Return the share of the nodes at distance 2 from `nodeID` that are disease proteins.

    Parameters
    ----------
    nodeID : hashable
        Node for which the metric is calculated.
    diseaseProteins : iterable
        Nodes labelled as disease proteins. `nodeID` itself is never counted,
        and the collection passed in is left unmodified.
    graph : networkx graph
        Graph in which the shortest path lengths are determined.

    Returns
    -------
    float or int
        Number of disease proteins at shortest-path distance 2 divided by the
        number of nodes at that distance, or -1 when no node is at distance 2.
    """
    # Work on a copy so the caller's collection is not changed as a side effect.
    other_disease_proteins = set(diseaseProteins) - {nodeID}
    # Use the graph that was passed in rather than a notebook-level global.
    path_lengths = dict(nx.single_source_shortest_path_length(graph, source = nodeID, cutoff = 2))
    indirect_neighbours = {node for node, length in path_lengths.items() if length == 2}
    if not indirect_neighbours:
        return -1
    return len(indirect_neighbours & other_disease_proteins)/len(indirect_neighbours)

In [7]:
# Average distance to disease genes (fourth metric)
def getAverageDPDistance(nodeID, diseaseProteins, graph):
    """Return the mean shortest-path length from `nodeID` to the disease proteins.

    Parameters
    ----------
    nodeID : hashable
        Node for which the metric is calculated.
    diseaseProteins : iterable
        Nodes labelled as disease proteins. `nodeID` itself is never counted,
        and the collection passed in is left unmodified.
    graph : networkx graph
        Graph in which the shortest path lengths are determined.

    Returns
    -------
    float or int
        Mean shortest-path length to the remaining disease proteins. -1 is
        returned when at least one of them cannot be reached from `nodeID`
        (the mean would be infinite) or when no disease protein remains.
    """
    # Work on a copy so the caller's collection is not changed as a side effect.
    other_disease_proteins = set(diseaseProteins) - {nodeID}
    shortestPaths = dict(nx.single_source_shortest_path_length(graph, source = nodeID))
    if not other_disease_proteins:
        # Nothing to average over; avoids a division by zero.
        return -1
    unreachable = float('inf')
    distances = [shortestPaths.get(x, unreachable) for x in other_disease_proteins]
    average = sum(distances)/len(distances)
    return -1 if average == unreachable else average

In [8]:
# Positive topology coefficient (fifth metric)
def calculateTopologyCoeff(nodeID, diseaseProteins, graph):
    """Return the mean neighbourhood overlap between `nodeID` and the disease proteins.

    For every disease protein that shares at least one neighbour with
    `nodeID`, the number of shared neighbours is divided by the smaller of the
    two neighbourhood sizes; the result is the mean of those ratios.

    Parameters
    ----------
    nodeID : hashable
        Node for which the metric is calculated. It must be a key of `graph`.
    diseaseProteins : iterable
        Nodes labelled as disease proteins; each must be a key of `graph`.
        `nodeID` itself is skipped, as in the other disease-protein metrics:
        comparing a node with itself always yields 1 and would leak the
        node's own label into the feature.
    graph : mapping
        Adjacency mapping; `graph[node]` iterates over the neighbours of `node`
        (for example a networkx graph).

    Returns
    -------
    float or int
        Mean overlap ratio, or -1 when no disease protein shares a neighbour
        with `nodeID`.
    """
    candidate = set(graph[nodeID])
    coeffs = []
    for dis in diseaseProteins:
        if dis == nodeID:
            continue
        dp = set(graph[dis])
        overlap = dp & candidate
        # A non-empty overlap implies both neighbourhoods are non-empty,
        # so the denominator is never zero.
        if overlap:
            coeffs.append(len(overlap) / min(len(dp), len(candidate)))
    if not coeffs:
        return -1
    return sum(coeffs) / len(coeffs)

In [9]:
# Settings that are evaluated against each other.
ML_algorithms = ["LR", "SVM", "DT", "KNN1", "KNN3", "KNN5", "KNN7", "KNN9", "RF"]
all_bp_distances = [100, 500, 1000, 2000]
refsets = ["Teslovich", "DeRycke", "farashi", "farashi p-value cutoff"]

# One row per (refset, bp distance, algorithm) combination: the reference set
# varies slowest and the algorithm fastest. The metric columns are added later.
grid_rows = [
    (refset, bp_distance, algorithm)
    for refset in refsets
    for bp_distance in all_bp_distances
    for algorithm in ML_algorithms
]
all_metrics = pd.DataFrame(grid_rows, columns = ["refset", "bp distance", "algorithm"])

In [10]:
# Leave-SNP-out evaluation. For every (refset, bp distance, algorithm) row of
# `all_metrics`: build the candidate set and its network features (only when
# the refset or the distance changed), predict the candidates of each SNP with
# a model trained on all remaining candidates, rank the candidates per SNP and
# store the resulting metrics in `all_metrics`.
import sklearn.metrics
from warnings import filterwarnings


def _load_reference_set(refset):
    """Read the candidate table of one reference set and harmonise its column names."""
    if refset == "farashi":
        ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Farashi/Farashi full 2000000 bp distance no pvalue filtering.csv")
    elif refset == "farashi p-value cutoff":
        ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Farashi/Farashi full 2000000 bp distance no pvalue filtering.csv")
        ref = ref[ref["GWAS/eQTL p-value¥"] <= float("5e-8")]
    elif refset == "DeRycke":
        ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/DeRycke/DeRycke reference set.csv")
        ref.columns = ["SNP ID", "chromosome", "location", "gene_ids", "gene name", "gene start", "gene stop", "Diff expression", "Class", "bp distance absolute", "bp distance", "Gene rank"]
    elif refset == "Teslovich":
        ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Teslovich/Teslovich reference set.csv")
        ref.columns = ["SNP ID", "chromosome", "location", "P", "gene_ids", "gene name", "gene start", "gene stop", "Class", "bp distance absolute", "bp distance", "Gene rank"]
    else:
        # Fail loudly instead of silently re-using the previous reference set.
        raise ValueError("Unknown reference set: " + str(refset))
    # Independent copy, so adding columns later never writes into a filtered view.
    return ref.copy()


def _prepare_candidates(refset, bp_distance):
    """Return the mappable candidates of `refset` within `bp_distance` (times 1000) of their SNP."""
    ref = _load_reference_set(refset)
    ref["nodeID"] = [ensembl_dict.get(x) for x in ref["gene_ids"]]

    # Set bp distance cutoff (the grid value is multiplied by 1000 first)
    max_bp_distance = bp_distance * 1000
    ref = ref[ref["bp distance absolute"] <= max_bp_distance]

    # Drop all unmappable candidates
    ref = ref.dropna(subset = ["nodeID"]).copy()
    ref["nodeID"] = ref["nodeID"].astype(int)

    # Drop all SNPs which no longer have a positive case
    pos_counts = ref.groupby("SNP ID")["Class"].sum()
    return ref[~ref["SNP ID"].isin(pos_counts[pos_counts == 0].index)]


def _build_feature_table(ref):
    """Return one row per node: its binary class followed by the five network metrics."""
    # A node is positive when it is positive for at least one SNP.
    f = pd.DataFrame(ref.groupby("nodeID")["Class"].sum().clip(upper = 1))
    nodes = list(f.index)
    positives = f.index[f["Class"] == 1]

    f["degree"] = [degrees[x] for x in nodes]
    disease_metrics = (
        ("1N index", calculate1N),
        ("2N index", calculate2N),
        ("Average DP Distance", getAverageDPDistance),
        ("Topology coefficient", calculateTopologyCoeff),
    )
    for column, metric in disease_metrics:
        # Every call gets its own set, because a metric may modify the set it receives.
        f[column] = [metric(x, set(positives), G) for x in nodes]
    return f


def _make_estimator(classifier):
    """Return a fresh, unfitted model for the algorithm label used in `all_metrics`."""
    if classifier == "SVM":
        return SVR(gamma="auto")
    if classifier == "DT":
        return DecisionTreeRegressor()
    if classifier.startswith("KNN"):
        # "KNN<k>": the digits after the prefix are the number of neighbours.
        return KNeighborsRegressor(n_neighbors = int(classifier[3:]))
    if classifier == "LR":
        # NOTE: this silences all warnings for the rest of the session.
        filterwarnings('ignore')
        return LogisticRegression()
    if classifier == "RF":
        return RandomForestRegressor(n_estimators = 1000, n_jobs = -1, max_features = "sqrt", max_depth = 5)
    raise ValueError("Unknown algorithm: " + str(classifier))


def _predict_scores(clf, features):
    """Return a score per row that can be used to rank the candidates."""
    if hasattr(clf, "predict_proba"):
        # A classifier's `predict` returns hard labels, which cannot rank the
        # candidates of a SNP; use the probability of the positive class.
        positive_column = list(clf.classes_).index(1)
        return clf.predict_proba(features)[:, positive_column]
    return clf.predict(features)


# (refset, bp distance) for which `ref` and `f` are currently valid
cached_key = None

for am_index, am_values in all_metrics.iterrows():
    print("Predicting row " + str(am_index + 1) + " of " + str(len(all_metrics)))

    # The candidate set and its features do not depend on the algorithm, so
    # they are rebuilt only when the refset or the distance cutoff changes.
    current_key = (am_values["refset"], am_values["bp distance"])
    if current_key != cached_key:
        ref = _prepare_candidates(*current_key)
        f = _build_feature_table(ref)
        cached_key = current_key

    classifier = am_values["algorithm"]
    features = f.drop(columns = ["Class"])

    # Perform leave-SNP-out cross validation
    predictions = []
    for snp in ref["SNP ID"].unique():  # order of first appearance, identical on every run
        is_test = f.index.isin(ref.loc[ref["SNP ID"] == snp, "nodeID"])

        clf = _make_estimator(classifier)
        clf.fit(features[~is_test], f.loc[~is_test, "Class"])

        predictions.append(pd.DataFrame({"predicted" : _predict_scores(clf, features[is_test]),
                                         "SNP ID" : snp,
                                         "nodeID" : features.index[is_test]}))
    outcomes = pd.concat(predictions)

    outcomes = outcomes.merge(ref[["SNP ID", "nodeID", "Class"]], on = ["SNP ID", "nodeID"], how = "left")

    # Rank the candidates of every SNP from highest to lowest score
    outcomes = outcomes.sort_values(["SNP ID", "predicted"], ascending = False)
    outcomes["For-SNP rank"] = outcomes.groupby("SNP ID").cumcount() + 1

    is_positive = outcomes["Class"] == 1
    ranks = outcomes["For-SNP rank"]

    all_metrics.at[am_index, "Recall snps"] = len(set(outcomes["SNP ID"]))
    all_metrics.at[am_index, "Recall genes"] = outcomes["Class"].sum()

    # ROC-AUC over all candidates; a better (lower) rank must give a higher score
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"], -ranks, pos_label = 1)
    all_metrics.at[am_index, "ROC-AUC overall (lso)"] = sklearn.metrics.auc(fpr, tpr) * 100

    # Calculate the ROC-AUC for every SNP and average the result
    # (unlike the overall value, this mean is not multiplied by 100)
    aucs = []
    for snp, snp_outcomes in outcomes.groupby("SNP ID"):
        classes = set(snp_outcomes["Class"])
        if len(classes) == 1:
            # ROC-AUC is undefined with a single class; the class value itself is used
            aucs.append(classes.pop())
        else:
            fpr, tpr, thresholds = sklearn.metrics.roc_curve(snp_outcomes["Class"], -snp_outcomes["For-SNP rank"], pos_label = 1)
            aucs.append(sklearn.metrics.auc(fpr, tpr))
    all_metrics.at[am_index, "ROC-AUC - mean per snpl (lso)"] = sum(aucs)/len(aucs)

    # Calculate hits @1, @3, @5 and @10: positives ranked within the top k of their SNP
    for k in (1, 3, 5, 10):
        all_metrics.at[am_index, "Hits@" + str(k) + "(lso)"] = int((is_positive & (ranks <= k)).sum())

    all_metrics.at[am_index, "Mean rank (lso)"] = ranks[is_positive].mean()
    all_metrics.at[am_index, "Median rank (lso)"] = ranks[is_positive].median()

Predicting row 1 of 144
Predicting row 2 of 144
Predicting row 3 of 144
Predicting row 4 of 144
Predicting row 5 of 144
Predicting row 6 of 144
Predicting row 7 of 144
Predicting row 8 of 144
Predicting row 9 of 144
Predicting row 10 of 144
Predicting row 11 of 144
Predicting row 12 of 144
Predicting row 13 of 144
Predicting row 14 of 144
Predicting row 15 of 144
Predicting row 16 of 144
Predicting row 17 of 144
Predicting row 18 of 144
Predicting row 19 of 144
Predicting row 20 of 144
Predicting row 21 of 144
Predicting row 22 of 144
Predicting row 23 of 144
Predicting row 24 of 144
Predicting row 25 of 144
Predicting row 26 of 144
Predicting row 27 of 144
Predicting row 28 of 144
Predicting row 29 of 144
Predicting row 30 of 144
Predicting row 31 of 144
Predicting row 32 of 144
Predicting row 33 of 144
Predicting row 34 of 144
Predicting row 35 of 144
Predicting row 36 of 144
Predicting row 37 of 144
Predicting row 38 of 144
Predicting row 39 of 144
Predicting row 40 of 144
Predictin

In [11]:
# Write the metrics to a CSV file stamped with today's date (day-month-year),
# using ";" as field separator and "," as decimal mark.
run_date = datetime.today().strftime("%d-%m-%Y")
all_metrics.to_csv(
    "/Users/vlietstraw/git//Post-GWAS/Network statistics/Leave-SNP-out all metrics " + run_date + ".csv",
    sep = ";",
    decimal = ",",
    index = False,
)