# Prioritization comparison & classification analysis

In [1]:
# Load the required packages
import pandas as pd
import json
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import sklearn.metrics
from datetime import datetime

In [2]:
refset = "farashi p-value cutoff" # ["farashi", "farashi p-value cutoff", "DeRycke", "Teslovich"]
classifier = "LR" # ["LR", "RF", "SVM"]

In [3]:
with open("/Users/vlietstraw/git/Post-GWAS/ENSEMBL_mappings.json", "r") as fp:
    ensembl_dict = json.load(fp)

In [4]:
if refset == "farashi":
    # Load the reference set
    ref = pd.read_csv("~/git/Post-GWAS/Input sets/Farashi/Farashi full 2000000 bp distance no pvalue filtering.csv")
    ref["nodeID"] = [ensembl_dict[x] if x in ensembl_dict.keys() else None for x in ref["gene_ids"]]
    
    # Load the DIAMOnD results
    diamond = pd.read_csv("~/git/Post-GWAS/DIAMOND/farashi diamond predictions with bp distance depict.csv", delimiter = ";", decimal = ",")
    diamond.columns = ["nodeID", "unknown1", "unknown2", "DIAMOND predicted", "chromosome", "Class", "DIAMOND For-SNP rank"]
    diamond = diamond[["nodeID", "DIAMOND predicted", "chromosome", "DIAMOND For-SNP rank"]]
    diamond["chromosome"] = diamond["chromosome"].astype(str)
    ref["chromosome"] = ref["chromosome"].astype(str)
    diamond.drop(columns = ["DIAMOND For-SNP rank"], inplace = True)
    diamond_temp = ref[["nodeID", "chromosome", "SNP ID"]].merge(diamond, on = ["nodeID", "chromosome"])
    diamond_temp = diamond_temp.sort_values(["SNP ID", "DIAMOND predicted"], ascending = True)
    diamond_temp["DIAMOND For SNP rank"] = diamond_temp.groupby("SNP ID").cumcount() + 1
    
    # Merge with DIAMOND
    ref = ref.merge(diamond_temp, how = "left", on = ["nodeID", "SNP ID", "chromosome"])
    
    # Load the DEPICT results
    depict = pd.read_csv("~/git/DEPICT/outcomes/Farashi complete 2nd round/farashi_no_pvalue_filtering_geneprioritization.txt", sep = "\t")
    depict["Locus"] = depict["Locus"].astype(str).apply(lambda x: x.split(";"))
    depict = depict.explode("Locus")

    snp_replacement_dict = {"rs113645266" : "rs6557271",
                    "rs150282463" : "rs13137700",
                    "rs67276543" : "rs34884832"}
    depict["Locus"] = depict["Locus"].replace(snp_replacement_dict)

    depict = depict[["Locus", "Ensembl gene ID", "Nominal P value"]]
    depict.columns = ["SNP ID", "gene_ids", "DEPICT p-value"]
    depict = depict.sort_values(["SNP ID", "DEPICT p-value"], ascending = True)
    depict["DEPICT For SNP rank"] = depict.groupby("SNP ID").cumcount() + 1

    ref = ref.merge(depict, on = ["SNP ID", "gene_ids"], how = "inner")
    
    # Load the EVOKE results
    
    EVOKE = pd.read_csv("~/git/Post-GWAS/EVOKE/farashi normal RF depict 17-09-2021.csv", delimiter = ";")
    EVOKE.columns = ["SNP ID", "nodeID", "EVOKE_score", "Class", "EVOKE For SNP rank"]
    ref = ref.merge(EVOKE, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the network distance results
    
    distance = pd.read_csv("~/git/Post-GWAS/Network statistics/farashi RF depict 20-09-2021.csv", delimiter = ";")
    distance.columns = ["SNP ID", "nodeID", "distance_score", "Class", "network distance For SNP rank"]
    ref = ref.merge(distance, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the node2vec results
    
    node2vec_normal = pd.read_csv("~/git/Post-GWAS/Node2vec/farashi normal SVM depict 18-09-2021.csv", delimiter = ";")
    node2vec_normal.columns = ["SNP ID", "nodeID", "node2vec_normal_score", "Class", "node2vec normal For SNP rank"]
    ref = ref.merge(node2vec_normal, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_graphlet = pd.read_csv("~/git/Post-GWAS/Node2vec/farashi graphlet LR depict 18-09-2021.csv", delimiter = ";")
    node2vec_graphlet.columns = ["SNP ID", "nodeID", "node2vec_graphlet_score", "Class", "node2vec graphlet For SNP rank"]
    ref = ref.merge(node2vec_graphlet, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_autoencode = pd.read_csv("~/git/Post-GWAS/Node2vec/farashi autoencode LR depict 18-09-2021.csv", delimiter = ";")
    node2vec_autoencode.columns = ["SNP ID", "nodeID", "node2vec_autoencode_score", "Class", "node2vec autoencode For SNP rank"]
    ref = ref.merge(node2vec_autoencode, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_combi = pd.read_csv("~/git/Post-GWAS/Node2vec/farashi combi SVM depict 18-09-2021.csv", delimiter = ";")
    node2vec_combi.columns = ["SNP ID", "nodeID", "node2vec_combi_score", "Class", "node2vec combi For SNP rank"]
    ref = ref.merge(node2vec_combi, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the predicates results
    
    predicates = pd.read_csv("~/git/Post-GWAS/Predicates/farashi incoming LR depict 17-09-2021.csv", delimiter = ";")
    predicates.columns = ["SNP ID", "gene_ids", "predicates_score", "Class", "predicates For SNP rank"]
    ref = ref.merge(predicates, how = "left", on = ["SNP ID", "gene_ids", "Class"])
    
    # Load the RDF2vec results
    
    rdf2vec = pd.read_csv("~/git/Post-GWAS/RDF2vec/farashi normal SVM depict 17-09-2021.csv", delimiter = ";")
    rdf2vec.columns = ["SNP ID", "gene_ids", "rdf2vec_score", "Class", "rdf2vec For SNP rank"]
    ref = ref.merge(rdf2vec, how = "left", on = ["SNP ID", "gene_ids", "Class"])
    
    # Load the Struc2vec results
    
    struc2vec = pd.read_csv("~/git/Post-GWAS/Struc2vec/farashi normal KNN9 depict 17-09-2021.csv", delimiter = ";")
    struc2vec.columns = ["SNP ID", "nodeID", "struc2vec_score", "Class", "struc2vec For SNP rank"]
    ref = ref.merge(struc2vec, how = "left", on = ["SNP ID", "nodeID", "Class"])

In [5]:
if refset == "farashi p-value cutoff":
    # Load the reference set
    ref = pd.read_csv("~/git/Post-GWAS/Input sets/Farashi/Farashi full 2000000 bp distance no pvalue filtering.csv")
    ref["nodeID"] = [ensembl_dict[x] if x in ensembl_dict.keys() else None for x in ref["gene_ids"]]
    
    # Load the DIAMOnD results
    diamond = pd.read_csv("~/git/Post-GWAS/DIAMOND/farashi p-value cutoff diamond predictions with bp distance depict.csv", delimiter = ";", decimal = ",")
    diamond.columns = ["nodeID", "unknown1", "unknown2", "DIAMOND predicted", "chromosome", "Class", "DIAMOND For-SNP rank"]
    diamond = diamond[["nodeID", "DIAMOND predicted", "chromosome", "DIAMOND For-SNP rank"]]
    diamond["chromosome"] = diamond["chromosome"].astype(str)
    ref["chromosome"] = ref["chromosome"].astype(str)
    diamond.drop(columns = ["DIAMOND For-SNP rank"], inplace = True)
    diamond_temp = ref[["nodeID", "chromosome", "SNP ID"]].merge(diamond, on = ["nodeID", "chromosome"])
    diamond_temp = diamond_temp.sort_values(["SNP ID", "DIAMOND predicted"], ascending = True)
    diamond_temp["DIAMOND For SNP rank"] = diamond_temp.groupby("SNP ID").cumcount() + 1
    
    # Merge with DIAMOND
    ref = ref.merge(diamond_temp, how = "left", on = ["nodeID", "SNP ID", "chromosome"])
    
    # Load the DEPICT results
    depict = pd.read_csv("~/git/DEPICT/outcomes/Farashi complete 2nd round/farashi_default_pvalue_filtering_geneprioritization.txt", sep = "\t")
    depict["Locus"] = depict["Locus"].astype(str).apply(lambda x: x.split(";"))
    depict = depict.explode("Locus")

    snp_replacement_dict = {"rs113645266" : "rs6557271",
                    "rs150282463" : "rs13137700",
                    "rs67276543" : "rs34884832"}
    depict["Locus"] = depict["Locus"].replace(snp_replacement_dict)

    depict = depict[["Locus", "Ensembl gene ID", "Nominal P value"]]
    depict.columns = ["SNP ID", "gene_ids", "DEPICT p-value"]
    depict = depict.sort_values(["SNP ID", "DEPICT p-value"], ascending = True)
    depict["DEPICT For SNP rank"] = depict.groupby("SNP ID").cumcount() + 1

    ref = ref.merge(depict, on = ["SNP ID", "gene_ids"], how = "inner")
    
    # Load the EVOKE results
    
    EVOKE = pd.read_csv("~/git/Post-GWAS/EVOKE/farashi p-value cutoff log KNN7 depict 17-09-2021.csv", delimiter = ";")
    EVOKE.columns = ["SNP ID", "nodeID", "EVOKE_score", "Class", "EVOKE For SNP rank"]
    ref = ref.merge(EVOKE, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the network distance results
    
    distance = pd.read_csv("~/git/Post-GWAS/Network statistics/farashi p-value cutoff LR depict 20-09-2021.csv", delimiter = ";")
    distance.columns = ["SNP ID", "nodeID", "distance_score", "Class", "network distance For SNP rank"]
    ref = ref.merge(distance, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the node2vec results
    
    node2vec_normal = pd.read_csv("~/git/Post-GWAS/Node2vec/farashi p-value cutoff normal KNN7 depict 18-09-2021.csv", delimiter = ";")
    node2vec_normal.columns = ["SNP ID", "nodeID", "node2vec_normal_score", "Class", "node2vec normal For SNP rank"]
    ref = ref.merge(node2vec_normal, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_graphlet = pd.read_csv("~/git/Post-GWAS/Node2vec/farashi p-value cutoff graphlet KNN3 depict 18-09-2021.csv", delimiter = ";")
    node2vec_graphlet.columns = ["SNP ID", "nodeID", "node2vec_graphlet_score", "Class", "node2vec graphlet For SNP rank"]
    ref = ref.merge(node2vec_graphlet, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_autoencode = pd.read_csv("~/git/Post-GWAS/Node2vec/farashi p-value cutoff autoencode LR depict 18-09-2021.csv", delimiter = ";")
    node2vec_autoencode.columns = ["SNP ID", "nodeID", "node2vec_autoencode_score", "Class", "node2vec autoencode For SNP rank"]
    ref = ref.merge(node2vec_autoencode, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_combi = pd.read_csv("~/git/Post-GWAS/Node2vec/farashi p-value cutoff combi SVM depict 18-09-2021.csv", delimiter = ";")
    node2vec_combi.columns = ["SNP ID", "nodeID", "node2vec_combi_score", "Class", "node2vec combi For SNP rank"]
    ref = ref.merge(node2vec_combi, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the predicates results
    
    predicates = pd.read_csv("~/git/Post-GWAS/Predicates/farashi p-value cutoff incoming KNN9 depict 17-09-2021.csv", delimiter = ";")
    predicates.columns = ["SNP ID", "gene_ids", "predicates_score", "Class", "predicates For SNP rank"]
    ref = ref.merge(predicates, how = "left", on = ["SNP ID", "gene_ids", "Class"])
    
    # Load the RDF2vec results
    
    rdf2vec = pd.read_csv("~/git/Post-GWAS/RDF2vec/farashi p-value cutoff normal RF depict 17-09-2021.csv", delimiter = ";")
    rdf2vec.columns = ["SNP ID", "gene_ids", "rdf2vec_score", "Class", "rdf2vec For SNP rank"]
    ref = ref.merge(rdf2vec, how = "left", on = ["SNP ID", "gene_ids", "Class"])
    
    # Load the Struc2vec results
    
    struc2vec = pd.read_csv("~/git/Post-GWAS/Struc2vec/farashi p-value cutoff graphlet KNN7 depict 17-09-2021.csv", delimiter = ";")
    struc2vec.columns = ["SNP ID", "nodeID", "struc2vec_score", "Class", "struc2vec For SNP rank"]
    ref = ref.merge(struc2vec, how = "left", on = ["SNP ID", "nodeID", "Class"])

In [6]:
if refset == "DeRycke":
    # Load the reference set
    ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/DeRycke/DeRycke reference set.csv", delimiter = ";")
    ref.columns = ["SNP ID", "chromosome", "location", "gene_ids", "gene name", "gene start", "gene stop", "Diff expression", "Class", "bp distance absolute", "bp distance", "Gene rank"]
    ref["nodeID"] = [ensembl_dict[x] if x in ensembl_dict.keys() else None for x in ref["gene_ids"]]
    
    # Load the DIAMOnD results
    diamond = pd.read_csv("~/git/Post-GWAS/DIAMOND/DeRycke diamond predictions with bp distance depict.csv", delimiter = ";", decimal = ",")
    diamond.columns = ["nodeID", "unknown1", "unknown2", "DIAMOND predicted", "chromosome", "Class", "DIAMOND For-SNP rank"]
    diamond = diamond[["nodeID", "DIAMOND predicted", "chromosome", "DIAMOND For-SNP rank"]]
    diamond["chromosome"] = diamond["chromosome"].astype(str)
    ref["chromosome"] = ref["chromosome"].astype(str)
    diamond.drop(columns = ["DIAMOND For-SNP rank"], inplace = True)
    diamond_temp = ref[["nodeID", "chromosome", "SNP ID"]].merge(diamond, on = ["nodeID", "chromosome"])
    diamond_temp = diamond_temp.sort_values(["SNP ID", "DIAMOND predicted"], ascending = True)
    diamond_temp["DIAMOND For SNP rank"] = diamond_temp.groupby("SNP ID").cumcount() + 1
    
    # Merge with DIAMOND
    ref = ref.merge(diamond_temp, how = "left", on = ["nodeID", "SNP ID", "chromosome"])
    
    # Load the DEPICT results
    depict = pd.read_csv("~/git/DEPICT/outcomes/DeRycke/DeRycke_output_geneprioritization.txt", sep = "\t")
    depict["Locus"] = depict["Locus"].astype(str).apply(lambda x: x.split(";"))
    depict = depict.explode("Locus")

    snp_replacement_dict = {"rs113645266" : "rs6557271",
                    "rs150282463" : "rs13137700",
                    "rs67276543" : "rs34884832"}
    depict["Locus"] = depict["Locus"].replace(snp_replacement_dict)

    depict = depict[["Locus", "Ensembl gene ID", "Nominal P value"]]
    depict.columns = ["SNP ID", "gene_ids", "DEPICT p-value"]
    depict = depict.sort_values(["SNP ID", "DEPICT p-value"], ascending = True)
    depict["DEPICT For SNP rank"] = depict.groupby("SNP ID").cumcount() + 1

    ref = ref.merge(depict, on = ["SNP ID", "gene_ids"], how = "inner")
    
    # Load the EVOKE results
    
    EVOKE = pd.read_csv("~/git/Post-GWAS/EVOKE/DeRycke log DT depict 17-09-2021.csv", delimiter = ";")
    EVOKE.columns = ["SNP ID", "nodeID", "EVOKE_score", "Class", "EVOKE For SNP rank"]
    ref = ref.merge(EVOKE, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the network distance results
    
    distance = pd.read_csv("~/git/Post-GWAS/Network statistics/DeRycke RF depict 20-09-2021.csv", delimiter = ";")
    distance.columns = ["SNP ID", "nodeID", "distance_score", "Class", "network distance For SNP rank"]
    ref = ref.merge(distance, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the node2vec results
    
    node2vec_normal = pd.read_csv("~/git/Post-GWAS/Node2vec/DeRycke normal KNN9 depict 18-09-2021.csv", delimiter = ";")
    node2vec_normal.columns = ["SNP ID", "nodeID", "node2vec_normal_score", "Class", "node2vec normal For SNP rank"]
    ref = ref.merge(node2vec_normal, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_graphlet = pd.read_csv("~/git/Post-GWAS/Node2vec/DeRycke graphlet LR depict 18-09-2021.csv", delimiter = ";")
    node2vec_graphlet.columns = ["SNP ID", "nodeID", "node2vec_graphlet_score", "Class", "node2vec graphlet For SNP rank"]
    ref = ref.merge(node2vec_graphlet, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_autoencode = pd.read_csv("~/git/Post-GWAS/Node2vec/DeRycke autoencode KNN3 depict 18-09-2021.csv", delimiter = ";")
    node2vec_autoencode.columns = ["SNP ID", "nodeID", "node2vec_autoencode_score", "Class", "node2vec autoencode For SNP rank"]
    ref = ref.merge(node2vec_autoencode, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_combi = pd.read_csv("~/git/Post-GWAS/Node2vec/DeRycke combi LR depict 18-09-2021.csv", delimiter = ";")
    node2vec_combi.columns = ["SNP ID", "nodeID", "node2vec_combi_score", "Class", "node2vec combi For SNP rank"]
    ref = ref.merge(node2vec_combi, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the predicates results
    
    predicates = pd.read_csv("~/git/Post-GWAS/Predicates/DeRycke outgoing DT depict 17-09-2021.csv", delimiter = ";")
    predicates.columns = ["SNP ID", "gene_ids", "predicates_score", "Class", "predicates For SNP rank"]
    ref = ref.merge(predicates, how = "left", on = ["SNP ID", "gene_ids", "Class"])
    
    # Load the RDF2vec results
    
    rdf2vec = pd.read_csv("~/git/Post-GWAS/RDF2vec/DeRycke autoencode SVM depict 17-09-2021.csv", delimiter = ";")
    rdf2vec.columns = ["SNP ID", "gene_ids", "rdf2vec_score", "Class", "rdf2vec For SNP rank"]
    ref = ref.merge(rdf2vec, how = "left", on = ["SNP ID", "gene_ids", "Class"])
    
    # Load the Struc2vec results
    
    struc2vec = pd.read_csv("~/git/Post-GWAS/Struc2vec/DeRycke combi KNN3 depict 17-09-2021.csv", delimiter = ";")
    struc2vec.columns = ["SNP ID", "nodeID", "struc2vec_score", "Class", "struc2vec For SNP rank"]
    ref = ref.merge(struc2vec, how = "left", on = ["SNP ID", "nodeID", "Class"])

In [7]:
if refset == "Teslovich":
    # Load the reference set
    ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Teslovich/Teslovich reference set.csv")
    ref.columns = ["SNP ID", "chromosome", "location", "P", "gene_ids", "gene name", "gene start", "gene stop", "Class", "bp distance absolute", "bp distance", "Gene rank"]
    ref["nodeID"] = [ensembl_dict[x] if x in ensembl_dict.keys() else None for x in ref["gene_ids"]]
    
    # Load the DIAMOnD results
    diamond = pd.read_csv("~/git/Post-GWAS/DIAMOND/Teslovich diamond predictions with bp distance depict.csv", delimiter = ";", decimal = ",")
    diamond.columns = ["nodeID", "unknown1", "unknown2", "DIAMOND predicted", "chromosome", "Class", "DIAMOND For-SNP rank"]
    diamond = diamond[["nodeID", "DIAMOND predicted", "chromosome", "DIAMOND For-SNP rank"]]
    diamond["chromosome"] = diamond["chromosome"].astype(str)
    ref["chromosome"] = ref["chromosome"].astype(str)
    diamond.drop(columns = ["DIAMOND For-SNP rank"], inplace = True)
    diamond_temp = ref[["nodeID", "chromosome", "SNP ID"]].merge(diamond, on = ["nodeID", "chromosome"])
    diamond_temp = diamond_temp.sort_values(["SNP ID", "DIAMOND predicted"], ascending = True)
    diamond_temp["DIAMOND For SNP rank"] = diamond_temp.groupby("SNP ID").cumcount() + 1
    
    # Merge with DIAMOND
    ref = ref.merge(diamond_temp, how = "left", on = ["nodeID", "SNP ID", "chromosome"])
    
    # Load the DEPICT results
    depict = pd.read_csv("~/git/DEPICT/outcomes/Teslovich for paper Wytze/Teslovich_output_geneprioritization.txt", sep = "\t")
    depict["Locus"] = depict["Locus"].astype(str).apply(lambda x: x.split(";"))
    depict = depict.explode("Locus")

    snp_replacement_dict = {"rs113645266" : "rs6557271",
                    "rs150282463" : "rs13137700",
                    "rs67276543" : "rs34884832"}
    depict["Locus"] = depict["Locus"].replace(snp_replacement_dict)

    depict = depict[["Locus", "Ensembl gene ID", "Nominal P value"]]
    depict.columns = ["SNP ID", "gene_ids", "DEPICT p-value"]
    depict = depict.sort_values(["SNP ID", "DEPICT p-value"], ascending = True)
    depict["DEPICT For SNP rank"] = depict.groupby("SNP ID").cumcount() + 1

    ref = ref.merge(depict, on = ["SNP ID", "gene_ids"], how = "inner")
    
    # Load the EVOKE results
    
    EVOKE = pd.read_csv("~/git/Post-GWAS/EVOKE/Teslovich log LR depict 17-09-2021.csv", delimiter = ";")
    EVOKE.columns = ["SNP ID", "nodeID", "EVOKE_score", "Class", "EVOKE For SNP rank"]
    ref = ref.merge(EVOKE, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the network distance results
    
    distance = pd.read_csv("~/git/Post-GWAS/Network statistics/Teslovich LR depict 20-09-2021.csv", delimiter = ";")
    distance.columns = ["SNP ID", "nodeID", "distance_score", "Class", "network distance For SNP rank"]
    ref = ref.merge(distance, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the node2vec results
    
    node2vec_normal = pd.read_csv("~/git/Post-GWAS/Node2vec/Teslovich normal KNN1 depict 18-09-2021.csv", delimiter = ";")
    node2vec_normal.columns = ["SNP ID", "nodeID", "node2vec_normal_score", "Class", "node2vec normal For SNP rank"]
    ref = ref.merge(node2vec_normal, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_graphlet = pd.read_csv("~/git/Post-GWAS/Node2vec/Teslovich graphlet KNN1 depict 18-09-2021.csv", delimiter = ";")
    node2vec_graphlet.columns = ["SNP ID", "nodeID", "node2vec_graphlet_score", "Class", "node2vec graphlet For SNP rank"]
    ref = ref.merge(node2vec_graphlet, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_autoencode = pd.read_csv("~/git/Post-GWAS/Node2vec/Teslovich autoencode SVM depict 18-09-2021.csv", delimiter = ";")
    node2vec_autoencode.columns = ["SNP ID", "nodeID", "node2vec_autoencode_score", "Class", "node2vec autoencode For SNP rank"]
    ref = ref.merge(node2vec_autoencode, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    node2vec_combi = pd.read_csv("~/git/Post-GWAS/Node2vec/Teslovich combi SVM depict 18-09-2021.csv", delimiter = ";")
    node2vec_combi.columns = ["SNP ID", "nodeID", "node2vec_combi_score", "Class", "node2vec combi For SNP rank"]
    ref = ref.merge(node2vec_combi, how = "left", on = ["SNP ID", "nodeID", "Class"])
    
    # Load the predicates results
    
    predicates = pd.read_csv("~/git/Post-GWAS/Predicates/Teslovich outgoing SVM depict 17-09-2021.csv", delimiter = ";")
    predicates.columns = ["SNP ID", "gene_ids", "predicates_score", "Class", "predicates For SNP rank"]
    ref = ref.merge(predicates, how = "left", on = ["SNP ID", "gene_ids", "Class"])
    
    # Load the RDF2vec results
    
    rdf2vec = pd.read_csv("~/git/Post-GWAS/RDF2vec/Teslovich autoencode RF depict 17-09-2021.csv", delimiter = ";")
    rdf2vec.columns = ["SNP ID", "gene_ids", "rdf2vec_score", "Class", "rdf2vec For SNP rank"]
    ref = ref.merge(rdf2vec, how = "left", on = ["SNP ID", "gene_ids", "Class"])
    
    # Load the Struc2vec results
    
    struc2vec = pd.read_csv("~/git/Post-GWAS/Struc2vec/Teslovich normal KNN5 depict 17-09-2021.csv", delimiter = ";")
    struc2vec.columns = ["SNP ID", "nodeID", "struc2vec_score", "Class", "struc2vec For SNP rank"]
    ref = ref.merge(struc2vec, how = "left", on = ["SNP ID", "nodeID", "Class"])

In [8]:
# Create classifier based on rankings (with different combinations)
scores = ["bp distance absolute", "DEPICT p-value", "predicates_score", "node2vec_normal_score", "node2vec_graphlet_score",
          "node2vec_autoencode_score", "node2vec_combi_score", "distance_score", "EVOKE_score", "rdf2vec_score", "struc2vec_score"] # "DIAMOND predicted",
ranks = ["Gene rank", "DEPICT For SNP rank", "predicates For SNP rank", "node2vec normal For SNP rank", "node2vec graphlet For SNP rank", "node2vec autoencode For SNP rank" , "node2vec combi For SNP rank" ,
         "network distance For SNP rank", "EVOKE For SNP rank", "rdf2vec For SNP rank", "struc2vec For SNP rank"] # "DIAMOND For SNP rank",

scores_combos = list(combinations(scores, 2)) + list(combinations(scores, 3)) + list(combinations(scores, 4)) + list(combinations(scores, 5)) + list(combinations(scores, 6)) + \
                list(combinations(scores, 7)) + list(combinations(scores, 8)) + list(combinations(scores, 9)) + list(combinations(scores, 10)) + [scores]
ranks_combos = list(combinations(ranks, 2)) + list(combinations(ranks, 3)) + list(combinations(ranks, 4)) + list(combinations(ranks, 5)) + list(combinations(ranks, 6)) + \
                list(combinations(ranks, 7)) + list(combinations(ranks, 8)) + list(combinations(ranks, 9)) + list(combinations(ranks, 10)) + [ranks]

In [9]:
# Compare rankings
ref["lowest rank"] = ref[ranks].min(axis = 0)
ref[(ref["lowest rank"] == 1) & (ref["Class"] == 1)]
ref[(ref["lowest rank"] <= 3) & (ref["Class"] == 1)]
ref["lowest rank"][ref["Class"] == 1].mean()

nan

In [10]:
scores_df = pd.DataFrame(index = range(1, len(scores_combos) + 1), columns = scores)
scores_df = scores_df.fillna(False)
indx = 1
for s in scores_combos:
    #print("Analyzing " + str(indx) + " out of " + str(len(scores_df)))
    scores_df.at[indx, list(s)] = True
    
    f = ref.copy()
    f = f[["chromosome", "Class", "SNP ID", "gene_ids"] + list(s)]
    f.dropna(subset = list(s), inplace = True)
    
    # Drop all SNPs which no longer have a positive case
    pos_counts = f.groupby("SNP ID")["Class"].sum()
    f = f[~f["SNP ID"].isin(pos_counts[pos_counts == 0].index)]
    
    outcomes = pd.DataFrame()

    chromosomes = list(set(f["chromosome"]))

    for chrom in chromosomes:
        #print("Predicting candidates for chromosome " + str(chrom))

        f_test = f[f["chromosome"] == chrom].copy()
        f_train = f[f["chromosome"] != chrom].copy()

        train_class = f_train["Class"]
        test_class = f_test["Class"]
        
        test_snps = f_test["SNP ID"]
        test_genes = f_test["gene_ids"]

        f_test.drop(columns = ["Class", "chromosome", "SNP ID", "gene_ids"], inplace = True)
        f_train.drop(columns = ["Class", "chromosome", "SNP ID", "gene_ids"], inplace = True)
        
        if classifier == "SVM":
            clf = SVR(gamma="auto")
        if classifier == "LR":
            from warnings import filterwarnings
            filterwarnings('ignore')
            clf = LogisticRegression(max_iter = 10000)
        if classifier == "RF":
            clf = RandomForestRegressor(n_estimators = 1000, n_jobs = -1, max_features = "sqrt", max_depth = 5)
        clf.fit(f_train, train_class)
        
        if classifier == "LR":
            outcomes = pd.concat([outcomes, pd.DataFrame({"predicted" : clf.predict_proba(f_test)[:,1],
                                                           "chromsome" : chrom,
                                                           "Class" : test_class,
                                                           "SNP ID" : test_snps,
                                                           "ENSEMBL" : test_genes})])
        else:
            outcomes = pd.concat([outcomes, pd.DataFrame({"predicted" : clf.predict(f_test),
                                               "chromsome" : chrom,
                                               "Class" : test_class,
                                               "SNP ID" : test_snps,
                                               "ENSEMBL" : test_genes})])
            
    if len(outcomes) > 0:
        outcomes = outcomes.sort_values(["SNP ID", "predicted"], ascending = False)
        outcomes["For-SNP rank"] = outcomes.groupby("SNP ID").cumcount() + 1

        scores_df.at[indx, "Recall snps"] = len(set(outcomes["SNP ID"]))
        scores_df.at[indx, "Recall genes"] = len(set(outcomes["ENSEMBL"]))
        scores_df.at[indx, "Recall entries"] = sum(outcomes["Class"])

        fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"], -outcomes["For-SNP rank"], pos_label = 1)
        scores_df.at[indx, "ROC-AUC overall (lso)"] = sklearn.metrics.auc(fpr, tpr) * 100

        # Calculate the ROC-AUC for every SNP and average the result
        SNPS2 = list(set(outcomes["SNP ID"]))
        aucs = []
        for snp in SNPS2:
          if len(set(outcomes["Class"][outcomes["SNP ID"] == snp])) == 1:
              aucs.append(list(set(outcomes["Class"][outcomes["SNP ID"] == snp]))[0])
          else:
              fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"][outcomes["SNP ID"] == snp], outcomes["predicted"][outcomes["SNP ID"] == snp], pos_label = 1)
              aucs.append(sklearn.metrics.auc(fpr, tpr))
        scores_df.at[indx, "ROC-AUC - mean per snpl (lso)"] = sum(aucs)/len(aucs)


        outcomes = outcomes.sort_values(["SNP ID", "predicted"], ascending = False)
        SNP_temp = 0
        counter = 0
        prediction_temp = 9999
        for indx2, row in outcomes.iterrows():
            if SNP_temp != row["SNP ID"]:
                SNP_temp = row["SNP ID"]
                counter = 1
                prediction_temp = row["predicted"]
            elif SNP_temp == row["SNP ID"] and prediction_temp != row["predicted"]:
                counter += 1
                prediction_temp = row["predicted"]
            ref.at[indx2, "For-SNP rank"] = counter

        # In[22]:

        # Calculate hits @1
        scores_df.at[indx, "Hits@1(lso)"] = sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] == 1)])

        # In[23]:

        # Calculate hits @3
        scores_df.at[indx, "Hits@3(lso)"] = sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 3)])

        # In[24]:

        # Calculate hits @5
        scores_df.at[indx, "Hits@5(lso)"] = sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 5)])

        # In[25]:

        # Calculate hits @10
        scores_df.at[indx, "Hits@10(lso)"] = sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 10)])

        # In[26]:

        scores_df.at[indx, "Mean rank (lso)"] = outcomes["For-SNP rank"][(outcomes["Class"] == 1)].mean()

        # In[27]:

        scores_df.at[indx, "Median rank (lso)"] = outcomes["For-SNP rank"][outcomes["Class"] == 1].quantile(q = [0,0.25,0.5,0.75,1])[0.50]

    indx += 1
scores_df.to_csv("~/git/Post-GWAS/Ensemble results scores " + refset + " with classifier " + classifier + " " + datetime.today().strftime("%d-%m-%Y") + ".csv", index = False, sep = ";", decimal = ",")

In [11]:
ranks_df = pd.DataFrame(index = range(1, len(ranks_combos) + 1), columns = ranks)
ranks_df = ranks_df.fillna(False)
indx = 1
for s in ranks_combos:
    #print("Analyzing " + str(indx) + " out of " + str(len(ranks_combos)))
    ranks_df.at[indx, list(s)] = True
    
    f = ref.copy()
    f = f[["chromosome", "Class", "SNP ID", "gene_ids"] + list(s)]
    f.dropna(subset = list(s), inplace = True)
    
    # Drop all SNPs which no longer have a positive case
    pos_counts = f.groupby("SNP ID")["Class"].sum()
    f = f[~f["SNP ID"].isin(pos_counts[pos_counts == 0].index)]
    
    outcomes = pd.DataFrame()

    chromosomes = list(set(f["chromosome"]))

    for chrom in chromosomes:
        # print("Predicting candidates for chromosome " + str(chrom))

        f_test = f[f["chromosome"] == chrom].copy()
        f_train = f[f["chromosome"] != chrom].copy()

        train_class = f_train["Class"]
        test_class = f_test["Class"]
        
        test_snps = f_test["SNP ID"]
        test_genes = f_test["gene_ids"]

        f_test.drop(columns = ["Class", "chromosome", "SNP ID", "gene_ids"], inplace = True)
        f_train.drop(columns = ["Class", "chromosome", "SNP ID", "gene_ids"], inplace = True)
        
        if classifier == "SVM":
            clf = SVR(gamma="auto")
        if classifier == "LR":
            from warnings import filterwarnings
            filterwarnings('ignore')
            clf = LogisticRegression(max_iter = 10000)
        if classifier == "RF":
            clf = RandomForestRegressor(n_estimators = 1000, n_jobs = -1, max_features = "sqrt", max_depth = 5)
        clf.fit(f_train, train_class)

        if classifier == "LR":
            outcomes = pd.concat([outcomes, pd.DataFrame({"predicted" : clf.predict_proba(f_test)[:,1],
                                                           "chromsome" : chrom,
                                                           "Class" : test_class,
                                                           "SNP ID" : test_snps,
                                                           "ENSEMBL" : test_genes})])
        else:
            outcomes = pd.concat([outcomes, pd.DataFrame({"predicted" : clf.predict(f_test),
                                               "chromsome" : chrom,
                                               "Class" : test_class,
                                               "SNP ID" : test_snps,
                                               "ENSEMBL" : test_genes})])
            
    if len(outcomes) > 0:
        outcomes = outcomes.sort_values(["SNP ID", "predicted"], ascending = False)
        outcomes["For-SNP rank"] = outcomes.groupby("SNP ID").cumcount() + 1

        ranks_df.at[indx, "Recall snps"] = len(set(outcomes["SNP ID"]))
        ranks_df.at[indx, "Recall genes"] = len(set(outcomes["ENSEMBL"]))
        ranks_df.at[indx, "Recall entries"] = sum(outcomes["Class"])

        fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"], -outcomes["For-SNP rank"], pos_label = 1)
        ranks_df.at[indx, "ROC-AUC overall (lso)"] = sklearn.metrics.auc(fpr, tpr) * 100

        # Calculate the ROC-AUC for every SNP and average the result
        SNPS2 = list(set(outcomes["SNP ID"]))
        aucs = []
        for snp in SNPS2:
          if len(set(outcomes["Class"][outcomes["SNP ID"] == snp])) == 1:
              aucs.append(list(set(outcomes["Class"][outcomes["SNP ID"] == snp]))[0])
          else:
              fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"][outcomes["SNP ID"] == snp], outcomes["predicted"][outcomes["SNP ID"] == snp], pos_label = 1)
              aucs.append(sklearn.metrics.auc(fpr, tpr))
        ranks_df.at[indx, "ROC-AUC - mean per snpl (lso)"] = sum(aucs)/len(aucs)


        outcomes = outcomes.sort_values(["SNP ID", "predicted"], ascending = False)
        SNP_temp = 0
        counter = 0
        prediction_temp = 9999
        for indx2, row in outcomes.iterrows():
            if SNP_temp != row["SNP ID"]:
                SNP_temp = row["SNP ID"]
                counter = 1
                prediction_temp = row["predicted"]
            elif SNP_temp == row["SNP ID"] and prediction_temp != row["predicted"]:
                counter += 1
                prediction_temp = row["predicted"]
            ref.at[indx2, "For-SNP rank"] = counter

        # In[22]:

        # Calculate hits @1
        ranks_df.at[indx, "Hits@1(lso)"] = sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] == 1)])

        # In[23]:

        # Calculate hits @3
        ranks_df.at[indx, "Hits@3(lso)"] = sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 3)])

        # In[24]:

        # Calculate hits @5
        ranks_df.at[indx, "Hits@5(lso)"] = sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 5)])

        # In[25]:

        # Calculate hits @10
        ranks_df.at[indx, "Hits@10(lso)"] = sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 10)])

        # In[26]:

        ranks_df.at[indx, "Mean rank (lso)"] = outcomes["For-SNP rank"][(outcomes["Class"] == 1)].mean()

        # In[27]:

        ranks_df.at[indx, "Median rank (lso)"] = outcomes["For-SNP rank"][outcomes["Class"] == 1].quantile(q = [0,0.25,0.5,0.75,1])[0.50]

    indx += 1
ranks_df.to_csv("~/git/Post-GWAS/Ensemble results ranks " + refset + " with classifier " + classifier + " " + datetime.today().strftime("%d-%m-%Y") + ".csv", index = False, sep = ";", decimal = ",")