# Calculate performance based on genetic distance

In [1]:
# Load the required packages
import pandas as pd
from itertools import product
import sklearn.metrics

In [2]:
# Distance settings to evaluate. The evaluation loop multiplies each numeric
# entry by 1000 before comparing it with the "bp distance absolute" column;
# the special value "depict" selects the DEPICT-locus candidate set instead.
all_bp_distances = [25, 50, 100, 500, 1000, 2000] + ["depict"]

# Names of the reference sets that are evaluated for every distance setting.
refsets = [
    "farashi",
    "farashi p-value cutoff",
    "DeRycke",
    "Teslovich",
]

In [3]:
# One row per (reference set, distance setting) combination, with the
# reference set varying slowest; metric columns are added to this frame later.
all_metrics = pd.DataFrame(
    [(refset, distance) for refset in refsets for distance in all_bp_distances],
    columns=["refset", "bp distance"],
)

In [4]:
# Location of the DEPICT gene-prioritization output for every reference set.
DEPICT_OUTPUT_PATHS = {
    "farashi": "~/git/DEPICT/outcomes/Farashi complete 2nd round/farashi_no_pvalue_filtering_geneprioritization.txt",
    "farashi p-value cutoff": "~/git/DEPICT/outcomes/Farashi complete 2nd round/farashi_default_pvalue_filtering_geneprioritization.txt",
    "DeRycke": "~/git/DEPICT/outcomes/DeRycke/DeRycke_output_geneprioritization.txt",
    "Teslovich": "~/git/DEPICT/outcomes/Teslovich for paper Wytze/Teslovich_output_geneprioritization.txt",
}

# SNP identifiers in the DEPICT "Locus" column that are rewritten before the
# join on "SNP ID" (presumably so they match the IDs used in the reference
# sets -- TODO confirm).
DEPICT_SNP_REPLACEMENTS = {
    "rs113645266": "rs6557271",
    "rs150282463": "rs13137700",
    "rs67276543": "rs34884832",
}


def _load_reference_set(refset):
    """Read the reference set called ``refset`` from disk.

    The DeRycke and Teslovich files get their columns renamed on load. The
    Farashi file is used with the column names it has on disk, and the
    "farashi p-value cutoff" variant keeps only rows with a p-value <= 5e-8.

    Raises:
        ValueError: if ``refset`` is not one of the four known names.
    """
    if refset in ("farashi", "farashi p-value cutoff"):
        ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Farashi/Farashi full 2000000 bp distance no pvalue filtering.csv")
        if refset == "farashi p-value cutoff":
            ref = ref[ref["GWAS/eQTL p-value¥"] <= 5e-8]
        return ref

    if refset == "DeRycke":
        ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/DeRycke/DeRycke reference set.csv", delimiter=";")
        ref.columns = ["SNP ID", "chromosome", "location", "gene_ids", "gene name", "gene start", "gene stop", "Diff expression", "Class", "bp distance absolute", "bp distance", "Gene rank"]
        return ref

    if refset == "Teslovich":
        ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Teslovich/Teslovich reference set.csv")
        ref.columns = ["SNP ID", "chromosome", "location", "P", "gene_ids", "gene name", "gene start", "gene stop", "Class", "bp distance absolute", "bp distance", "Gene rank"]
        return ref

    # Previously an unknown name silently reused the frame of the prior iteration.
    raise ValueError(f"Unknown reference set: {refset!r}")


def _restrict_to_depict_candidates(ref, refset):
    """Keep only the SNP-gene pairs that DEPICT reports and re-rank them.

    Returns a new frame; ``ref`` itself is not modified.
    """
    if refset not in DEPICT_OUTPUT_PATHS:
        raise ValueError(f"No DEPICT output configured for reference set: {refset!r}")
    depict = pd.read_csv(DEPICT_OUTPUT_PATHS[refset], sep="\t")

    # A "Locus" cell can hold several ";"-separated SNPs: give each its own row.
    depict["Locus"] = depict["Locus"].astype(str).str.split(";")
    depict = depict.explode("Locus")
    depict["Locus"] = depict["Locus"].replace(DEPICT_SNP_REPLACEMENTS)

    depict = depict[["Locus", "Ensembl gene ID"]].rename(
        columns={"Locus": "SNP ID", "Ensembl gene ID": "gene_ids"}
    )

    candidates = ref.merge(depict, on=["SNP ID", "gene_ids"], how="inner")

    # Drop all unmappable candidates
    candidates = candidates.dropna(subset=["gene_ids"])

    # Rank 1 must be the gene closest to the SNP: every metric below treats a
    # low rank as the best candidate. The previous descending sort handed
    # rank 1 to the most distant gene instead.
    candidates = candidates.sort_values(["SNP ID", "bp distance absolute"], ascending=True)
    candidates["Gene rank"] = candidates.groupby("SNP ID").cumcount() + 1
    return candidates


# Every reference set is needed once per distance setting; read each file only once.
loaded_reference_sets = {}

for am_index, am_values in all_metrics.iterrows():
    refset = am_values["refset"]
    bp_distance = am_values["bp distance"]

    # Load the reference set
    if refset not in loaded_reference_sets:
        loaded_reference_sets[refset] = _load_reference_set(refset)
    ref = loaded_reference_sets[refset]

    if bp_distance == "depict":
        ref = _restrict_to_depict_candidates(ref, refset)
    else:
        # The configured value is scaled by 1000 before it is compared with
        # the "bp distance absolute" column.
        max_bp_distance = bp_distance * 1000
        ref = ref[ref["bp distance absolute"] <= max_bp_distance]

    # Drop all SNPs which no longer have a positive case
    pos_counts = ref.groupby("SNP ID")["Class"].sum()
    ref = ref[~ref["SNP ID"].isin(pos_counts[pos_counts == 0].index)]

    is_positive = ref["Class"] == 1
    is_negative = ref["Class"] == 0

    all_metrics.at[am_index, "Recall snps"] = len(set(ref["SNP ID"]))
    all_metrics.at[am_index, "Recall combinations"] = ref["Class"].sum()

    # A lower rank is a better candidate, so the negated rank is the score.
    fpr, tpr, _ = sklearn.metrics.roc_curve(ref["Class"], -ref["Gene rank"], pos_label=1)
    # NOTE(review): this value is scaled to a percentage while the per-SNP
    # mean below stays a fraction; kept as-is so the output does not change.
    all_metrics.at[am_index, "ROC-AUC overall (lso)"] = sklearn.metrics.auc(fpr, tpr) * 100

    # Calculate the ROC-AUC for every SNP and average the result
    aucs = []
    for _, snp_rows in ref.groupby("SNP ID"):
        snp_classes = snp_rows["Class"]
        if len(snp_classes.unique()) == 1:
            # ROC-AUC is undefined with a single class; use that class value
            # itself as the score for this SNP.
            aucs.append(snp_classes.iloc[0])
        else:
            fpr, tpr, _ = sklearn.metrics.roc_curve(snp_classes, -snp_rows["Gene rank"], pos_label=1)
            aucs.append(sklearn.metrics.auc(fpr, tpr))
    all_metrics.at[am_index, "ROC-AUC - mean per snpl (lso)"] = sum(aucs) / len(aucs)

    # Calculate hits @1, @3, @5 and @10: the number of positive SNP-gene
    # pairs ranked at or above the cutoff.
    all_metrics.at[am_index, "Hits@1(lso)"] = ref.loc[is_positive & (ref["Gene rank"] == 1), "Class"].sum()
    for cutoff in (3, 5, 10):
        all_metrics.at[am_index, f"Hits@{cutoff}(lso)"] = ref.loc[is_positive & (ref["Gene rank"] <= cutoff), "Class"].sum()

    positive_ranks = ref.loc[is_positive, "Gene rank"]
    all_metrics.at[am_index, "Mean rank (lso)"] = positive_ranks.mean()
    all_metrics.at[am_index, "Median rank (lso)"] = positive_ranks.median()

    # Number of distinct ranks per SNP, for all, negative and positive candidates.
    candidates_per_snp = ref.groupby("SNP ID")["Gene rank"].nunique()
    negatives_per_snp = ref[is_negative].groupby("SNP ID")["Gene rank"].nunique()
    positives_per_snp = ref[is_positive].groupby("SNP ID")["Gene rank"].nunique()

    all_metrics.at[am_index, "Mean number of candidates for every SNP"] = candidates_per_snp.mean()
    all_metrics.at[am_index, "Mean number of negative candidates for every SNP"] = negatives_per_snp.mean()
    all_metrics.at[am_index, "Mean number of positive candidates for every SNP"] = positives_per_snp.mean()
    all_metrics.at[am_index, "Median number of candidates for every SNP"] = candidates_per_snp.median()
    all_metrics.at[am_index, "Median number of negative candidates for every SNP"] = negatives_per_snp.median()
    all_metrics.at[am_index, "Min number of candidates for every SNP"] = candidates_per_snp.min()
    all_metrics.at[am_index, "Max number of candidates for every SNP"] = candidates_per_snp.max()
    all_metrics.at[am_index, "Min number of negative candidates for every SNP"] = negatives_per_snp.min()
    all_metrics.at[am_index, "Max number of negative candidates for every SNP"] = negatives_per_snp.max()
    all_metrics.at[am_index, "Total number of negative genes"] = len(set(ref[is_negative]["gene_ids"]))
    all_metrics.at[am_index, "Total number of positive genes"] = len(set(ref[is_positive]["gene_ids"]))
    all_metrics.at[am_index, "Total number of negative cases"] = len(ref[is_negative])
    all_metrics.at[am_index, "Total number of cases"] = len(ref)

In [5]:
# Write the metrics table to disk as a semicolon-separated file that uses a
# decimal comma, without the row index.
all_metrics.to_csv(
    "/Users/vlietstraw/git/Post-GWAS/all_variations_performance_metrics based on genetic distance.csv",
    sep=";",
    decimal=",",
    index=False,
)