# Create Teslovich reference set

In [1]:
# Load the required packages
import pandas as pd

In [2]:
# Switch for the distance filter: when True, SNP-gene pairs that are not
# flagged as candidates (gene outside the window around the SNP) are dropped.
distance_filter = True
# Half-width of the window around a SNP, in base pairs (2 Mb on either side).
max_distance = 2_000_000

In [3]:
# Read in Figure 1 of Teslovich et al., which is used as the reference set
reference_workbook = "Figure 1 Teslovich et al.xlsx"
tes = pd.read_excel(reference_workbook)

In [4]:
# Identifiers as they appear in the Teslovich table, mapped to the names used
# in the rest of this notebook (presumably their current dbSNP / gene symbol
# equivalents).
snp_replacement_dictionary = {"rs9411489" : "rs635634"}
# ABCG5/8 doesn't exist, but ABCG5 and ABCG8 do. Therefore I have split them up in the input file
# Same holds for FADS1-2-3
# HLA and LILRA3 are still mysteries
gene_replacement_dictionary = {"LOC55908" : "ANGPTL8", "FLJ36070" : "MAMSTR", "MOSC1" : "MARC1", "MSL2L1" : "MSL2", "PLEC1" : "PLEC"}

# LILRA3 is strange, because it does have a Ensembl ID, but does not occur in arrays. I therefore remove it from the reference set
unmappable = {"LILRA3", "HLA"}
# Take an explicit copy: the column assignments below must write to an
# independent frame, not to a filtered slice of the table that was read in.
tes = tes[~tes["Locus"].isin(unmappable)].copy()

# Replace the listed SNP identifiers and gene symbols with their mapped names
tes["Lead SNP"] = tes["Lead SNP"].replace(snp_replacement_dictionary)
tes["Locus"] = tes["Locus"].replace(gene_replacement_dictionary)

In [5]:
# Look up the dbSNP record of every distinct lead SNP through MyVariant.info
import myvariant

# De-duplicate the lead SNPs so each rsID is submitted once.
lead_snps = list(set(tes["Lead SNP"]))
mv = myvariant.MyVariantInfo()
dbsnp = mv.querymany(lead_snps, scopes="dbsnp.rsid", fields="dbsnp", returnall=True)

querying 1-99...done.
Finished.
43 input query terms found dup hits:
	[('rs4731702', 2), ('rs2068888', 2), ('rs2925979', 2), ('rs4299376', 2), ('rs7515577', 3), ('rs51423


In [6]:
# Tabulate the query results and keep the rows that belong to a lead SNP.
# The explicit copy makes sure the new columns below are written to an
# independent frame rather than to a filtered slice.
dbsnp_tab = pd.DataFrame(dbsnp["out"])
dbsnp_tab = dbsnp_tab[dbsnp_tab["query"].isin(tes["Lead SNP"])].copy()

# Extract the data from the payloads
payloads = dbsnp_tab["dbsnp"]
dbsnp_tab["chromosome"] = payloads.apply(lambda x: x["chrom"])
# Records without an "hg19" entry get no location.
dbsnp_tab["location"] = payloads.apply(lambda x: x["hg19"]["start"] if "hg19" in x else None)
dbsnp_tab["ref"] = payloads.apply(lambda x: x["ref"])
dbsnp_tab["alt"] = payloads.apply(lambda x: x["alt"])

In [7]:
# Attach the dbSNP properties to the reference table. The outer join keeps
# reference rows without a dbSNP match (and vice versa); the helper key
# "query" is dropped once the join is done.
snp_properties = dbsnp_tab[["query", "chromosome", "location", "ref", "alt"]]
positives = tes.merge(
    snp_properties, how="outer", left_on="Lead SNP", right_on="query"
).drop(columns=["query"])
# Rename positionally; this relies on the reference table contributing exactly
# the first five columns, in this order.
positives.columns = ["gene", "SNP ID", "Lead Trait", "Other Traits", "P", "chromosome", "location", "ref", "alt"]

In [8]:
# Use the prostate cancer set to extract Ensembl identifiers and map them, as well as getting gene coordinates for the negatives
dex = pd.read_csv("../Gene expression data/Galaxy37-[edgeR_DGE_on_2__design_matrix_prostate_unpaired.txt_-_differentially_expressed_genes].tabular.annotated.txt", 
                  delimiter = "\t", index_col = 0, names = ["ENSEMBL", "gene", "logFC", "logCPM", "LR", "pValue", "FDR"], header = 0)
# Remove every ".<digits>" part (the version suffix) from the Ensembl
# identifiers. The pattern is a raw string so that "\." and "\d" reach the
# regex engine without relying on invalid string escapes, and the result is
# assigned back explicitly because an in-place replace on a column selection
# may act on a temporary copy.
dex["ENSEMBL"] = dex["ENSEMBL"].replace(r"(\.\d+)", "", regex = True)

In [9]:
# Initiate the connection to the database (first time you may have to run ensembl.download() and ensembl.index())
# The release is pinned to stay aligned with the reference set; the original
# note describes it as an Ensembl version from 2019.
# NOTE(review): SNP positions in this notebook are read from the dbSNP "hg19"
# field, while this release is presumably annotated on a newer genome build -
# confirm that both coordinate systems match.
from pyensembl import EnsemblRelease

ENSEMBL_RELEASE = 92
ensembl = EnsemblRelease(ENSEMBL_RELEASE)

# Function to extract the data (and prevent superfluous queries)
def getEnsemblData(id):
    """Return the basic properties of the gene with Ensembl identifier `id`.

    The gene is looked up with ``ensembl.gene_by_id``. The result is a
    pandas Series with the entries "gene name", "chromosome", "gene start",
    "gene stop" and "protein_coding". When the lookup raises ``ValueError``
    every entry is ``None``.
    """
    fields = ("gene name", "chromosome", "gene start", "gene stop", "protein_coding")
    try:
        gene = ensembl.gene_by_id(id)
        values = (gene.gene_name, gene.contig, gene.start, gene.end, gene.is_protein_coding)
    except ValueError:
        # Unknown identifier: keep the same layout, filled with None.
        values = (None,) * len(fields)
    return pd.Series(dict(zip(fields, values)))

In [10]:
# Annotate every expression entry with the properties of its Ensembl gene.
# getEnsemblData returns one Series per identifier, so the apply yields a
# frame whose columns are assigned to the five property columns.
gene_property_columns = ["gene name", "chromosome", "gene start", "gene stop", "protein_coding"]
dex[gene_property_columns] = dex["ENSEMBL"].apply(getEnsemblData)

In [19]:
# All non-protein coding entries are removed. The explicit comparison with
# True also discards entries whose lookup failed (protein_coding is None).
dex = dex[dex["protein_coding"] == True]

# Drop the mitochondrial genes and the entries that could not be found.
# A boolean mask is used instead of dropping by index label, so that only the
# matching rows are removed even if index labels are not unique.
dex = dex[~dex["chromosome"].isin(["MT", None])]

In [20]:
# One row per exon known to the Ensembl release, built from each exon's dict form.
exon_records = [exon.to_dict() for exon in ensembl.exons()]
exons = pd.DataFrame(exon_records)

In [44]:
# Pair every reference SNP with every gene located on the same chromosome.
# The SNP position has to be an integer for the distance arithmetic later on.
positives["location"] = positives["location"].astype(int)
snp_columns = ["SNP ID", "chromosome", "location", "gene", "P"]
gene_columns = ["ENSEMBL", "gene name", "chromosome", "gene start", "gene stop"]
snp_candidates = positives[snp_columns].merge(dex[gene_columns], on="chromosome", how="inner")

# Keep a single row per (SNP, gene) pair; the last occurrence wins.
# If gene start/stop would also be used to determine duplicates, at most 57 extra entries would be found.
# This number is considered to be insignificant.
snp_candidates = snp_candidates.drop_duplicates(["SNP ID", "gene name"], keep="last")

In [57]:
# Select the pairs where the SNP window and the gene overlap.
# Two intervals overlap when each one starts before the other one ends. Unlike
# a check of the two gene end points against the window, this also catches a
# gene that spans the entire window (relevant for small values of max_distance).
# Assumes gene start <= gene stop - TODO confirm for the annotation source.
window_start = snp_candidates["location"] - max_distance
window_end = snp_candidates["location"] + max_distance
snp_candidates["candidate"] = (
    (snp_candidates["gene start"] <= window_end)
    & (snp_candidates["gene stop"] >= window_start)
)

# Without any distance filter, there are 244 unique SNP-gene pairs
if distance_filter:
    snp_candidates = snp_candidates[snp_candidates["candidate"]]

In [58]:
# Mark the entries in the SNP candidates as positives. Also remove all duplicate SNP-gene entries.
positives["Class"] = 1
positives.drop_duplicates(["SNP ID", "gene"], keep = "last", inplace = True)

# Assign reference to set: a candidate pair is positive when its SNP and gene
# name match a reference entry; all other pairs get no Class from the left join.
f = snp_candidates.drop(columns = ["gene", "candidate"]).merge(positives[["Class", "SNP ID", "gene"]], how = "left", left_on = ["SNP ID", "gene name"], right_on = ["SNP ID", "gene"])
# Unmatched pairs are the negatives. Assign the filled column back explicitly;
# an in-place fillna on a column selection may act on a temporary copy.
f["Class"] = f["Class"].fillna(0).astype(int)

# Also removed a column that was only used for merging.
f = f.drop(columns = "gene")

# Drop all entries that don't have at least one positive case
snps_with_positive = f.loc[f["Class"] == 1, "SNP ID"]
f = f[f["SNP ID"].isin(snps_with_positive)]

In [49]:
# Check whether any snps are on the exons. These should be removed.
# Pair every SNP-gene candidate with each exon of its gene.
exon_hits = f[["SNP ID", "ENSEMBL", "location"]].merge(
    exons[["start", "end", "gene_id"]], left_on = "ENSEMBL", right_on = "gene_id")

# Both comparisons are parenthesised on purpose: `&` binds tighter than `>=`,
# so `location >= start & end >= location` is evaluated as
# `location >= (start & end) >= location` and hardly ever holds.
inside_exon = (exon_hits["start"] <= exon_hits["location"]) & (exon_hits["location"] <= exon_hits["end"])
exonic_pairs = set(zip(exon_hits.loc[inside_exon, "SNP ID"], exon_hits.loc[inside_exon, "ENSEMBL"]))

# Remove the whole SNP-gene pair as soon as one exon of the gene contains the
# SNP. Dropping only the matching exon row would let the pair survive through
# the other exons of the same gene.
# NOTE(review): removal is done per SNP-gene pair; confirm that the SNP should
# not be removed for all of its genes instead.
keep = pd.Series([pair not in exonic_pairs for pair in zip(f["SNP ID"], f["ENSEMBL"])],
                 index = f.index, dtype = bool)
f = f[keep].drop_duplicates()

In [52]:
# Distance between the SNP and the nearest boundary (start or stop) of the gene
f[["gene start", "gene stop"]] = f[["gene start", "gene stop"]].astype(int)


def _offset_to_nearest_boundary(row):
    """Signed offset from the SNP to the closest gene boundary (start wins ties)."""
    offsets = [row["gene start"] - row["location"], row["gene stop"] - row["location"]]
    return min(offsets, key=abs)


nearest_offset = f.apply(_offset_to_nearest_boundary, axis=1).astype(int)
# The absolute column is created first to keep the column order of the output.
f["bp distance absolute"] = nearest_offset.abs()
# Negative values mean the nearest boundary lies before the SNP position.
f["bp distance"] = nearest_offset

In [None]:
# Rank the genes of each SNP by absolute distance: the closest gene gets rank 1
ranking_order = ["SNP ID", "bp distance absolute"]
f = f.sort_values(ranking_order)
position_within_snp = f.groupby("SNP ID").cumcount()
f["Gene rank"] = position_within_snp + 1

In [None]:
# Write the reference set to disk without the index column.
# NOTE(review): the destination is an absolute, user-specific path.
reference_set_path = "/Users/vlietstraw/git/Post-GWAS/Input sets/Teslovich/Teslovich reference set.csv"
f.to_csv(reference_set_path, index=False)

In [None]:
# Average gene rank of the positive pairs (sum of their ranks / number of positives)
positive_ranks = f["Gene rank"][f["Class"] == 1]
sum(positive_ranks) / sum(f["Class"])

In [None]:
import sklearn.metrics

# ROC AUC over all pairs, in percent. The rank is negated so that the closest
# gene (rank 1) receives the highest score.
gene_scores = -f["Gene rank"]
fpr, tpr, thresholds = sklearn.metrics.roc_curve(f["Class"], gene_scores, pos_label=1)
roc_auc = sklearn.metrics.auc(fpr, tpr)
print(roc_auc * 100)

In [None]:
# Mean of the ROC AUCs computed separately for every SNP
SNPS2 = list(set(f["SNP ID"]))
aucs = []
for snp in SNPS2:
    # Select the rows of this SNP once and reuse the mask for labels and scores.
    snp_rows = f["SNP ID"] == snp
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(
        f["Class"][snp_rows], -f["Gene rank"][snp_rows], pos_label=1
    )
    aucs.append(sklearn.metrics.auc(fpr, tpr))
print(sum(aucs) / len(aucs))