# Create DeRycke reference set

In [1]:
# Load the required packages
import pandas as pd

In [2]:
# Analysis parameters used by the cells below.
# Half-width (in bp) of the window around a SNP in which a gene boundary must fall.
max_distance = 2 * 10 ** 6
# Fold-change threshold for over-expression; its reciprocal is the under-expression threshold.
FC_cutoff = 1.5
# Highest FDR at which a gene is still labelled as differentially expressed.
FDR_cutoff = 1e-4
# When True, SNP-gene pairs outside the distance window are removed.
distance_filter = True

In [18]:
# Read in the processed DeRycke table that is used as a reference set.
# NOTE(review): this comment previously named Farashi et al., but the file loaded
# here is "DeRycke processed.xlsx" - confirm the attribution.
dr = pd.read_excel("DeRycke processed.xlsx")

In [4]:
# Drop all snps that target non protein-coding sequences: keep the rows whose
# gene type is "Coding" and whose significant-gene field is not the literal "none".
is_coding = dr["Gene Type"] == "Coding"
has_significant_gene = dr["Significant Genes (Group)*"] != "none"
dr = dr[is_coding & has_significant_gene]

In [23]:
# Expand collapsed rows: each (new column, source column, separator) triple splits the
# source string into a list and then gives every list element a row of its own.
for new_column, source_column, separator in (
    ("SNP ID", "Risk SNPs *", ","),
    ("gene", "Significant Genes (Group)*", "; "),
):
    dr[new_column] = dr[source_column].str.split(separator)
    dr = dr.explode(new_column)

In [6]:
# Remove a parenthesized single digit, e.g. "(2)", from the end of each gene name.
# The pattern is a raw string so the backslashes reach the regex engine without
# relying on Python passing through unknown escape sequences. The result is assigned
# back to the column because an in-place replace on `dr["gene"]` is a chained
# operation that pandas does not guarantee to propagate to `dr`.
dr["gene"] = dr["gene"].replace(r"\(\d\)$", "", regex=True)

In [7]:
# Columns of the source table that are no longer used once the SNP and gene
# columns have been expanded.
obsolete_columns = [
    'Risk Locus',
    'Risk Region (hg19)',
    'Risk Region #',
    'Risk SNPs *',
    'N \nRisk SNPs',
    'Region Status',
    'N SNPs \nTested',
    'Gene Type',
    'N Genes \nTested',
    'N Tests',
    'N Significant Genes',
    'Significant Genes (Group)*',
]
dr = dr.drop(columns=obsolete_columns)

In [8]:
# SNP identifiers that are rewritten to another identifier.
snp_replacement_dictionary = {
    "rs74738513": "rs11673591",
    "rs114376585": "rs3096702",
}
# SNP identifiers whose rows are removed.
dropped_snps = ["chr6:160581543:I", "kgp8152823"]

# Gene names that are rewritten to another name (whole-value matches only).
gene_replacement_dictionary = {
    "AGAP7": "AGAP7P",
    "\nHLA-DQB2": "HLA-DQB2",
    "APITD1": "CENPS",
    "APITD1-CORT": "CENPS-CORT",
    "BZRAP1": "TSPOAP1",
    "C10orf32": "BORCS7",
    "C10orf32-ASMT": "BORCS7-ASMT",
    "C2orf43": "LDAH",
    "RAB7L1": "RAB29",
    "TMEM180": "MFSD13A",
    "WDR52": "CFAP44",
}

# Gene names whose rows are removed.
unmappable = {"LILRA3", "HCG27", "AC104667.3", "CTB-102L5.4", "RP11-817J15.3"}

# Rename first, then filter. The replacements work per value, so doing both renames
# before the two filters selects the same rows as interleaving them.
dr["gene"] = dr["gene"].replace(gene_replacement_dictionary)
dr["SNP ID"] = dr["SNP ID"].replace(snp_replacement_dictionary)

usable_gene = ~dr["gene"].isin(unmappable)
usable_snp = ~dr["SNP ID"].isin(dropped_snps)
dr = dr[usable_gene & usable_snp]

In [9]:
# Get the SNP properties from dbsnp
import myvariant
mv = myvariant.MyVariantInfo()
   
# Query every distinct SNP identifier once (the set removes repeats), matching on
# the dbsnp rsid and requesting only the "dbsnp" field. The result is indexed with
# "out" in the next cell. A single identifier can return several hits: the recorded
# run output reports 51 query terms with duplicate hits.
dbsnp = mv.querymany(list(set(dr["SNP ID"])), scopes='dbsnp.rsid', fields='dbsnp', returnall = True)

querying 1-115...done.
Finished.
51 input query terms found dup hits:
	[('rs34925593', 2), ('rs8093601', 3), ('rs10122495', 2), ('rs7127900', 3), ('rs6853490', 2), ('rs130


In [10]:
# Tabulate the returned hits and keep those whose query is a SNP of the reference set
all_hits = pd.DataFrame(dbsnp["out"])
dbsnp_tab = all_hits[all_hits["query"].isin(dr["SNP ID"])]


def _hg19_start(record):
    """Return the hg19 start position of a dbsnp payload, or None without an hg19 entry."""
    if "hg19" in record:
        return record["hg19"]["start"]
    return None


# Extract the data from the payloads
payloads = dbsnp_tab["dbsnp"]
dbsnp_tab["chromosome"] = payloads.apply(lambda record: record["chrom"])
dbsnp_tab["location"] = payloads.apply(_hg19_start)
dbsnp_tab["ref"] = payloads.apply(lambda record: record["ref"])
dbsnp_tab["alt"] = payloads.apply(lambda record: record["alt"])

In [11]:
# Attach chromosome, position and alleles to every reference entry with an outer join
# on the SNP identifier; the join key "query" is discarded afterwards.
annotation_columns = ["query", "chromosome", "location", "ref", "alt"]
positives = dr.merge(
    dbsnp_tab[annotation_columns],
    how="outer",
    left_on="SNP ID",
    right_on="query",
)
positives = positives.drop(columns=["query"])

In [12]:
# Use the prostate cancer set to extract Ensembl identifiers and map them, as well as getting gene coordinates for the negatives
dex = pd.read_csv("../Gene expression data/Galaxy37-[edgeR_DGE_on_2__design_matrix_prostate_unpaired.txt_-_differentially_expressed_genes].tabular.annotated.txt",
                  delimiter="\t", index_col=0, names=["ENSEMBL", "gene", "logFC", "logCPM", "LR", "pValue", "FDR"], header=0)

# Remove every ".<digits>" part from the identifiers (presumably the Ensembl version
# suffix). The pattern is a raw string so the backslashes are not treated as Python
# escapes. The result is assigned back because an in-place replace on
# `dex["ENSEMBL"]` is a chained operation that pandas does not guarantee to
# propagate to `dex`.
dex["ENSEMBL"] = dex["ENSEMBL"].replace(r"(\.\d+)", "", regex=True)

# The logFC is a base-2 logarithm; convert it to a plain fold change
dex["FC"] = 2**dex["logFC"]

# Assign differential expression to the genes based on the previously set parameters:
# a gene must pass the FDR cutoff and change by at least FC_cutoff in either direction.
dex["Diff expression"] = "Unchanged"
dex.loc[(dex["FDR"] <= FDR_cutoff) & (dex["FC"] <= 1/FC_cutoff), "Diff expression"] = "Underexpressed"
dex.loc[(dex["FDR"] <= FDR_cutoff) & (dex["FC"] >= FC_cutoff), "Diff expression"] = "Overexpressed"

In [13]:
# Initiate the connection to the database (first time you may have to run ensembl.download() and ensembl.index())
# To align with our reference set, we take an Ensembl version from 2019
# NOTE(review): release 92 appears to date from 2018 rather than 2019 - confirm
# which release is intended.
from pyensembl import EnsemblRelease
ensembl = EnsemblRelease(92)

# Function to extract the data (and prevent superfluous queries)
def getEnsemblData(id):
    """Look up one gene identifier in the module-level `ensembl` release.

    Returns a Series with the entries "gene name", "chromosome", "gene start",
    "gene stop" and "protein_coding". When the lookup raises ValueError, every
    entry is None.
    """
    labels = ("gene name", "chromosome", "gene start", "gene stop", "protein_coding")
    try:
        gene = ensembl.gene_by_id(id)
        values = (gene.gene_name, gene.contig, gene.start, gene.end, gene.is_protein_coding)
    except ValueError:
        # Same labels with empty values, so every row yields the same columns.
        values = (None,) * len(labels)
    return pd.Series(dict(zip(labels, values)))

In [14]:
# Look up every Ensembl identifier and store the returned fields as new columns of dex.
ensembl_columns = ["gene name", "chromosome", "gene start", "gene stop", "protein_coding"]
dex[ensembl_columns] = dex["ENSEMBL"].apply(getEnsemblData)

In [16]:
# All non-protein coding entries are removed
protein_coding_rows = dex[dex["protein_coding"] == True]

# Drop the mitochondrial genes and the entries that could not be found.
# The rows are removed by index label, as before.
excluded_labels = protein_coding_rows[protein_coding_rows["chromosome"].isin(["MT", None])].index
dex = protein_coding_rows.drop(excluded_labels)

In [17]:
# One dictionary per exon known to the selected Ensembl release, collected into a table.
exon_records = [exon.to_dict() for exon in ensembl.exons()]
exons = pd.DataFrame(exon_records)

In [18]:
# Create a dataframe of all combinations of SNPs and genes that are on the same chromosome.
# Entries without a position are removed first. The location is None when a dbsnp
# record has no hg19 entry, and the outer merge also leaves it empty for SNPs without
# a dbsnp hit. Such rows cannot be cast to int and cannot be placed near any gene.
positives = positives.dropna(subset=["location"]).astype({"location": int})
snp_candidates = positives[["SNP ID", "chromosome", "location", "gene"]].merge(
    dex[["ENSEMBL", "gene name", "chromosome", "gene start", "gene stop", "Diff expression"]],
    on="chromosome", how="inner")

# Drop all entries that do not describe an unique pair of SNP and gene.
# If gene start/stop would also be used to determine duplicates, at most 57 extra entries would be found.
# This number is considered to be insignificant.
snp_candidates = snp_candidates.drop_duplicates(["SNP ID", "gene name"], keep="last")

In [None]:
# Select the pairs where the SNP window and the gene windows overlap: a gene is a
# candidate when its start or its stop lies within max_distance of the SNP location
# (bounds inclusive). Whole-column comparisons replace the row-by-row lambda.
window_low = snp_candidates["location"] - max_distance
window_high = snp_candidates["location"] + max_distance
stop_in_window = (snp_candidates["gene stop"] >= window_low) & (snp_candidates["gene stop"] <= window_high)
start_in_window = (snp_candidates["gene start"] >= window_low) & (snp_candidates["gene start"] <= window_high)
snp_candidates["candidate"] = stop_in_window | start_in_window

# Without any distance filter, there are 244 unique SNP-gene pairs
if distance_filter:
    snp_candidates = snp_candidates[snp_candidates["candidate"]]

In [None]:
# Mark the entries in the SNP candidates as positives. Also remove all duplicate SNP-gene entries.
positives["Class"] = 1
positives = positives.drop_duplicates(["SNP ID", "gene"], keep="last")

# Assign reference to set: candidates that match a reference SNP-gene pair get
# Class 1 through the left join, all other candidates end up without a value.
f = snp_candidates.drop(columns=["gene", "candidate"]).merge(
    positives[["Class", "SNP ID", "gene"]],
    how="left", left_on=["SNP ID", "gene name"], right_on=["SNP ID", "gene"])

# Unmatched candidates are the negatives (Class 0). The filled column is assigned
# back: an in-place fillna on `f["Class"]` is a chained operation that may leave `f`
# unchanged, after which the integer cast would fail on the remaining NaN values.
f["Class"] = f["Class"].fillna(0).astype(int)

# Also removed a column that was only used for merging.
f = f.drop(columns="gene")

# Drop all entries that don't have at least one positive case. The copy makes `f`
# an independent frame so later column assignments do not act on a slice.
snps_with_positive = f.loc[f["Class"] == 1, "SNP ID"]
f = f[f["SNP ID"].isin(snps_with_positive)].copy()

In [None]:
# Calculate the distance between the gene and the SNP
gene_bounds = ["gene start", "gene stop"]
f[gene_bounds] = f[gene_bounds].astype(int)


def _nearest_boundary_offset(row):
    """Return the signed offset (gene boundary minus SNP location) of the closest gene boundary."""
    start_offset = row["gene start"] - row["location"]
    stop_offset = row["gene stop"] - row["location"]
    # On a tie the gene start wins, as the first element does in min(..., key=abs).
    return start_offset if abs(start_offset) <= abs(stop_offset) else stop_offset


signed_offsets = f.apply(_nearest_boundary_offset, axis=1).astype(int)
# The absolute column is written first so the column order stays the same.
f["bp distance absolute"] = abs(signed_offsets)
f["bp distance"] = signed_offsets

In [None]:
# Check whether any snps are on the exons. These should be removed.
# Pair every SNP-gene row with all exons of that gene (one row per exon).
exon_rows = f[["SNP ID", "ENSEMBL", "location"]].merge(
    exons[["start", "end", "gene_id"]], left_on="ENSEMBL", right_on="gene_id")

# A SNP lies on an exon when start <= location <= end. Each comparison is
# parenthesized because `&` binds tighter than `>=`: the unparenthesized form
# `location >= start & end >= location` compared the location with the bitwise AND
# of start and end, which practically never matches.
on_exon = (exon_rows["start"] <= exon_rows["location"]) & (exon_rows["location"] <= exon_rows["end"])
exonic_pairs = set(zip(exon_rows.loc[on_exon, "SNP ID"], exon_rows.loc[on_exon, "ENSEMBL"]))

# Remove the whole SNP-gene pair. Dropping only the matching exon row would leave
# the pair in place through the other exons of the same gene.
# NOTE(review): this removes the SNP-gene pair, not every row of the SNP - confirm
# that this is the intended meaning of "removed".
keep_pair = [pair not in exonic_pairs for pair in zip(f["SNP ID"], f["ENSEMBL"])]
f = f.loc[keep_pair]
f = f.drop_duplicates()

# A SNP may have lost its only positive pair above, so the rule that every SNP
# needs at least one positive case is applied again.
f = f[f["SNP ID"].isin(f.loc[f["Class"] == 1, "SNP ID"])]

In [None]:
# Calculate the rank of the gene as compared to other genes: within each SNP the
# genes are ordered by absolute distance and numbered from 1 in that order.
ranked = f.sort_values(by=["SNP ID", "bp distance absolute"])
ranked["Gene rank"] = ranked.groupby("SNP ID").cumcount() + 1
f = ranked

In [None]:
# Write file to CSV (without the index column).
# NOTE(review): the target is an absolute path inside one user's home directory,
# while the inputs above are read through relative paths - confirm before running
# on another machine.
f.to_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/DeRycke/DeRycke reference set.csv", index = False)

In [None]:
import sklearn.metrics

# ROC over all SNP-gene pairs, scoring each pair with the negated gene rank so that
# a smaller rank gives a higher score. The AUC is printed as a percentage.
rank_scores = -f["Gene rank"]
fpr, tpr, thresholds = sklearn.metrics.roc_curve(f["Class"], rank_scores, pos_label=1)
overall_auc = sklearn.metrics.auc(fpr, tpr)
print(overall_auc * 100)

In [None]:
# Mean gene rank of the positive pairs: summed rank of the Class 1 rows divided by
# the number of Class 1 rows (Class is 0 or 1, so its sum is that count).
positive_ranks = f["Gene rank"][f["Class"] == 1]
sum(positive_ranks) / sum(f["Class"])

In [None]:
# Per-SNP ROC AUC with the negated gene rank as score, averaged over all SNPs.
SNPS2 = list(set(f["SNP ID"]))
aucs = []
for snp in SNPS2:
    rows_of_snp = f["SNP ID"] == snp
    snp_labels = f["Class"][rows_of_snp]
    snp_scores = -f["Gene rank"][rows_of_snp]
    fpr, tpr, thresholds = sklearn.metrics.roc_curve(snp_labels, snp_scores, pos_label=1)
    aucs.append(sklearn.metrics.auc(fpr, tpr))
print(sum(aucs) / len(aucs))