# Create reference sets for the Farashi data

In [1]:
# Load the required packages
import pandas as pd
# Lift pandas' display limits: None removes the cap on shown columns/rows,
# and cell text is shown up to 2000 characters wide.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.max_colwidth', 2000),
):
    pd.set_option(option_name, option_value)

In [2]:
# Analysis parameters used by the cells further down.
# Half-width (in bp) of the window around a SNP used when flagging candidate genes
max_distance = 2_000_000
# Fold-change threshold: FC >= cutoff is overexpressed, FC <= 1/cutoff is underexpressed
FC_cutoff = 1.5
# A gene is only called differentially expressed when its FDR is at or below this value
FDR_cutoff = 1e-4
# When True, SNP-gene pairs that are not flagged as candidate are dropped
distance_filter = False

In [3]:
# Load the supplementary table of Farashi et al.; it is used as the reference set.
# header = 1 takes the column names from the second row of the sheet.
farashi = pd.read_excel("41568_2018_87_MOESM1_ESM-3.xls", header = 1)

# Keep only rows that have both a SNP id and a gene; this discards the empty rows at the bottom
farashi = farashi.dropna(subset = ["SNP ID", "Target/assigned/e-Gene"])

In [4]:
# Create separate entries for cells with delimited genes.
# Cells use either ", " or ";" as delimiter, so normalise to ";" before splitting.
farashi["Target/assigned/e-Gene"] = farashi["Target/assigned/e-Gene"].str.replace(", ", ";", regex = False)

# explode() repeats the original row label for every gene of a cell. Reset the index so that
# every SNP-gene row gets its own label and label-based operations cannot hit sibling rows.
farashi = (farashi
           .assign(gene = farashi["Target/assigned/e-Gene"].str.split(';'))
           .explode('gene')
           .reset_index(drop = True))

# Strip whitespaces first, so that the comparison below also matches " FoxA1 binding"
farashi["gene"] = farashi["gene"].str.strip()

# One gene has as postfix "FoxA1 binding", which is not a gene itself. Remove only that entry
# with a boolean mask: dropping by index label would also remove the gene it was attached to,
# because both rows carried the same label after explode().
farashi = farashi[farashi["gene"] != "FoxA1 binding"].copy()

# Some gene names are outdated/erroneous; they are manually mapped onto the gene name
# that should be used instead (old name -> replacement).
gene_replacement_dictionary = {
    "MSMB1": "MSMB",
    "MSMB2": "MSMB",
    "NCOA4-1": "NCOA4P1",
    "NCOA4-3": "NCOA4P3",
    "ANKRD5": "ANKEF1",
    "C6orf228": "SMIM13",
    "HoxB13": "HOXB13",
    "LASS2": "CERS2",
    "C10orf32": "BORCS7",
    "LOC100505761": "RPARP-AS1",
    "LOC100505495": "PCAT19",
    "WDR52": "CFAP44",
    "HCG4P6": "HCG4B",
    "LOC285830": "HLA-F-AS1",
    "RAB7L1": "RAB29",
    "LOC284578": "MFSD4A-AS1",
    "AGAP7": "AGAP7P",
    "C2orf43": "LDAH",
    "FAM96B": "CIAO2B",
    "TMEM180": "MFSD13A",
    "WBSCR27": "METTL27",
    "KLK3 (PSA)": "KLK3",
    "PCAT1 (lncRNA)": "PCAT1",
    "SUV420H1": "KMT5B",
    "c-MYC": "MYC",
    "AL391261.1": "NCOA4P1",
}

# Swap every gene name listed in gene_replacement_dictionary for its replacement;
# names that are not listed are left unchanged.
updated_gene_names = farashi["gene"].replace(to_replace = gene_replacement_dictionary)
farashi["gene"] = updated_gene_names

In [5]:
# Replace fault-inducing postfixes for snp data
farashi["SNP ID"] = farashi["SNP ID"].str.strip()
# Assign the result back instead of calling replace(..., inplace = True) on the selected
# column: the in-place form is chained assignment and does not update the frame when
# pandas Copy-on-Write is active.
farashi["SNP ID"] = farashi["SNP ID"].replace("(_A)|(_C)", "", regex = True)

# Remove entries without valid rs identifiers (their id starts with "chr").
# A boolean mask is used so the result does not depend on the index labels being unique.
farashi = farashi[~farashi["SNP ID"].str.startswith("chr", na = False)].copy()

In [6]:
# Mapping of SNP identifiers as they occur in the table onto the identifier that is
# used instead (presumably retired/merged rs ids -> current rs ids; verify against dbSNP).
snp_replacement_dictionary = {
    "rs565245309": "rs10700825",
    "rs397764955": "rs11371876",
    "rs567544918": "rs143009074",
    "rs68007409": "rs58061354",
    "rs576874987": "rs2735090",
    "rs56969947": "rs5794883",
    "rs71390080": "rs35883900",
    "rs397885676": "rs35853071",
    "rs563936332": "rs11425106",
    "rs570238728": "rs2735091",
    "rs386572937": "rs2735095",
    "rs368454874": "rs5875242",
    "rs576956856": "rs557303655",
    "rs527768094": "rs3115587",
    "rs34421549": "rs11281315",
    "rs543833159": "rs9261476",
    "rs573341295": "rs3083610",
    "rs397841490": "rs3839562",
    "rs72562630": "rs10643878",
    "rs67276543": "rs34884832",
    "rs113645266": "rs6557271",
    "rs540840764": "rs9278594",
    "rs145076668": "rs34837204",
    "rs79588872": "rs35538902",
    "rs397847839": "rs35826034",
    "rs551993434": "rs11371951",
    "rs113130272": "rs11153141",
    "rs114376585": "rs3096702",
    "rs527588882": "rs9278592",
    "rs144721865": "rs9368661",
    "rs572291073": "rs2571388",
    "rs376201080": "rs142474496",
    "rs34948850": "rs10688614",
    "rs397887654": "rs36076724",
    "rs114473420": "rs3135340",
    "rs371043306": "rs145380596",
    "rs572943237": "rs11421756",
    "rs139078838": "rs9501073",
    "rs539183916": "rs2437062",
    "rs386410791": "rs141020575",
    "rs141507970": "rs9267919",
    "rs397823414": "rs35850123",
    "rs63475060": "rs5875246",
    "rs139104997": "rs9261481",
    "rs150282463": "rs13137700",
    "rs143466021": "rs9269108",
    "rs5875234": "rs3058350",
}

# rs identifiers that are expected to be missing from the dbSNP query result
not_found_dbsnp = {
    "rs77010356",
    "rs60284051",
    "rs563604877",
}

# Swap every SNP id listed in snp_replacement_dictionary for its replacement
farashi["SNP ID"] = farashi["SNP ID"].replace(to_replace = snp_replacement_dictionary)

In [7]:
# Clean the p-value column and reduce every cell to one float: its smallest p-value.
pvalue_column = "GWAS/eQTL p-value¥"

# Hyphen/minus look-alikes that must become an ASCII "-" before float() can parse them.
# The en dash is deliberately NOT part of this table: the two values that contain it are
# corrected explicitly below (one of them is a range, not an exponent sign).
_DASH_LOOKALIKES = str.maketrans({"\u2010": "-", "\u2011": "-", "\u2212": "-"})


def _normalise_pvalue_text(cell):
    """Lower-case the exponent marker and unify dash characters; non-strings pass through."""
    if not isinstance(cell, str):
        return cell
    return cell.replace("E", "e").translate(_DASH_LOOKALIKES)


def _smallest_pvalue(cell):
    """Return the numerically smallest p-value of a cell.

    Non-string cells are returned unchanged. String cells may hold several values
    separated by "," or ";". The values are converted to float *before* taking the
    minimum, because the minimum of the raw strings is lexicographic
    (min(["9e-10", "1e-5"]) would be "1e-5"). Empty fragments are ignored.

    Raises:
        ValueError: if a string cell contains no value, or a fragment is not a number.
    """
    if not isinstance(cell, str):
        return cell
    values = [float(token) for token in cell.replace(",", ";").split(";") if token.strip()]
    if not values:
        raise ValueError("No p-value found in {!r}".format(cell))
    return min(values)


# Placeholders ("_", "*") and empty cells carry no p-value; they are set to 0.99.
# .loc with a plain boolean array avoids chained-indexing assignment, which may write to a copy.
has_no_pvalue = farashi[pvalue_column].isin(["_", "*"]) | farashi[pvalue_column].isna()
farashi.loc[has_no_pvalue.to_numpy(), pvalue_column] = 0.99

farashi[pvalue_column] = farashi[pvalue_column].apply(_normalise_pvalue_text)

# Manual corrections for values that cannot be parsed as written
farashi[pvalue_column] = farashi[pvalue_column].replace({"6.11.e - 10 " : "6.11e-10",
                                                         "4.65.e - 10 " : "4.65e-10",
                                                         "2.42.e - 10 " : "2.42e-10",
                                                         "9.4027e–09" : '9.4027e-09',
                                                         "0.01–0.0009" : "0.0009"})

farashi[pvalue_column] = farashi[pvalue_column].apply(_smallest_pvalue).astype(float)

In [8]:
# Get the SNP properties from dbsnp
import myvariant
mv = myvariant.MyVariantInfo()

# Query every distinct rs id once; returnall = True also returns the ids that were not found
unique_rsids = list(set(farashi["SNP ID"]))
dbsnp = mv.querymany(unique_rsids, scopes='dbsnp.rsid', fields='dbsnp', returnall = True)

# Sanity check: the ids without a hit must be exactly the ones listed in not_found_dbsnp
if not_found_dbsnp == set(dbsnp["missing"]):
    print("PASSED: Only known SNPs missing")
else:
    print("ERROR: New SNPs missing")

querying 1-1000...done.
querying 1001-1139...done.
Finished.
482 input query terms found dup hits:
	[('rs77560449', 3), ('rs4682495', 3), ('rs9364535', 2), ('rs742136', 2), ('rs12198152', 2), ('rs6688
3 input query terms found no hit:
	['rs60284051', 'rs77010356', 'rs563604877']
PASSED: Only known SNPs missing


In [9]:
# Merge the dbSNP data with the reference set
dbsnp_tab = pd.DataFrame(dbsnp["out"])

# Keep only the rs ids that were found (rows flagged notfound are discarded)
dbsnp_tab = dbsnp_tab[dbsnp_tab["notfound"] != True].copy()


def _hg19_start(record):
    """Return the hg19 start position of a dbsnp record, or None when it has no hg19 entry."""
    return record["hg19"]["start"] if "hg19" in record else None


# Unpack the fields of interest from the nested dbsnp record.
# NOTE(review): positions are taken from the hg19 entry; confirm that the gene coordinates
# they are compared with later are on the same genome assembly.
dbsnp_tab["chromosome"] = dbsnp_tab["dbsnp"].apply(lambda record: record["chrom"])
dbsnp_tab["location"] = dbsnp_tab["dbsnp"].apply(_hg19_start)
dbsnp_tab["ref"] = dbsnp_tab["dbsnp"].apply(lambda record: record["ref"])
dbsnp_tab["alt"] = dbsnp_tab["dbsnp"].apply(lambda record: record["alt"])

# Keep only entries that have a chromosome location
dbsnp_tab = dbsnp_tab[dbsnp_tab["location"].notnull()].copy()
dbsnp_tab["location"] = dbsnp_tab["location"].astype(int)

# Inner join: reference entries without a located dbSNP record are left out
snp_properties = dbsnp_tab[["query", "chromosome", "location", "ref", "alt"]]
positives = farashi.merge(snp_properties, how = "inner", left_on = "SNP ID", right_on = "query")

In [10]:
# Remove entries that are based on a single eQTL study, or that are found in the exon/coding region
Dadaev = ["Dadaev T. et al. 2018", "Dadaev T. et al."]
coding_snps = ["Coding region", "exonic"]

# An entry is excluded when either condition holds
in_coding_region = positives["SNP's Genomic Location"].isin(coding_snps)
from_dadaev = positives["reference"].isin(Dadaev)
positives = positives[~(in_coding_region | from_dadaev)]

In [11]:
# Extract the gene expression data
cols = pd.read_csv("../Gene expression data/design_matrix_prostate_unpaired.txt", delimiter = "\t")
reads = pd.read_csv("../Gene expression data/expression_matrix_prostate_clean.txt", delimiter = "\t")
# Read in the differential expression calculations
dex = pd.read_csv("../Gene expression data/Galaxy37-[edgeR_DGE_on_2__design_matrix_prostate_unpaired.txt_-_differentially_expressed_genes].tabular.annotated.txt", 
                  delimiter = "\t", index_col = 0, names = ["ENSEMBL", "gene", "logFC", "logCPM", "LR", "pValue", "FDR"], header = 0)
# Remove the ".<digits>" postfix of the identifiers. The pattern is a raw string ("\." is an
# invalid escape in a normal literal) and the result is assigned back, because an in-place
# replace on a selected column does not update the frame under pandas Copy-on-Write.
dex["ENSEMBL"] = dex["ENSEMBL"].replace(r"(\.\d+)", "", regex = True)
# We know that the logFC is a 2 log. However, I prefer absolute values rather than logs
dex["FC"] = 2**dex["logFC"]

# Assign differential expression to the genes based on the previously set parameters
dex["Diff expression"] = "Unchanged"
is_significant = dex["FDR"] <= FDR_cutoff
dex.loc[is_significant & (dex["FC"] <= 1/FC_cutoff), "Diff expression"] = "Underexpressed"
dex.loc[is_significant & (dex["FC"] >= FC_cutoff), "Diff expression"] = "Overexpressed"

# Get the raw reads data
# total: sum over all numeric columns of a row; freq: number of samples with a non-zero count
reads["total"] = reads.sum(axis = 1, numeric_only = True)
reads["freq"] = ((reads[list(cols["samplename"])] > 0) * 1).sum(axis = 1)
reads["gene_ids"] = reads["gene_ids"].replace(r"(\.\d+)", "", regex = True)

# Outer join keeps genes that occur in only one of the two tables
reads = reads.merge(dex, left_on = "gene_ids", right_on = "ENSEMBL", how = "outer")

In [12]:
# Initiate the connection to the database (first time you may have to run ensembl.download() and ensembl.index())
# The Ensembl release is pinned so that it stays aligned with the reference set.
# NOTE(review): this release was described as "from 2019", but release 92 appears to date
# from 2018 - confirm that 92 is the intended version.
from pyensembl import EnsemblRelease
ENSEMBL_RELEASE = 92
ensembl = EnsemblRelease(ENSEMBL_RELEASE)

# Function to extract the data (and prevent superfluous queries)
def getEnsemblData(id):
    """Look up one Ensembl gene id and return its annotation as a pandas Series.

    The Series always has the entries "gene name", "chromosome", "gene start",
    "gene stop" and "protein_coding". When the lookup raises ValueError,
    every entry is None.
    """
    fields = ("gene name", "chromosome", "gene start", "gene stop", "protein_coding")
    try:
        gene = ensembl.gene_by_id(id)
        values = (gene.gene_name, gene.contig, gene.start, gene.end, gene.is_protein_coding)
    except ValueError:
        values = (None,) * len(fields)
    return pd.Series(dict(zip(fields, values)))

In [13]:
# Remove the postfixes of the gene identifiers.
# Raw-string pattern ("\." is an invalid escape in a normal literal) and assignment instead of
# an in-place replace on a selected column, which does not update the frame under Copy-on-Write.
reads["gene_ids"] = reads["gene_ids"].replace(r"\.\d+", "", regex = True)

# Look up the Ensembl annotation of every gene id; each result fills one row of these columns
annotation_columns = ["gene name", "chromosome", "gene start", "gene stop", "protein_coding"]
reads[annotation_columns] = reads["gene_ids"].apply(getEnsemblData)

# Replace some of the gene names with the updated ones
reads["gene name"] = reads["gene name"].replace(gene_replacement_dictionary)

In [14]:
# Check whether all genes in the reference set are in the expression data
known_missing_from_reads = {'LOC284581', 'MFSD4A-AS1'}

# Genes of the reference set that have no counterpart among the annotated reads
missing_from_reads = set(positives["gene"]) - set(reads["gene name"])
if missing_from_reads == known_missing_from_reads:
    print("PASSED: Only known missing genes absent in gene expression data")
else:
    print("Failed: New missing entries found")

PASSED: Only known missing genes absent in gene expression data


In [None]:
# PathwayStudio primarily contains protein coding gene entries. All non-protein coding entries are therefore removed.
# The mitochondrial genes and the entries that could not be found are dropped as well.
is_protein_coding = reads["protein_coding"] == True
is_excluded_chromosome = reads["chromosome"].isin(["MT", None])
reads = reads[is_protein_coding & ~is_excluded_chromosome].copy()

In [None]:
# Calculate the summed exon length per gene
exons = pd.DataFrame([exon.to_dict() for exon in ensembl.exons()])
exons["exon length"] = (exons["end"] - exons["start"]) + 1

# Aggregate only the length column. Summing the whole frame would also "sum" the
# non-numeric columns (string concatenation, or an error, depending on the pandas version).
# NOTE(review): every exon returned by ensembl.exons() is counted in full; if exons of
# alternative transcripts overlap, those bases are counted more than once - confirm intended.
exon_length = exons.groupby("gene_id")[["exon length"]].sum()

reads = reads.merge(exon_length["exon length"], how = "left", left_on = "gene_ids", right_index = True)

In [None]:
# Create a dataframe of all combinations of SNPs and genes that are on the same chromosome
positives["location"] = positives["location"].astype(int)

snp_columns = ["SNP ID", "chromosome", "location", "gene", "SNP's Genomic Location", "GWAS/eQTL p-value¥"]
gene_columns = ["gene_ids", "gene name", "chromosome", "gene start", "gene stop", "exon length", "FC", "FDR", "Diff expression"]

# Joining on the chromosome pairs every SNP with every gene on that chromosome.
# Afterwards only one row per (SNP, gene name) pair is kept: the last one.
# If gene start/stop would also be used to determine duplicates, at most 57 extra entries would be found.
# This number is considered to be insignificant.
snp_candidates = (
    positives[snp_columns]
    .merge(reads[gene_columns], on = "chromosome", how = "inner")
    .drop_duplicates(subset = ["SNP ID", "gene name"], keep = "last")
)

In [None]:
# Select the pairs where the SNP window and the gene windows overlap.
# Two intervals overlap when each one starts before the other one ends. This also flags a
# gene that spans the entire window, which a check of "gene start or gene stop lies inside
# the window" misses. Assumes gene start <= gene stop - TODO confirm.
window_start = snp_candidates["location"] - max_distance
window_end = snp_candidates["location"] + max_distance
snp_candidates["candidate"] = ((snp_candidates["gene start"] <= window_end) &
                               (snp_candidates["gene stop"] >= window_start))

# Without any distance filter, there are 244 unique SNP-gene pairs
if distance_filter:
    snp_candidates = snp_candidates[snp_candidates["candidate"]].copy()

In [None]:
# Mark the entries in the SNP candidates as positives. Also remove all duplicate SNP-gene entries.
positives = positives.drop_duplicates(["SNP ID", "gene"], keep = "last").assign(Class = 1)

# Assign reference to set: candidates that match a positive SNP-gene pair get Class 1
f = (snp_candidates
     .drop(columns = ["gene", "candidate"])
     .merge(positives[["Class", "SNP ID", "gene"]], how = "left",
            left_on = ["SNP ID", "gene name"], right_on = ["SNP ID", "gene"]))

# Pairs without a match are negatives. The result is assigned back: fillna(inplace = True) on
# a selected column does not update the frame under Copy-on-Write, and astype(int) would then
# fail on the remaining NaN values.
f["Class"] = f["Class"].fillna(0).astype(int)

# Also removed a column that was only used for merging.
f = f.drop(columns = "gene")

# Drop all entries that don't have at least one positive case
snps_with_positive = f.loc[f["Class"] == 1, "SNP ID"]
f = f[f["SNP ID"].isin(snps_with_positive)].copy()

In [None]:
# Calculate the distance between the gene and the SNP.
# Vectorised instead of two row-wise apply() passes; this also works on an empty frame.
f = f.astype({"gene start": int, "gene stop": int})

# Signed offsets from the SNP to both gene boundaries
offset_to_start = f["gene start"] - f["location"]
offset_to_stop = f["gene stop"] - f["location"]

# Take the offset with the smallest magnitude; on a tie the gene start wins.
# NOTE(review): for a SNP inside the gene body this is the distance to the nearest gene
# boundary, not zero - confirm this is intended.
start_is_nearest = offset_to_start.abs() <= offset_to_stop.abs()
nearest_offset = offset_to_start.where(start_is_nearest, offset_to_stop).astype(int)

f["bp distance absolute"] = nearest_offset.abs()
f["bp distance"] = nearest_offset

In [None]:
# Calculate the rank of the gene as compared to other genes:
# within each SNP the genes are numbered 1, 2, ... in order of increasing absolute distance
f = f.sort_values(by = ["SNP ID", "bp distance absolute"])
position_within_snp = f.groupby("SNP ID").cumcount()
f["Gene rank"] = position_within_snp + 1

In [None]:
import sklearn.metrics

# ROC over all SNP-gene pairs. The rank is negated so that the nearest gene (rank 1)
# receives the highest score.
fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true = f["Class"], y_score = -f["Gene rank"], pos_label = 1)
overall_auc = sklearn.metrics.auc(fpr, tpr)
# Printed as a percentage
print(overall_auc * 100)

In [None]:
roc_auc = sklearn.metrics.auc(fpr, tpr)

# Draw the ROC curve on an explicit Axes object
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.set_title('Receiver Operating Characteristic')
ax.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
ax.legend(loc = 'lower right')
# Dashed diagonal from (0, 0) to (1, 1) as reference line
ax.plot([0, 1], [0, 1],'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
# Mean of the per-SNP AUCs.
# NOTE(review): this cell used to read `f_out` and `outcomes`, which are not defined anywhere
# in this notebook (NameError on a fresh kernel). They are mapped onto `f` here
# (reference -> "Class", rank for SNP -> "Gene rank", snp -> "SNP ID") - confirm the mapping.
SNPS2 = list(f["SNP ID"].unique())
aucs = []
for snp in SNPS2:
    snp_rows = f[f["SNP ID"] == snp]
    # A ROC curve needs both classes; a SNP whose genes all share one class has no defined
    # AUC and would turn the mean into NaN, so it is skipped.
    if snp_rows["Class"].nunique() < 2:
        continue
    # Separate names, so the overall fpr/tpr computed earlier are not overwritten
    snp_fpr, snp_tpr, _ = sklearn.metrics.roc_curve(snp_rows["Class"], -snp_rows["Gene rank"], pos_label = 1)
    aucs.append(sklearn.metrics.auc(snp_fpr, snp_tpr))
print(sum(aucs)/len(aucs) if aucs else float("nan"))

In [None]:
# The gene boundaries are not part of the exported tables
f = f.drop(columns = ["gene start", "gene stop"])
# Keep an independent copy of the full table, so a later filter can start from it again
f_backup = f.copy()

In [None]:
# Export the complete table.
# NOTE(review): the file name mentions a 2000000 bp distance, but rows are only restricted to
# that window when distance_filter is True - confirm the name matches the content.
f.to_csv(path_or_buf = "Farashi full 2000000 bp distance no pvalue filtering.csv", index = False)

In [None]:
# Keep the pairs at most 1000000 bp apart
f = f.loc[f["bp distance absolute"].le(1000000)]
# Number of positive pairs that remain per SNP
snpsums = f.groupby("SNP ID")["Class"].sum()
# Keep only the SNPs that still have at least one positive pair
snps_with_positive = list(snpsums.index[snpsums > 0])
f = f[f["SNP ID"].isin(snps_with_positive)]

In [None]:
# Export the table restricted to 1000000 bp
f.to_csv(path_or_buf = "Farashi full 1000000 bp distance no pvalue filtering.csv", index = False)

In [None]:
# Keep the pairs at most 100000 bp apart (applied to the table as it is at this point)
f = f.loc[f["bp distance absolute"].le(100000)]
# Number of positive pairs that remain per SNP
snpsums = f.groupby("SNP ID")["Class"].sum()
# Keep only the SNPs that still have at least one positive pair
snps_with_positive = list(snpsums.index[snpsums > 0])
f = f[f["SNP ID"].isin(snps_with_positive)]

In [None]:
# Export the table restricted to 100000 bp
f.to_csv(path_or_buf = "Farashi full 100000 bp distance no pvalue filtering.csv", index = False)

In [None]:
# Start again from the full table and keep the pairs with a p-value of at most 5e-8
f = f_backup
f = f.loc[f["GWAS/eQTL p-value¥"].le(5e-8)]
# Number of positive pairs that remain per SNP
snpsums = f.groupby("SNP ID")["Class"].sum()
# Keep only the SNPs that still have at least one positive pair
snps_with_positive = list(snpsums.index[snpsums > 0])
f = f[f["SNP ID"].isin(snps_with_positive)]

In [None]:
# Export the p-value filtered table
f.to_csv(path_or_buf = "Farashi full 2000000 bp distance pvalue filtering.csv", index = False)