# RDF2vec classification

In [1]:
# Load the required packages
import pandas as pd
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
import json
import numpy as np

In [2]:
class_type = "normal"
classifier = "DT"

In [3]:
# Load the autoencoded embeddings
if class_type == "autoencode":
    f = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/RDF2vec/autorcode_emb.txt", sep = "\t", header = None)
    f.drop(columns = [350], inplace = True)
    f2 = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/RDF2vec/Farashi complete protein-protein embeddings.csv", index_col = 0)
    f.index = f2.index

In [4]:
if class_type == "combi":
    f = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/RDF2vec/autorcode_emb.txt", sep = "\t", header = None)
    f.drop(columns = [350], inplace = True)
    f2 = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/RDF2vec/Farashi complete protein-protein embeddings.csv", index_col = 0)
    f.index = f2.index
    
    # Load the mapping file
    with open("/Users/vlietstraw/git/Post-GWAS/ENSEMBL_mappings.json", "r") as fp:
        ensembl_dict = json.load(fp)
    inv_map = {v: k for k, v in ensembl_dict.items()}
    
    graphlets = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/EVOKE/unfiltered.txt", header = None, sep = " ", skiprows = 1)
    graphlets.drop(columns = [73], inplace = True)
    graphlets = graphlets.apply(lambda x: np.log10(x, where = x > 0))
    graphlets.columns = ["Graphlet " + str(x) for x in range(len(list(graphlets)))]
    graphlets.index = [x + 1 for x in list(graphlets.index)]
    graphlets.index = [inv_map[x] for x in graphlets.index]
    f = f.merge(graphlets, right_index = True, left_index = True)

In [5]:
# Extend embeddings with graphlets
if class_type == "graphlet":
    f = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/RDF2vec/Farashi complete protein-protein embeddings.csv", index_col = 0)
    graphlets = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/EVOKE/unfiltered.txt", header = None, sep = " ", skiprows = 1)
    
    # Load the mapping file
    with open("/Users/vlietstraw/git/Post-GWAS/ENSEMBL_mappings.json", "r") as fp:
        ensembl_dict = json.load(fp)
    inv_map = {v: k for k, v in ensembl_dict.items()}
    
    graphlets.drop(columns = [73], inplace = True)
    graphlets = graphlets.apply(lambda x: np.log10(x, where = x > 0))
    graphlets.columns = ["Graphlet " + str(x) for x in range(len(list(graphlets)))]
    graphlets.index = [x + 1 for x in list(graphlets.index)]
    graphlets.index = [inv_map[x] for x in graphlets.index]
    f = f.merge(graphlets, right_index = True, left_index = True)

In [6]:
# Load the data
if class_type == "normal":
    f = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/RDF2vec/Farashi complete protein-protein embeddings.csv", index_col = 0)

In [7]:
# Load the reference set
ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Farashi/Farashi full 2000000 bp distance no pvalue filtering.csv")

In [8]:
# Set bp distance cutoff
max_bp_distance = 2000
max_bp_distance = max_bp_distance * 1000
ref = ref[ref["bp distance absolute"] <= max_bp_distance]

In [9]:
# Drop all SNPs which no longer have a positive case
ref = ref[ref["gene_ids"].isin(f.index)]
pos_counts = ref.groupby("SNP ID")["Class"].sum()
ref = ref[~ref["SNP ID"].isin(pos_counts[pos_counts == 0].index)]

In [10]:
# Identify all genes which are at least once positive
positives = ref.groupby("gene_ids")["Class"].sum()
positives[positives > 1] = 1

f = f.merge(positives, left_index = True, right_index = True)

## Leave SNP out classification

In [11]:
outcomes = pd.DataFrame()
train_auc_score = []
train_auc_rank = []

In [12]:
# Perform leave-SNP-out cross validation
SNPs = list(set(ref["SNP ID"]))
for snp in SNPs:
    print("Predicting candidates for " + snp + ", number " + str(SNPs.index(snp) + 1) + " out of " + str(len(SNPs)))

    f_test = f[f.index.isin(ref[ref["SNP ID"] == snp]["gene_ids"])].copy()
    f_train = f[f.index.isin(ref[ref["SNP ID"] != snp]["gene_ids"])].copy()
    
    train_class = f["Class"][f.index.isin(f_train.index)]
    test_class = f["Class"][f.index.isin(f_test.index)]
    
    f_test.drop(columns = ["Class"], inplace = True)
    f_train.drop(columns = ["Class"], inplace = True)

    if classifier == "SVM":
        clf = SVR(gamma="auto")
    if classifier == "DT":
        clf = DecisionTreeRegressor()
    if classifier == "KNN":
        clf = KNeighborsClassifier(n_neighbors = 3)
    if classifier == "LR":
        from warnings import filterwarnings
        filterwarnings('ignore')
        clf = LogisticRegression()
    if classifier == "RF":
        clf = RandomForestRegressor(n_estimators = 1000, n_jobs = -1, max_features = "sqrt", max_depth = 5)
    
    clf.fit(f_train, train_class)

    outcomes = pd.concat([outcomes, pd.DataFrame({"predicted" : clf.predict(f_test), 
                                                    "SNP ID" : snp,
                                                    "gene_ids" : f_test.index})])
    
outcomes = outcomes.merge(ref[["SNP ID", "gene_ids", "Class"]], on = ["SNP ID", "gene_ids"], how = "left")

Predicting candidates for rs62003544, number 1 out of 194
Predicting candidates for rs1188529, number 2 out of 194
Predicting candidates for rs11891426, number 3 out of 194
Predicting candidates for rs11568311, number 4 out of 194
Predicting candidates for rs12453443, number 5 out of 194
Predicting candidates for rs6702939, number 6 out of 194
Predicting candidates for rs2289081, number 7 out of 194
Predicting candidates for rs12137162, number 8 out of 194
Predicting candidates for rs9469899, number 9 out of 194
Predicting candidates for rs3798439, number 10 out of 194
Predicting candidates for rs651164, number 11 out of 194
Predicting candidates for rs6983267, number 12 out of 194
Predicting candidates for rs112871784, number 13 out of 194
Predicting candidates for rs3746337, number 14 out of 194
Predicting candidates for rs135009, number 15 out of 194
Predicting candidates for rs6545977, number 16 out of 194
Predicting candidates for rs461251, number 17 out of 194
Predicting candidat

## Leave chromosome out validation

In [13]:
outcomes2 = pd.DataFrame()
train_auc_score2 = []
train_auc_rank2 = []

In [14]:
# Perform leave-SNP-out cross validation
chromosomes = list(set(ref["chromosome"]))

for chrom in chromosomes:
    print("Predicting candidates for chromosome " + chrom)

    f_test = f[f.index.isin(ref["gene_ids"][ref["chromosome"] == chrom])].copy()
    f_train = f[f.index.isin(ref["gene_ids"][ref["chromosome"] != chrom])].copy()
    
    train_class = f["Class"][f.index.isin(f_train.index)]
    test_class = f["Class"][f.index.isin(f_test.index)]
    
    f_test.drop(columns = ["Class"], inplace = True)
    f_train.drop(columns = ["Class"], inplace = True)
    
    if classifier == "SVM":
        clf = SVR(gamma="auto")
    if classifier == "DT":
        clf = DecisionTreeRegressor()
    if classifier == "KNN":
        clf = KNeighborsClassifier(n_neighbors = 3)
    if classifier == "LR":
        from warnings import filterwarnings
        filterwarnings('ignore')
        clf = LogisticRegression()
    if classifier == "RF":
        clf = RandomForestRegressor(n_estimators = 1000, n_jobs = -1, max_features = "sqrt", max_depth = 5)
    
    clf.fit(np.array(f_train), np.array(train_class))

    outcomes2 = pd.concat([outcomes2, pd.DataFrame({"predicted" : clf.predict(f_test),
                                                    "Class" : test_class, 
                                                    "chromosome" : chrom,
                                                    "gene_ids" : f_test.index})])

Predicting candidates for chromosome 7
Predicting candidates for chromosome X
Predicting candidates for chromosome 3
Predicting candidates for chromosome 16
Predicting candidates for chromosome 17
Predicting candidates for chromosome 5
Predicting candidates for chromosome 2
Predicting candidates for chromosome 4
Predicting candidates for chromosome 6
Predicting candidates for chromosome 21
Predicting candidates for chromosome 1
Predicting candidates for chromosome 14
Predicting candidates for chromosome 22
Predicting candidates for chromosome 11
Predicting candidates for chromosome 13
Predicting candidates for chromosome 8
Predicting candidates for chromosome 20
Predicting candidates for chromosome 10
Predicting candidates for chromosome 9
Predicting candidates for chromosome 19
Predicting candidates for chromosome 12


In [15]:
outcomes.to_csv("/Users/vlietstraw/git/Post-GWAS/RDF2vec/Leave-SNP-Out combi graphlets validation " + class_type + " " + str(max_bp_distance) + " " + classifier + ".csv", index = False)
outcomes2.to_csv("/Users/vlietstraw/git/Post-GWAS/RDF2vec/Leave-chromosome-Out graphlets cross validation " + class_type + " " + str(max_bp_distance) + " " + classifier + ".csv" index = False)

SyntaxError: invalid syntax (<ipython-input-15-cac32b3f1bd5>, line 2)

## Evaluate leave-SNP-out

In [None]:
outcomes = outcomes.sort_values(["SNP ID", "predicted"], ascending = False)
outcomes["For-SNP rank"] = outcomes.groupby("SNP ID").cumcount() + 1

In [None]:
sum(outcomes["Class"])

In [None]:
len(set(outcomes["SNP ID"]))

In [None]:
import sklearn.metrics

fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"], -outcomes["For-SNP rank"], pos_label = 1)
print(sklearn.metrics.auc(fpr, tpr) * 100)

In [None]:
# Calculate the ROC-AUC for every SNP and average the result
SNPS2 = list(set(outcomes["SNP ID"]))
aucs = []
for snp in SNPS2:
  if len(set(outcomes["Class"][outcomes["SNP ID"] == snp])) == 1:
      aucs.append(list(set(outcomes["Class"][outcomes["SNP ID"] == snp]))[0])
  else:
      fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"][outcomes["SNP ID"] == snp], -outcomes["For-SNP rank"][outcomes["SNP ID"] == snp], pos_label = 1)
      aucs.append(sklearn.metrics.auc(fpr, tpr))
print(sum(aucs)/len(aucs))

In [None]:
# Calculate hits @1
sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] == 1)])

In [None]:
# Calculate hits @3
sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 3)])

In [None]:
# Calculate hits @5
sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 5)])

In [None]:
# Calculate hits @10
sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 10)])

In [None]:
outcomes["For-SNP rank"][(outcomes["Class"] == 1)].mean()

In [None]:
outcomes["For-SNP rank"][outcomes["Class"] == 1].quantile(q = [0,0.25,0.5,0.75,1])

## Evaluate leave-chromosome-out

In [None]:
outcomes2 = outcomes2.sort_values(["chromosome", "predicted"], ascending = False)
outcomes2["For-chromosome rank"] = outcomes2.groupby("chromosome").cumcount() + 1

In [None]:
chromosomes = list(set(outcomes2["chromosome"]))
aucs = []
for chrom in chromosomes:
  fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes2["Class"][outcomes2["chromosome"] == chrom], -outcomes2["For-chromosome rank"][outcomes2["chromosome"] == chrom], pos_label = 1)
  aucs.append(sklearn.metrics.auc(fpr, tpr))
print(sum(aucs)/len(aucs))

In [None]:
ref = ref.merge(outcomes2[["gene_ids", "predicted"]], on = "gene_ids", how = "left")

In [None]:
ref = ref.sort_values(["SNP ID", "predicted"], ascending = False)
ref["For-SNP rank"] = ref.groupby("SNP ID").cumcount() + 1

In [None]:
fpr, tpr, thresholds = sklearn.metrics.roc_curve(ref["Class"], -ref["For-SNP rank"], pos_label = 1)
print(sklearn.metrics.auc(fpr, tpr) * 100)

In [None]:
# Calculate the ROC-AUC for every SNP and average the result
SNPS2 = list(set(ref["SNP ID"]))
aucs = []
for snp in SNPS2:
  if len(set(ref["Class"][ref["SNP ID"] == snp])) == 1:
      aucs.append(list(set(ref["Class"][ref["SNP ID"] == snp]))[0])
  else:
      fpr, tpr, thresholds = sklearn.metrics.roc_curve(ref["Class"][ref["SNP ID"] == snp], -ref["For-SNP rank"][ref["SNP ID"] == snp], pos_label = 1)
      aucs.append(sklearn.metrics.auc(fpr, tpr))
print(sum(aucs)/len(aucs))

In [None]:
# Calculate hits @1
sum(ref["Class"][(ref["Class"] == 1) & (ref["For-SNP rank"] == 1)])

In [None]:
# Calculate hits @3
sum(ref["Class"][(ref["Class"] == 1) & (ref["For-SNP rank"] <= 3)])

In [None]:
# Calculate hits @5
sum(ref["Class"][(ref["Class"] == 1) & (ref["For-SNP rank"] <= 5)])

In [None]:
# Calculate hits @10
sum(ref["Class"][(ref["Class"] == 1) & (ref["For-SNP rank"] <= 10)])

In [None]:
ref["For-SNP rank"][(ref["Class"] == 1)].mean()

In [None]:
ref["For-SNP rank"][ref["Class"] == 1].quantile(q = [0,0.25,0.5,0.75,1])