# Deep learning test script

In [1]:
# Load the required packages
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping

# Load the required packages
import pandas as pd
from sklearn.svm import SVR
import json
import numpy as np

In [2]:
class_type = "autoencode"

In [3]:
# Load the data
if class_type == "normal":
    f = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Struc2vec/PathwayStudio_PPI_complete_directed_weighted.emb", sep = " ", skiprows = 1, header = None, index_col = 0)

In [4]:
# Load the autoencoded embeddings
if class_type == "autoencode":
    f = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Struc2vec/autorcode_weighted_emb.txt", sep = "\t", header = None)
    f.drop(columns = [350], inplace = True)
    f2 = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Struc2vec/PathwayStudio_PPI_complete_directed_weighted.emb", sep = " ", skiprows = 1, header = None, index_col = 0)
    f.index = f2.index

In [5]:
# Load the reference set
ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Farashi/Farashi full 2000000 bp distance no pvalue filtering.csv")

# Load the mapping file
with open("/Users/vlietstraw/git/Post-GWAS/ENSEMBL_mappings.json", "r") as fp:
    ensembl_dict = json.load(fp)
ref["nodeID"] = [ensembl_dict[x] if x in ensembl_dict.keys() else None for x in ref["gene_ids"]]

In [6]:
# Drop all unmappable candidates
ref.dropna(subset = ["nodeID"], inplace = True)
ref["nodeID"] = ref["nodeID"].astype(int)

In [7]:
# Set bp distance cutoff
max_bp_distance = 2000
max_bp_distance = max_bp_distance * 1000
ref = ref[ref["bp distance absolute"] <= max_bp_distance]

In [8]:
# Drop all SNPs which no longer have a positive case
pos_counts = ref.groupby("SNP ID")["Class"].sum()
ref = ref[~ref["SNP ID"].isin(pos_counts[pos_counts == 0].index)]

In [9]:
# Identify all genes which are at least once positive
positives = ref.groupby("nodeID")["Class"].sum()
positives[positives > 1] = 1

f = f.merge(positives, left_index = True, right_index = True)

In [10]:
def createKeras(n_features):
  model = Sequential()
  model.add(Dense(230, input_dim = n_features, activation = "relu"))
  model.add(Dense(230, activation = "relu"))
  model.add(Dense(230, activation = "relu"))
  #model.add(Dense(32, activation = "relu"))
  model.add(Dense(1, activation = "sigmoid"))

  model.compile(loss = "binary_crossentropy",
                optimizer = "Adam",
                metrics = ["AUC"])
  return model

ES = EarlyStopping(monitor = 'loss', mode = 'min', patience = 3, min_delta = 0.025)

In [11]:
outcomes = pd.DataFrame()
train_auc_score = []
train_auc_rank = []

In [12]:
class_weights = { 0 : 1., 1 : round(len(f) / sum(f["Class"]), 2)}
aa = []
bb = []

In [13]:
# Perform leave-SNP-out cross validation
SNPs = list(set(ref["SNP ID"]))
for snp in SNPs:
    print("Predicting candidates for " + snp + ", number " + str(SNPs.index(snp) + 1) + " out of " + str(len(SNPs)))

    f_test = f[f.index.isin(ref[ref["SNP ID"] == snp]["nodeID"])].copy()
    f_train = f[f.index.isin(ref[ref["SNP ID"] != snp]["nodeID"])].copy()
    
    train_class = f["Class"][f.index.isin(f_train.index)]
    test_class = f["Class"][f.index.isin(f_test.index)]
    
    f_test.drop(columns = ["Class"], inplace = True)
    f_train.drop(columns = ["Class"], inplace = True)

    clf = None
    clf = createKeras(350)
    
    clf.fit(np.array(f_train), np.array(train_class), epochs = 25, verbose = 0, class_weight = class_weights, callbacks = [ES])
    
    a, b = clf.evaluate(np.array(f_test), np.array(test_class))
    
    aa.append(a)
    bb.append(b)

    outcomes = pd.concat([outcomes, pd.DataFrame({  "predicted" : [x[0] for x in clf.predict(np.array(f_test))],
                                                    "SNP ID" : snp,
                                                    "nodeID" : f_test.index})])
    
outcomes = outcomes.merge(ref[["SNP ID", "nodeID", "Class"]], on = ["SNP ID", "nodeID"], how = "left")

Predicting candidates for rs12682374, number 1 out of 194
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Predicting candidates for rs10936845, number 2 out of 194
Predicting candidates for rs34837204, number 3 out of 194
Predicting candidates for rs4976790, number 4 out of 194
Predicting candidates for rs62003544, number 5 out of 194
Predicting candidates for rs878987, number 6 out of 194
Predicting candidates for rs13395911, number 7 out of 194
Predicting candidates for rs12639887, number 8 out of 194
Predicting candidates for rs3746337, number 9 out of 194
Predicting candidates for rs461251, number 10 out of 194
Predicting candidates for rs75513592, number 11 out of 194
Predicting candidates for rs79618460, number 12 out of 194
Predicting candidates for rs6557271, number 13 out of 194
Predicting candidates for rs2412106, number 14 out of 194
Predicting candi

In [14]:
outcomes = outcomes.sort_values(["SNP ID", "predicted"], ascending = False)
outcomes["For-SNP rank"] = outcomes.groupby("SNP ID").cumcount() + 1

In [15]:
import sklearn.metrics

fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"], -outcomes["For-SNP rank"], pos_label = 1)
print(sklearn.metrics.auc(fpr, tpr) * 100)

81.23034111812352


In [16]:
print(np.mean(bb))

0.7430851


In [17]:
# Calculate the ROC-AUC for every SNP and average the result
SNPS2 = list(set(outcomes["SNP ID"]))
aucs = []
for snp in SNPS2:
  if len(set(outcomes["Class"][outcomes["SNP ID"] == snp])) == 1:
      aucs.append(list(set(outcomes["Class"][outcomes["SNP ID"] == snp]))[0])
  else:
      fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"][outcomes["SNP ID"] == snp], -outcomes["For-SNP rank"][outcomes["SNP ID"] == snp], pos_label = 1)
      aucs.append(sklearn.metrics.auc(fpr, tpr))
print(sum(aucs)/len(aucs))

0.7280879634861137
