# EVOKE classification

In [28]:
# Load the required packages
import pandas as pd
from sklearn.svm import SVR
import json
import numpy as np

In [29]:
# Load the data
f = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/EVOKE/unfiltered.txt", sep = " ", header = None, skiprows = 1)
f.index = range(1, len(f) + 1)
f.drop(columns = [73], inplace = True)
f = f.astype(int)

# Log transform the data
#f = np.log10(f, where = f > 0)

In [30]:
# Load the reference set
ref = pd.read_csv("/Users/vlietstraw/git/Post-GWAS/Input sets/Farashi/Farashi full 2000000 bp distance no pvalue filtering.csv")

# Load the mapping file
with open("/Users/vlietstraw/git/Post-GWAS/ENSEMBL_mappings.json", "r") as fp:
    ensembl_dict = json.load(fp)
ref["nodeID"] = [ensembl_dict[x] if x in ensembl_dict.keys() else None for x in ref["gene_ids"]]

# Drop all unmappable candidates
ref.dropna(subset = ["nodeID"], inplace = True)
ref["nodeID"] = ref["nodeID"].astype(int)

In [31]:
# Set bp distance cutoff
max_bp_distance = 2000
max_bp_distance = max_bp_distance * 1000
ref = ref[ref["bp distance absolute"] <= max_bp_distance]

In [32]:
# Drop all SNPs which no longer have a positive case
pos_counts = ref.groupby("SNP ID")["Class"].sum()
ref = ref[~ref["SNP ID"].isin(pos_counts[pos_counts == 0].index)]

In [33]:
# Identify all genes which are at least once positive
positives = ref.groupby("nodeID")["Class"].sum()
positives[positives > 1] = 1

f = f.merge(positives, left_index = True, right_index = True)

## Leave SNP out classification

In [None]:
outcomes = pd.DataFrame()
train_auc_score = []
train_auc_rank = []

In [None]:
# Perform leave-SNP-out cross validation
SNPs = list(set(ref["SNP ID"]))
for snp in SNPs:
    print("Predicting candidates for " + snp + ", number " + str(SNPs.index(snp) + 1) + " out of " + str(len(SNPs)))

    f_test = f[f.index.isin(ref[ref["SNP ID"] == snp]["nodeID"])].copy()
    f_train = f[f.index.isin(ref[ref["SNP ID"] != snp]["nodeID"])].copy()
    
    train_class = f["Class"][f.index.isin(f_train.index)]
    test_class = f["Class"][f.index.isin(f_test.index)]
    
    f_test.drop(columns = ["Class"], inplace = True)
    f_train.drop(columns = ["Class"], inplace = True)

    clf = SVR(gamma="auto")
    
    clf.fit(f_train, train_class)

    outcomes = pd.concat([outcomes, pd.DataFrame({"predicted" : clf.predict(f_test), 
                                                    "SNP ID" : snp,
                                                    "nodeID" : f_test.index})])
    
outcomes = outcomes.merge(ref[["SNP ID", "nodeID", "Class"]], on = ["SNP ID", "nodeID"], how = "left")

## Leave chromosome out validation

In [34]:
outcomes2 = pd.DataFrame()
train_auc_score2 = []
train_auc_rank2 = []

In [35]:
# Perform leave-SNP-out cross validation
chromosomes = list(set(ref["chromosome"]))
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

for chrom in chromosomes:
    print("Predicting candidates for chromosome " + chrom)

    f_test = f[f.index.isin(ref["nodeID"][ref["chromosome"] == chrom])].copy()
    f_train = f[f.index.isin(ref["nodeID"][ref["chromosome"] != chrom])].copy()
    
    train_class = f["Class"][f.index.isin(f_train.index)]
    test_class = f["Class"][f.index.isin(f_test.index)]
    
    f_test.drop(columns = ["Class"], inplace = True)
    f_train.drop(columns = ["Class"], inplace = True)
    
    #clf = SVR(gamma="auto")
    clf = DecisionTreeClassifier()
    
    clf.fit(np.array(f_train), np.array(train_class))

    outcomes2 = pd.concat([outcomes2, pd.DataFrame({"predicted" : clf.predict_proba(f_test)[:,1],
                                                    "Class" : test_class, 
                                                    "chromosome" : chrom,
                                                    "nodeID" : f_test.index})])

Predicting candidates for chromosome 4
Predicting candidates for chromosome 19
Predicting candidates for chromosome 2
Predicting candidates for chromosome 11
Predicting candidates for chromosome X
Predicting candidates for chromosome 5
Predicting candidates for chromosome 20
Predicting candidates for chromosome 1
Predicting candidates for chromosome 9
Predicting candidates for chromosome 8
Predicting candidates for chromosome 6
Predicting candidates for chromosome 16
Predicting candidates for chromosome 7
Predicting candidates for chromosome 22
Predicting candidates for chromosome 14
Predicting candidates for chromosome 21
Predicting candidates for chromosome 12
Predicting candidates for chromosome 17
Predicting candidates for chromosome 3
Predicting candidates for chromosome 10
Predicting candidates for chromosome 13


In [None]:
outcomes.to_csv("/Users/vlietstraw/git/Post-GWAS/EVOKE/Leave-SNP-Out cross validation " + str(max_bp_distance) + ".csv", index = False)
outcomes2.to_csv("/Users/vlietstraw/git/Post-GWAS/EVOKE/Leave-chromosome-Out cross validation " + str(max_bp_distance) + ".csv", index = False)

## Evaluate leave-SNP-out

In [7]:
outcomes = outcomes.sort_values(["SNP ID", "predicted"], ascending = False)
outcomes["For-SNP rank"] = outcomes.groupby("SNP ID").cumcount() + 1

NameError: name 'outcomes' is not defined

In [None]:
len(set(outcomes["SNP ID"]))

In [None]:
sum(outcomes["Class"])

In [None]:
import sklearn.metrics

fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"], -outcomes["For-SNP rank"], pos_label = 1)
print(sklearn.metrics.auc(fpr, tpr) * 100)

In [None]:
# Calculate the ROC-AUC for every SNP and average the result
SNPS2 = list(set(outcomes["SNP ID"]))
aucs = []
for snp in SNPS2:
  if len(set(outcomes["Class"][outcomes["SNP ID"] == snp])) == 1:
      aucs.append(list(set(outcomes["Class"][outcomes["SNP ID"] == snp]))[0])
  else:
      fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes["Class"][outcomes["SNP ID"] == snp], -outcomes["For-SNP rank"][outcomes["SNP ID"] == snp], pos_label = 1)
      aucs.append(sklearn.metrics.auc(fpr, tpr))
print(sum(aucs)/len(aucs))

In [None]:
# Calculate hits @1
sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] == 1)])

In [None]:
# Calculate hits @3
sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 3)])

In [None]:
# Calculate hits @5
sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 5)])

In [None]:
# Calculate hits @10
sum(outcomes["Class"][(outcomes["Class"] == 1) & (outcomes["For-SNP rank"] <= 10)])

In [None]:
outcomes["For-SNP rank"][(outcomes["Class"] == 1)].mean()

In [None]:
outcomes["For-SNP rank"][outcomes["Class"] == 1].quantile(q = [0,0.25,0.5,0.75,1])

## Evaluate leave-chromosome-out

In [36]:
outcomes2 = outcomes2.sort_values(["chromosome", "predicted"], ascending = False)
outcomes2["For-chromosome rank"] = outcomes2.groupby("chromosome").cumcount() + 1

In [48]:
chromosomes = list(set(outcomes2["chromosome"]))
aucs = []
for chrom in chromosomes:
  fpr, tpr, thresholds = sklearn.metrics.roc_curve(outcomes2["Class"][outcomes2["chromosome"] == chrom], -outcomes2["For-chromosome rank"][outcomes2["chromosome"] == chrom], pos_label = 1)
  aucs.append(sklearn.metrics.auc(fpr, tpr))
print(sum(aucs)/len(aucs))

0.4480401341766646


In [38]:
ref = ref.merge(outcomes2[["nodeID", "predicted"]], on = "nodeID", how = "left")

In [44]:
ref = ref.sort_values(["SNP ID", "predicted"], ascending = False)

SNP_temp = 0
counter = 0
prediction_temp = 9999
for indx, row in ref.iterrows():
    if SNP_temp != row["SNP ID"]:
        SNP_temp = row["SNP ID"]
        counter = 1
    elif SNP_temp == row["SNP ID"] and prediction_temp != row["predicted"]:
        counter += 1
        prediction_temp = row["predicted"]
    ref.at[indx, "For-SNP rank"] = counter

In [47]:
fpr, tpr, thresholds = sklearn.metrics.roc_curve(ref["Class"], -ref["For-SNP rank"], pos_label = 1)
print(sklearn.metrics.auc(fpr, tpr) * 100)

62.41990355397014


In [49]:
# Calculate the ROC-AUC for every SNP and average the result
SNPS2 = list(set(ref["SNP ID"]))
aucs = []
for snp in SNPS2:
  if len(set(ref["Class"][ref["SNP ID"] == snp])) == 1:
      aucs.append(list(set(ref["Class"][ref["SNP ID"] == snp]))[0])
  else:
      fpr, tpr, thresholds = sklearn.metrics.roc_curve(ref["Class"][ref["SNP ID"] == snp], -ref["For-SNP rank"][ref["SNP ID"] == snp], pos_label = 1)
      aucs.append(sklearn.metrics.auc(fpr, tpr))
print(sum(aucs)/len(aucs))

0.5140279045191526


In [None]:
# Calculate hits @1
sum(ref["Class"][(ref["Class"] == 1) & (ref["For-SNP rank"] == 1)])

In [None]:
# Calculate hits @3
sum(ref["Class"][(ref["Class"] == 1) & (ref["For-SNP rank"] <= 3)])

In [None]:
# Calculate hits @5
sum(ref["Class"][(ref["Class"] == 1) & (ref["For-SNP rank"] <= 5)])

In [None]:
# Calculate hits @10
sum(ref["Class"][(ref["Class"] == 1) & (ref["For-SNP rank"] <= 10)])

In [None]:
ref["For-SNP rank"][(ref["Class"] == 1)].mean()

In [None]:
ref["For-SNP rank"][ref["Class"] == 1].quantile(q = [0,0.25,0.5,0.75,1])