In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# converts only 1D or 2D array
def convert_to_percentage(arr):
  """Normalise each row so that its cells sum to 1.

  Despite the name, the values returned are fractions in [0, 1], not
  values scaled to 100.

  Args:
    arr: either a 2D array-like (an iterable of rows of numbers) or a
      flat 1D sequence of numbers, which is treated as a single row.

  Returns:
    A list of lists of fractions for 2D input, or a flat list of
    fractions for 1D input. A row whose cells sum to zero yields all
    zeros instead of dividing by zero.
  """
  def _normalise(row):
    # Materialise once so generators/iterators can be traversed twice.
    cells = list(row)
    total = sum(cells)
    if total == 0:
      # e.g. an all-zero tally row: there is no meaningful share to report.
      return [0.0 for _ in cells]
    return [cell/total for cell in cells]

  rows = list(arr)

  # 1D input: the elements are scalars rather than rows.
  if rows and not hasattr(rows[0], "__iter__"):
    return _normalise(rows)

  return [_normalise(row) for row in rows]

In [3]:
def fix(s):
  """Reduce an allele string to a four-character, two-field code.

  The (up to) two characters directly before the first ":" and the two
  characters directly after it are concatenated, and any "*" caught in
  the slice is replaced by "0". For example "A*02:01" -> "0201" and
  "A*2:01" -> "0201".
  NOTE(review): inputs are presumably HLA allele names - confirm against
  the CSV contents.

  Args:
    s: allele string containing at least one ":" that is preceded by at
      least one character.

  Returns:
    The concatenated first and second field as a string.

  Raises:
    ValueError: if `s` has no ":" or nothing in front of it. Without this
      check str.find() returns -1 and the slices wrap around, so e.g.
      "nan" became "nana" and two missing values compared as identical.
  """
  firstcolon = s.find(":")
  if firstcolon < 1:
    raise ValueError("no usable ':' field separator in allele string: " + repr(s))

  # Clamp the start at 0 so a one-character first field ("2:01") does not
  # turn into a negative index that wraps to the end of the string, then
  # left-pad it the same way the "*" replacement pads "A*2:01".
  first_field = s[max(firstcolon-2, 0):firstcolon].rjust(2, "0")
  second_field = s[firstcolon+1:firstcolon+3]

  s_new = first_field + second_field
  s_new = s_new.replace("*", "0")
  return s_new


In [4]:
def compute_resolution(gs_val,pre_val):
    """Score how closely a predicted allele agrees with the gold standard.

    Both strings are first normalised with fix(); the first two characters
    of the result are treated as the first field and the next two as the
    second field.

    Returns:
      4 if both fields agree, 2 if only the first field agrees, 0 otherwise.
    """
    truth = fix(gs_val)
    call = fix(pre_val)

    # A differing first field rules out any level of agreement.
    if truth[:2] != call[:2]:
        return 0

    return 4 if truth[2:4] == call[2:4] else 2

In [5]:
# Requirements:
#   - gold-standard (gs) accession numbers are in a column labeled "Run"
#   - predicted (pre) accession numbers are in a column labeled "ERR"
#   - accession numbers and column titles are identical between the gold-standard and results CSVs
# Accuracy is only computed for samples present in both GS and PRE. Samples in PRE but not in GS are ignored. Samples in GS but not in PRE are counted in the `fail` variable.
def compute_matches(pre,gs,return_fail=False):
  """Tally per-allele agreement between predictions and a gold standard.

  For every accession number in gs["Run"], the matching row of `pre`
  (looked up via pre["ERR"]) is compared column pair by column pair. The
  first column of `gs` is skipped and the remaining columns are read as
  consecutive pairs; each pair is scored with compute_resolution() both in
  the given order and with the two predicted values swapped, and the
  pairing with the strictly higher total is tallied (ties use the swapped
  pairing).
  NOTE(review): this assumes the first column of `gs` is the accession
  column and that each following pair holds the two alleles of one gene -
  confirm against the CSV layout.

  Args:
    pre: DataFrame of predictions with an "ERR" accession column and the
      same allele column labels as `gs`.
    gs: gold-standard DataFrame with a "Run" accession column.
    return_fail: when True, the number of column pairs that could not be
      scored is appended to the result.

  Returns:
    (zerodig, twodig, fourdig), or (zerodig, twodig, fourdig, fail) when
    `return_fail` is True. Rows of `pre` whose accession is not in `gs`
    are never visited.

  A column pair counts as a failure (and is not scored) when the sample
  is absent from `pre`, a column is missing, any of the four cells is
  missing (NaN/None), or scoring raises. Missing cells used to slip past
  the `== None` test because the values had already been cast to str.
  """
  tally = {0: 0, 2: 0, 4: 0}
  fail = 0

  genes = gs.columns.values.tolist()
  pair_starts = range(1,len(genes),2)

  for number in gs["Run"].values.tolist():
    pre_row = pre.loc[pre['ERR'] == number]
    gs_row = gs.loc[gs['Run'] == number]

    # Sample not found: every column pair of this sample fails.
    if pre_row.empty or gs_row.empty:
      fail = fail + len(pair_starts)
      continue

    for i in pair_starts:
      try:
        raw = (gs_row[genes[i]].values[0],
               pre_row[genes[i]].values[0],
               gs_row[genes[i+1]].values[0],
               pre_row[genes[i+1]].values[0])

        # Test for missing data before the cast to str, which would turn
        # NaN into the ordinary string "nan".
        if any(pd.isna(v) for v in raw):
          fail = fail + 1
          continue

        gs_val1, pre_val1, gs_val2, pre_val2 = (str(v) for v in raw)

        # assuming no swapping
        ans1 = compute_resolution(gs_val1,pre_val1)
        ans2 = compute_resolution(gs_val2,pre_val2)

        # assuming swapping
        ans3 = compute_resolution(gs_val1,pre_val2)
        ans4 = compute_resolution(gs_val2,pre_val1)
      except Exception:
        # Best effort: a missing column, an unpaired trailing column or an
        # unparsable value fails this pair only. Unlike a bare `except:`,
        # KeyboardInterrupt/SystemExit still propagate.
        fail = fail + 1
        continue

      chosen = (ans1, ans2) if (ans1+ans2 > ans3+ans4) else (ans3, ans4)
      for score in chosen:
        tally[score] = tally.get(score, 0) + 1

  result = (tally[0], tally[2], tally[4])
  if return_fail:
    return result + (fail,)
  return result

In [12]:
# Scratch cell: a bare expression whose value is only echoed as cell output;
# nothing is assigned.
range(0,6)

range(0, 6)

In [24]:
# Score every tool against every dataset and print one tally per pair.
data = list()  # created empty; nothing in this cell appends to it
tools=["hlaforest","optitype","phlat","seq2hla","rna2hla","arcas","hlavbseq"]
dataset_ids = range(1,7)  # datasets 1..6

# Read each gold-standard table once up front rather than once per tool;
# compute_matches only reads from it, so the frames can be shared.
gold_standards = {d: pd.read_csv("../datasets/"+str(d)+"_gs.csv") for d in dataset_ids}

for t in tools:
    for d in dataset_ids:
        gs=gold_standards[d]
        pre=pd.read_csv("../results/standard/"+str(t)+"_d"+str(d)+".csv")
        ret=compute_matches(pre,gs)
        print(t,"d"+str(d),ret)


hlaforest d1 (22, 39, 239)
hlaforest d2 (133, 797, 3970)
hlaforest d3 (5, 26, 141)
hlaforest d4 (10, 4, 10)
hlaforest d5 (4, 2, 2)
hlaforest d6 (0, 0, 0)
optitype d1 (8, 10, 282)
optitype d2 (13, 30, 2897)
optitype d3 (0, 0, 0)
optitype d4 (0, 0, 0)
optitype d5 (4, 0, 4)
optitype d6 (0, 0, 0)
phlat d1 (13, 39, 248)
phlat d2 (495, 453, 3952)
phlat d3 (4, 12, 156)
phlat d4 (7, 2, 5)
phlat d5 (4, 0, 4)
phlat d6 (0, 0, 0)
seq2hla d1 (14, 25, 261)
seq2hla d2 (152, 325, 4423)
seq2hla d3 (55, 18, 95)
seq2hla d4 (21, 1, 6)
seq2hla d5 (4, 1, 3)
seq2hla d6 (0, 0, 0)
rna2hla d1 (14, 25, 261)
rna2hla d2 (83, 154, 3683)
rna2hla d3 (14, 27, 131)
rna2hla d4 (21, 1, 6)
rna2hla d5 (4, 1, 3)
rna2hla d6 (0, 0, 0)
arcas d1 (9, 88, 203)
arcas d2 (47, 353, 4500)
arcas d3 (4, 5, 163)
arcas d4 (24, 0, 4)
arcas d5 (3, 0, 5)
arcas d6 (0, 0, 0)
hlavbseq d1 (32, 27, 241)
hlavbseq d2 (2162, 258, 2480)
hlavbseq d3 (82, 4, 86)
hlavbseq d4 (22, 0, 2)
hlavbseq d5 (0, 0, 8)
hlavbseq d6 (0, 0, 0)
