In [21]:
import math

# Affinity classes for susceptibility. Each value doubles as an index into
# per-affinity lists and as the name of the matching ./data/<value> directory.
NEG = 0
POS = 1
TEST = 2

# Significance threshold: a mutation is kept when the sum of its weights
# reaches ALPHA.
ALPHA = 0.5

# weight of a single mutation, given the total number of mutations in its sequence
def weight(n):
  """Return 1 / (ln(n) + 2) for a sequence that carries n mutations.

  The more mutations a sequence has, the less each one of them counts:
  weight(1) is 0.5 and the value decreases as n grows.

  n: total number of mutations in the sequence; must be positive, because
     math.log raises ValueError for n <= 0.
  """
  denominator = math.log(n) + 2
  return 1 / denominator

# length of the human ACE2 sequence (number of residues); also the number of
# leading positions scanned when the consensus sequence is built
SEQ_LEN = 805

In [22]:
from os import listdir
from re import search

# complete list of data in format (organism, sequence) for each affinity,
# indexed by affinity: seqs[NEG] and seqs[POS]
seqs = [[], []]

for aff in [NEG, POS]:
  # listdir() returns entries in arbitrary order; sort them so the order of
  # seqs[aff] -- and every result or table derived from it -- is reproducible
  for filename in sorted(listdir('./data/%d' % aff)):
    path = "./data/%d/%s" % (aff, filename)
    with open(path) as f:
      lines = f.readlines()
    if not lines:
      raise ValueError("%s is empty: expected a header line followed by the sequence" % path)
    # the organism name is the first [...] group of the header line
    # (the header's first character is skipped)
    header = search(r'\[(.*?)\]', lines[0][1:])
    if header is None:
      raise ValueError("%s: no [organism] name found in header %r" % (path, lines[0]))
    name = header.group(1)
    # the remaining lines hold the sequence, wrapped over several lines
    sequence = ''.join(k.strip() for k in lines[1:])
    seqs[aff].append((name, sequence))

# seqs

In [23]:
# For each of the first SEQ_LEN positions, collect the residue every positive
# sequence has there and keep the most frequent one; on a tie, max() returns
# the residue it encounters first, i.e. the one from the earliest sequence.
columns = ([seq[i] for (_, seq) in seqs[POS]] for i in range(SEQ_LEN))
consensus_seq = "".join(max(col, key=col.count) for col in columns)

# consensus_seq

In [14]:

# accumulated weight of every mutation, keyed by "<residue><1-based position>"
mutts = {}

for (name, seq) in seqs[NEG]:
  # mutations of this sequence: non-gap residues that no positive sequence
  # carries at the same position
  ix_mutation = [
    "%c%d" % (ch_i, i + 1)
    for (i, ch_i) in enumerate(seq)
    if ch_i != "-" and not any(seq_pos[i] == ch_i for (_, seq_pos) in seqs[POS])
  ]
  # weight(0) is undefined (log of zero); a sequence without mutations
  # contributes nothing
  if not ix_mutation: continue
  # every mutation of one sequence gets the same weight 1 / (log(n) + 2),
  # so compute it once per sequence rather than once per mutation
  wv = weight(len(ix_mutation))
  for mut in ix_mutation:
    mutts[mut] = mutts.get(mut, 0) + wv

# keep the mutations whose summed weight reaches the significance threshold,
# sorted by their label
result = sorted((mut, f) for (mut, f) in mutts.items() if f >= ALPHA)

result

[('A41', 0.5),
 ('A66', 0.5385357335328163),
 ('D31', 0.6863135611816127),
 ('F83', 0.5226018801516173),
 ('H353', 0.8362883189700047),
 ('N113', 0.525115887738091),
 ('S426', 0.5226018801516173),
 ('V679', 0.525115887738091)]

In [15]:
# 0-based indices of the mutation positions reported above
# (31, 41, 66, 83, 113, 353, 426, 679), in ascending order
result = sorted([65, 352, 82, 112, 425, 678, 40, 30])

def print_influence_pts(seq, name, positions=None):
  """Print one table row: the residues of seq at the given positions, then name.

  seq: sequence to sample; it is indexed with every position, so it must be
       long enough for the largest one (IndexError otherwise)
  name: label printed at the end of the row
  positions: 0-based indices to show; when omitted, the notebook-level list
             `result` is used, as before this parameter existed

  The row starts with two spaces and every residue is followed by three, so
  each column is four characters wide and lines up with a header whose
  entries are formatted with "%4d".
  """
  if positions is None: positions = result
  print("  " + "".join(seq[i] + "   " for i in positions) + name)

# header row: the 1-based positions, each right-aligned in a 4-character column
print(''.join("%4d" % (ix + 1) for ix in result))
# positive sequences first, then the negative ones, each group followed by a blank line
for group in (seqs[POS], seqs[NEG]):
  for (name, seq) in group:
    print_influence_pts(seq, name)
  print()
print_influence_pts(consensus_seq, "Consensus sequence")

  31  41  66  83 113 353 426 679
  K   Y   G   Y   S   K   P   I   Felis catus
  K   Y   G   Y   S   K   P   I   Chlorocebus sabaeus
  K   Y   G   Y   S   K   P   I   Rhinolophus macrotis
  K   Y   E   Y   S   K   Y   I   Oryctolagus cuniculus
  K   H   G   Y   S   K   P   I   Callithrix jacchus
  K   Y   G   Y   S   K   P   I   Macaca fascicularis
  K   Y   R   Y   S   K   P   -   Odocoileus virginianus texanus
  K   Y   G   Y   S   K   P   I   Panthera tigris
  K   Y   G   Y   R   K   P   I   Mustela putorius furo
  K   Y   R   H   T   K   P   I   Cynopterus sphinx
  K   Y   G   Y   R   K   P   I   Mustela lutreola biedermanni
  K   Y   G   Y   S   K   P   I   Homo sapiens

  -   A   -   -   -   -   -   -   Mutation A41
  N   Y   A   Y   R   K   P   I   Procyon lotor
  D   Y   -   -   -   -   -   -   Mutation D31
  -   -   -   -   -   H   -   -   Mutation H353
  N   Y   A   F   N   H   S   V   Mus musculus
  K   Y   G   Y   N   K   P   V   Manis pentadactyla
  K   Y   A   F   N   H  