/
03-annotate-results.py
106 lines (90 loc) · 4.19 KB
/
03-annotate-results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python3
'''
Update:
This program expects there to be on plate_Sequences.tsv file.
This program expects there to be two csv files:
./02-Results/*_HeavyChain.tsv
./02-Results/*_LightChain.tsv
It will load each of these files, calculate the three
frames of translations, then push the sequences through
ANARCI to look for a predicted domain and numbering.
ANARCI output will then be parsed and included in a new
output file for the web page.
'''
from Bio.Seq import Seq
import csv
import subprocess
from glob import glob
import os
## Globals
anarciexe = 'ANARCI'
outpath = './03-AnnotatedResults/'
os.system('mkdir -p ' + outpath)
def process_file(chain):
"""Process ASV records through ANARCI software tool and write new table to outpath."""
tsvs = glob('./02-Results/*_' + chain + '.tsv')
assert (len(tsvs) > 0), "NO TSV file found, check that R scripts ran."
assert (len(tsvs) == 1), "More than one TSV file was found for " + './02-Results/*_' + chain + '.tsv'
inf = tsvs[0]
infDR = csv.DictReader(open(inf, 'r'), delimiter='\t')
outf = open(os.path.join(outpath,os.path.basename(inf)), 'w')
outfDW = csv.DictWriter(outf, delimiter='\t', restval='-', fieldnames = infDR.fieldnames +
['chain_type', 'e-value', 'score', 'seqstart_index', 'seqend_index', 'scheme', 'frame', 'AA', 'numbering', 'domain'])
outfDW.writeheader()
for record in infDR:
seq = Seq(record['ASV'].strip())
# Build translations of each frame until one has a predicted domain:
for i in range(3):
aa = (seq[i:] + ("N" * (3 - len(seq[i:]) %3))).translate()
anarci = run_anarci(aa, i)
if anarci is not None:
record.update(anarci)
break
outfDW.writerow(record)
outf.close()
def run_anarci(aa, frame):
"""Run ANARCI program and parse results into a dictionary. Return FIRST hit."""
result = None
# Parse the aa into each sub-piece that doesn't have a stop codon:
subAAs = str(aa).strip('X').split('*')
for i in range(len(subAAs)):
s = subAAs[i]
if(len(s) > 0):
cmd = anarciexe + " --scheme imgt -i " + s
test = subprocess.run(cmd.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
output = test.stdout.splitlines()
if len(output) > 2:
result = {}
mcols = output[4].split('|')
mdata = output[5].split('|')
result['chain_type'] = mdata[mcols.index('chain_type')]
result['e-value'] = mdata[mcols.index('e-value')]
result['score'] = mdata[mcols.index('score')]
result['seqstart_index'] = mdata[mcols.index('seqstart_index')]
result['seqend_index'] = mdata[mcols.index('seqend_index')]
result['scheme'] = output[6].split('=')[1].strip()
splitAA = s[0:int(result['seqstart_index'])]
splitAA += '`' + s[int(result['seqstart_index']):int(result['seqend_index'])] + '`'
splitAA += s[int(result['seqend_index']):]
subAAs[i] = splitAA
result['frame'] = frame
result['AA'] = '*'.join(subAAs)
result['numbering'] = []
result['domain'] = []
for l in output[7:]:
if l != '//':
result['numbering'] += [l.strip().split()[1]]
result['domain'] += [l.strip().split()[2]]
result['numbering'] = ','.join(result['numbering'])
result['domain'] = ','.join(result['domain'])
break
return(result)
#for chain in ['HeavyChain', 'LightChain']:
# process_file(chain)
for chain in ['Sequences']:
process_file(chain)
"""
Example call:
./anarci-1.3/bin/ANARCI --scheme imgt -i TDQSPQAVSSGCLLKMKLPVRLLVLMFWIPASSSDVVMTQTPLSLPVSLGDQASISCRSSQSLVHSNGNTYLHWYLQNPGQSPKLLIYKVSKRFSGVPDRFSGSGSGTDFTLKISRVEAEDLGVYFCSQSTHVPYTFGGGTKLEIKRA
NL*SVSSPQSLKTLTLTMEWSWVFLFLLSVTSGVHSQVQLQQSGAELVKPGASVKLSCKTSGYTFTSYWIQWVKRRPGQGLGWIGEIFPGTGTTYYNENFKGKATLTIDTSSSTAYMQLSSLTSEDSAVYFCARVDGSSYFDYWGQGTTLTVSS
"""