-
Notifications
You must be signed in to change notification settings - Fork 1
/
extract_ITS_sequences.py
59 lines (51 loc) · 1.66 KB
/
extract_ITS_sequences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python3
'''
extract_ITS_sequences.py -- extract ITS sequences according to ITSx results
Date: 2021-06-21
Bugs: Any bugs should be reported to chenyanpeng1992@outlook.com
'''
import os
import sys
import argparse
# Command-line interface: two positional arguments, the multi-FASTA file and
# the ITSx positions file. The module docstring doubles as the --help text,
# so RawDescriptionHelpFormatter is used to keep its line breaks intact.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=__doc__,
)
parser.add_argument(
    'fasta',
    type=str,
    metavar='<ITS.fasta>',
    help='a multiple fasta file',
)
parser.add_argument(
    'itsxposition',
    type=str,
    metavar='<txt>',
    help='ITSx position result',
)
# Parsed at module level: the __main__ section reads `args` directly.
args = parser.parse_args()
def parse_fa2dict(fasta):
    """Read a multi-FASTA file into a dict of sequence ID -> sequence.

    The ID of a record is the first whitespace-delimited token of its header
    line with the leading '>' removed. All sequence lines of a record are
    concatenated into a single string.

    Args:
        fasta: Path to the FASTA file.

    Returns:
        A dict mapping each sequence ID to its full sequence string. If an ID
        occurs more than once, the last record replaces the earlier ones.

    Raises:
        ValueError: If non-blank sequence data appears before the first
            header line.
    """
    records = {}
    seqid = None
    with open(fasta) as fafh:
        for lineno, raw_line in enumerate(fafh, 1):
            # Strip the newline and any trailing whitespace so stray blanks
            # never end up inside a sequence and shift its coordinates.
            line = raw_line.rstrip()
            if not line:
                # Blank lines carry no data, wherever they occur.
                continue
            if line.startswith('>'):
                seqid = line.split()[0].lstrip('>')
                records[seqid] = []
            elif seqid is None:
                raise ValueError(
                    f'{fasta}: line {lineno}: sequence data found before '
                    f'the first ">" header line')
            else:
                records[seqid].append(line)
    return {key: ''.join(chunks) for key, chunks in records.items()}
def parse_itsx_positionfile(itsxposition):
    """Parse an ITSx positions file into {sequence ID: [ITS1 start, ITS2 end]}.

    Each line is split on tabs. Column 0 is taken as the sequence ID; the
    start coordinate is read from column 3 and the end coordinate from
    column 5, both expected in the form '<label>: <start>-<end>'.
    NOTE(review): columns 3 and 5 are presumed to be the ITS1 and ITS2
    fields of the ITSx output -- confirm against the ITSx version in use.

    Blank lines are ignored. Records whose coordinates cannot be parsed
    (too few columns, or a non-numeric value such as 'Not found') are
    skipped with a warning on stderr, because no region can be extracted
    for them.

    Args:
        itsxposition: Path to the ITSx positions file.

    Returns:
        A dict mapping each sequence ID to a two-element list
        [its1_start, its2_end] of ints.
    """
    positiondict = {}
    with open(itsxposition) as pfh:
        for lineno, line in enumerate(pfh, 1):
            if not line.strip():
                continue
            line_lst = line.rstrip('\n').split('\t')
            seqid = line_lst[0]
            try:
                its1_start = int(line_lst[3].split(': ')[1].split('-')[0])
                its2_end = int(line_lst[5].split(': ')[1].split('-')[1])
            except (IndexError, ValueError):
                # IndexError: missing column or no '<start>-<end>' range;
                # ValueError: the coordinate is not a number.
                print(
                    f'Warning: {itsxposition}: line {lineno}: cannot '
                    f'determine ITS1 start / ITS2 end for "{seqid}", '
                    f'skipped',
                    file=sys.stderr)
                continue
            positiondict[seqid] = [its1_start, its2_end]
    return positiondict
if __name__ == '__main__':
    # Load every input sequence and the start/end pair recorded for each ID
    # in the ITSx positions file.
    sequences = parse_fa2dict(args.fasta)
    its_ranges = parse_itsx_positionfile(args.itsxposition)

    # Emit one FASTA record per entry of the positions file on stdout.
    for seqid, (start, end) in its_ranges.items():
        print(f'>{seqid}')
        # start/end are used as 1-based inclusive coordinates, hence the
        # shift of the slice start by one.
        its_region = sequences[seqid][start - 1:end]
        print(its_region)