-
Notifications
You must be signed in to change notification settings - Fork 15
/
all2all_jobs.py
90 lines (75 loc) · 3.87 KB
/
all2all_jobs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import argparse
import gzip
import itertools
def parse_fasta_index(fasta_file):
fai_file = fasta_file + '.fai'
sequences = []
with open(fai_file, 'r') as file:
for line in file:
sequence_name = line.strip().split('\t')[0]
sequences.append(sequence_name)
return sequences
def group_sequences(sequences, grouping):
grouped_sequences = {}
for sequence in sequences:
if '#' in sequence:
fields = sequence.split('#')
if grouping in ['g', 'genome']:
group_key = fields[0]
elif grouping in ['h', 'haplotype']:
group_key = '#'.join(fields[:2])
elif grouping in ['c', 'contig']:
group_key = sequence
else:
raise ValueError(f"Invalid grouping: {grouping}")
else:
group_key = sequence
if group_key not in grouped_sequences:
grouped_sequences[group_key] = []
grouped_sequences[group_key].append(sequence)
return grouped_sequences
def generate_pairings(target_grouped_sequences, query_grouped_sequences, num_queries):
pairings = []
target_groups = list(target_grouped_sequences.keys())
query_groups = list(query_grouped_sequences.keys())
for target_group in target_groups:
query_pool = [group for group in query_groups if group != target_group]
for query_chunk in itertools.zip_longest(*[iter(query_pool)] * num_queries):
query_chunk = [q for q in query_chunk if q is not None]
pairings.append((target_group, query_chunk))
return pairings
def main():
parser = argparse.ArgumentParser(description='Generate pairings or wfmash command lines for all-to-all alignment using PanSN format.')
parser.add_argument('fasta_file', help='Path to the FASTA file (can be gzipped)')
parser.add_argument('-n', '--num-queries', type=int, default=4, help='Number of query groups per target group (default: 4)')
parser.add_argument('-t', '--target-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for targets: g/genome, h/haplotype, or c/contig (default: haplotype)')
parser.add_argument('-q', '--query-grouping', choices=['g', 'genome', 'h', 'haplotype', 'c', 'contig'], default='haplotype', help='Grouping level for queries: g/genome, h/haplotype, or c/contig (default: haplotype)')
parser.add_argument('-o', '--output', help='Output file to save the pairings or command lines')
args, wfmash_args = parser.parse_known_args()
# Parse the FASTA index file
sequences = parse_fasta_index(args.fasta_file)
# Group sequences based on the specified grouping levels
target_grouped_sequences = group_sequences(sequences, args.target_grouping)
query_grouped_sequences = group_sequences(sequences, args.query_grouping)
# Generate pairings
pairings = generate_pairings(target_grouped_sequences, query_grouped_sequences, args.num_queries)
# Save or print the pairings or command lines
if wfmash_args:
wfmash_options = ' '.join(wfmash_args)
if args.output:
with open(args.output, 'w') as file:
for target_group, query_groups in pairings:
file.write(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}\n")
else:
for target_group, query_groups in pairings:
print(f"wfmash {wfmash_options} -T {target_group} -Q {','.join(query_groups)}")
else:
if args.output:
with open(args.output, 'w') as file:
for target_group, query_groups in pairings:
file.write(f"{target_group}\t{','.join(query_groups)}\n")
else:
for target_group, query_groups in pairings:
print(f"{target_group}\t{','.join(query_groups)}")
if __name__ == '__main__':
main()