-
Notifications
You must be signed in to change notification settings - Fork 0
/
blat_parse.py
executable file
·696 lines (544 loc) · 23.2 KB
/
blat_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
#!/usr/bin/env python
"""
Python 2.7+ is needed to run this.
parses blat .psl files and outputs some statistics
Example of input linkage map file:
Name Comment_FINAL LG Order Female Male Average Meioses
Gdist:574061_1677 SNP-0 gm-01 0 0 0 0 956
Gdist:483922_6360 SNP-0 gm-01 1 0 0.2 0.5 464
Gdist:483921_286 SNP-0 gm-01 2 0 0.2 0.5 1067
"""
import numpy as np
import sys, argparse
import math
from operator import itemgetter, attrgetter
def read_lg(lg_file, exclude_snps):
"""
Parses the linkage map. Originally called CodMap_InfoFeb2014_til_Lex.txt with this
format:
Name Comment_FINAL LG Order Female Male Average Meioses
In the first case, I'm only interested in the columns Name, LG and Order.
"""
lg_snps = {}
#Throw away the header
lg_file.readline()
for line in lg_file:
elements = line.split()
if elements[0] not in exclude_snps:
lg_snps[elements[0]] = [elements[2], elements[3]]
return lg_snps
def parse_psl_file(psl_file, head):
"""
Parses a psl created by BLAT and returns a dictionary with all the SNPs.
SNPs with multiple alignments will have multiple values (a list of alignments), while
SNPs with a unique alignment will just have one entry in the list.
It also creates a score and percentage identity for each entry. And a field for whether
or not it is the best alignment
"""
snps = {}
#Throw away first 5 lines: Not needed when using -noHead with BLAT
if not head:
for i in range(0, 5):
psl_file.readline()
identity = 0
score = 0
best_align = False
#Parses the psl file and adds the entries to the dictionary called snps
for line in psl_file:
elements = line.split()
score = calc_score(int(elements[0]), int(elements[2]), int(elements[1]), int(elements[4]), int(elements[6]))
identity = 100.0 - calc_millibad(int(elements[12]), int(elements[11]), int(elements[16]), int(elements[15]), int(elements[0]), int(elements[2]), int(elements[1]), int(elements[4])) * 0.1
elements.append(score)
elements.append(identity)
elements.append(best_align)
#a bit of a hack, but I want to sort on the alignment against the longest target seq
#and need that as an int. Not sure how else I'd do it.
elements[14]= int(elements[14])
#elements.append(int(elements[14]))
#print (elements[0], elements[21])
if elements[9] in snps:
snps[elements[9]].append(elements)
else:
snps[elements[9]] = [elements]
return snps
#Creating the data structure sequences should be separate from this
def extract_longest_hit(snps, lg_snps):
#Getting longest alignment for each SNP
equal_length_ali = []
#Sequences: seq_name is key, a list as values with length of sequence,
#a list of SNPs and a dictionary of linkage groups with count as values
sequences = {}
unique = 0
unique_lg = 0
unique_other = 0
multiple_best_hits = {}
for snp in snps:
#Get longest alignment, 9251 of them
sorted_hit_list = sorted(snps[snp], key = lambda ele: int(ele[0]), reverse=True)
#print (sorted_hit_list)
#print(len(sorted_hit_list))
longest_ali = sorted_hit_list[0]
#Add all with equal alignment length to a list
for ali in sorted_hit_list:
if (int(ali[0]) == int(longest_ali[0])):
equal_length_ali.append(ali)
#Go through that list and create some datastructures and so
for best in equal_length_ali:
#If sequence name already in sequences
if best[13] in sequences:
sequences[best[13]][1].append(best[9])
#if linkage group already in datastructure
# best[9]
if best[9] in lg_snps:
#print(lg_snps[best[9]][0])
#print(sequences[best[13]][2])
if lg_snps[best[9]][0] in sequences[best[13]][2]:
#add count of one SNP
sequences[best[13]][2][lg_snps[best[9]][0]] += 1
else:
sequences[best[13]][2][lg_snps[best[9]][0]] = 1
else:
#Don't care if SNP is not in linkage map
if best[9] in lg_snps:
#print ({lg_snps[best[9]][0]: 1})
sequences[best[13]] = [best[14], [best[9]], {lg_snps[best[9]][0]: 1}]
if (len(equal_length_ali) == 1):
unique += 1
if (best[9] in lg_snps):
unique_lg += 1
else:
unique_other += 1
else:
if best[9] in multiple_best_hits:
multiple_best_hits[best[9]].append(best[13])
else:
multiple_best_hits[best[9]] = [best[13]]
equal_length_ali = []
#for equal in equal_length_ali.append:
#print(longest_ali)
return sequences, unique, unique_lg, unique_other, multiple_best_hits
def summary_counts(sequences):
#seq would not exist without being connected to a lg
count_seq_lg = len(sequences)
bases_seq_lg = 0
count_seq_multi_lg = 0
bases_seq_multi_lg = 0
conflict_count_seq_lg = 0
conflict_bases_seq_lg = 0
linked_to_multiple_lgs = {}
sum_multi = 0
for seq in sequences:
#print(sequences[seq])
#if sequences[seq]
bases_seq_lg += int(sequences[seq][0])
#if (len(sequences[seq][1]) > 1):
for lg in sequences[seq][2]:
sum_multi += sequences[seq][2][lg]
if (sum_multi > 1):
#print (sequences[seq])
count_seq_multi_lg += 1
bases_seq_multi_lg += int(sequences[seq][0])
sum_multi = 0
if (len(sequences[seq][2]) > 1):
conflict_count_seq_lg += 1
conflict_bases_seq_lg += int(sequences[seq][0])
linked_to_multiple_lgs[seq] = sequences[seq]
return count_seq_lg, bases_seq_lg, count_seq_multi_lg, bases_seq_multi_lg, conflict_count_seq_lg, conflict_bases_seq_lg, linked_to_multiple_lgs
def lexs_method(snps, lex_snps, lg_snps):
sequences, unique, unique_lg, unique_other, multiple_best_hits = extract_longest_hit(lex_snps, lg_snps)
#print (unique)
#print (unique_lg)
#print (unique_other)
multi_lg = 0
multi_no_lg = 0
for snp in multiple_best_hits:
if snp in lg_snps:
multi_lg += 1
else:
multi_no_lg += 1
#print (multiple_best_hits)
count_seq_lg, bases_seq_lg, count_seq_multi_lg, bases_seq_multi_lg, conflict_count_seq_lg, conflict_bases_seq_lg, linked_to_multiple_lgs = summary_counts(sequences)
lex_and_lg = 0
lex_and_not_lg = 0
for snp in lex_snps:
if snp in lg_snps:
lex_and_lg += 1
else:
lex_and_not_lg += 1
snps_in_lg_and_mapped = 0
snps_not_in_lg_and_mapped = 0
for snp in snps:
if snp in lg_snps:
snps_in_lg_and_mapped += 1
else:
snps_not_in_lg_and_mapped += 1
# summary stats
print ("Sequences\tCount\tTotal\tlength")
print ("Linked to one or more LG\t%d\t%d" % (count_seq_lg, bases_seq_lg))
print ("With multiple LG SNPs linked to them\t%d\t%d" % (count_seq_multi_lg, bases_seq_multi_lg))
print ("Linked to multiple LG (conflict)\t%d\t%d\n" % (conflict_count_seq_lg, conflict_bases_seq_lg))
print ("SNPs\tLG SNPs\tother SNPs")
print ("All\t%d\tNA" % len(lg_snps))
print ("SNPs mapped\t%d\t%d" % (snps_in_lg_and_mapped, snps_not_in_lg_and_mapped))
print ("SNPs mapped (>= 0.95 flank length)\t%d\t%d" % (lex_and_lg, lex_and_not_lg))
print ("SNPs uniquely mapped\t%d\t%d" % (unique_lg, unique_other))
print ("SNPs with multiple best hits\t%d\t%d" % (multi_lg, multi_no_lg))
def extract_hits(file_prefix, snps, lg_snps, ident, length):
#Just get all alignments of SNPs and filter before this point
#equal_length_ali = []
#Sequences: seq_name is key, a list as values with length of sequence,
#a list of SNPs and a dictionary of linkage groups with count as values
sequences = {}
unique = 0
unique_lg = 0
unique_other = 0
multiple_ali_hits = {}
multiple_alignment_snps_lg = open(file_prefix + ".multiple_alignment_snps_lg.tsv", "w")
multiple_alignment_snps_no_lg = open(file_prefix + ".multiple_alignment_snps_no_lg.tsv", "w")
multiple_alignment_snps_lg_ident = open(file_prefix + ".multiple_alignment_snps_lg_ident.tsv", "w")
multiple_alignment_snps_no_lg_ident = open(file_prefix + ".multiple_alignment_snps_no_lg_ident.tsv", "w")
#sys.stderr.write("snps\n")
for snp in snps:
#sys.stderr.write("%s\n" % snp)
#Go through that list and create some datastructures and so
for ali in snps[snp]:
#sys.stderr.write("ali\n" )
#If sequence name already in sequences
#sys.stderr.write("%s\n" %ali)
if ali[13] in sequences:
if ali[9] in lg_snps:
sequences[ali[13]][1].append([ali[9], int(ali[15]), int(ali[16]), int(ali[21]), int(ali[22]), ali[23]])
#if linkage group already in datastructure
if lg_snps[ali[9]][0] in sequences[ali[13]][2]:
#add count of one SNP
sequences[ali[13]][2][lg_snps[ali[9]][0]] += 1
else:
sequences[ali[13]][2][lg_snps[ali[9]][0]] = 1
else:
#Don't care if SNP is not in linkage map
if ali[9] in lg_snps:
#print ({lg_snps[ali[9]][0]: 1})
sequences[ali[13]] = [int(ali[14]), [[ali[9], int(ali[15]), int(ali[16]), int(ali[21]), int(ali[22]), ali[23]]], {lg_snps[ali[9]][0]: 1}]
if (len(snps[snp]) == 1):
unique += 1
if (ali[9] in lg_snps):
unique_lg += 1
else:
unique_other += 1
else:
#print ali
if ali[9] in lg_snps:
for i in ali:
multiple_alignment_snps_lg.write("%s\t" % i)
multiple_alignment_snps_lg.write("\n")
else:
for i in ali:
multiple_alignment_snps_no_lg.write("%s\t" % i)
multiple_alignment_snps_no_lg.write("\n")
if ((ali[9] in lg_snps) and (ali[22] >= ident) and ((float(ali[0])/float(ali[10])) >= length)):
for i in ali:
multiple_alignment_snps_lg_ident.write("%s\t" % i)
multiple_alignment_snps_lg_ident.write("\n")
elif ((ali[22] >= ident) and ((float(ali[0])/float(ali[10])) >= length)):
for i in ali:
multiple_alignment_snps_no_lg_ident.write("%s\t" % i)
multiple_alignment_snps_no_lg_ident.write("\n")
if ali[9] in multiple_ali_hits:
multiple_ali_hits[ali[9]].append(ali[13])
else:
multiple_ali_hits[ali[9]] = [ali[13]]
multiple_alignment_snps_lg.close()
multiple_alignment_snps_no_lg.close()
multiple_alignment_snps_lg_ident.close()
multiple_alignment_snps_no_lg_ident.close()
return sequences, unique, unique_lg, unique_other, multiple_ali_hits
def summary_counts_ole(sequences):
#seq would not exist without being connected to a lg
count_seq_lg = len(sequences)
bases_seq_lg = 0
count_seq_multi_lg = 0
bases_seq_multi_lg = 0
conflict_count_seq_lg = 0
conflict_bases_seq_lg = 0
linked_to_multiple_lgs = {}
sum_multi = 0
for seq in sequences:
#print(sequences[seq])
#if sequences[seq]
bases_seq_lg += int(sequences[seq][0])
#if (len(sequences[seq][1]) > 1):
for lg in sequences[seq][2]:
sum_multi += sequences[seq][2][lg]
if (sum_multi > 1):
#print (sequences[seq])
count_seq_multi_lg += 1
bases_seq_multi_lg += int(sequences[seq][0])
sum_multi = 0
if (len(sequences[seq][2]) > 1):
conflict_count_seq_lg += 1
conflict_bases_seq_lg += int(sequences[seq][0])
linked_to_multiple_lgs[seq] = sequences[seq]
return count_seq_lg, bases_seq_lg, count_seq_multi_lg, bases_seq_multi_lg, conflict_count_seq_lg, conflict_bases_seq_lg, linked_to_multiple_lgs
def oles_method(file_prefix, snps, ole_snps, lg_snps, best_ali_snps, ident, length):
#sequences =
sequences, unique, unique_lg, unique_other, multiple_best_hits = extract_hits(file_prefix, ole_snps, lg_snps, ident, length)
#print (unique)
#print (unique_lg)
#print (unique_other)
multi_lg = 0
multi_no_lg = 0
for snp in multiple_best_hits:
if snp in lg_snps:
multi_lg += 1
else:
multi_no_lg += 1
#print (multiple_best_hits)
count_seq_lg, bases_seq_lg, count_seq_multi_lg, bases_seq_multi_lg, conflict_count_seq_lg, conflict_bases_seq_lg, linked_to_multiple_lgs = summary_counts_ole(sequences)
conflict_seqs = open(file_prefix + ".seqs_linked_to_multiple_lgs.tsv", "w")
#snp_start_pos = {}
#temp = []
#Output format: sequence name, sequence length, SNP name, lg, order, start position, end position, score, identity
#sorted_seq_length = sorted(linked_to_multiple_lgs[seq][0], reverse=True)
sorted_seq_length = sorted(linked_to_multiple_lgs.items(), key=lambda x:x[1], reverse=True)
#print sorted_seq_length
for seq, seq_info in sorted_seq_length:
sorted_snps_start_pos = sorted(seq_info[1], key = lambda ele: ele[1])
#print sorted_snps_start_pos
for snp, start_pos, end_pos, score, identity, best_ali in sorted_snps_start_pos:
conflict_seqs.write("%s\t%s\t%s\t%s\t%s\t%d\t%d\t%d\t%d\t%d" %
(seq, seq_info[0], snp, lg_snps[snp][0], lg_snps[snp][1], start_pos, end_pos, score, identity, best_ali))
conflict_seqs.write("\n")
conflict_seqs.close()
#New data structure. LG is key, with SNP and order as values
lgs = {}
for snp in lg_snps:
if lg_snps[snp][0] in lgs:
lgs[lg_snps[snp][0]].append([snp, int(lg_snps[snp][1])])
else:
lgs[lg_snps[snp][0]] = [[snp, int(lg_snps[snp][1])]]
#print len(lgs)
lg_linked_seq_ident = open(file_prefix + ".lg_linked_seq_ident.tsv", "w")
lg_linked_seq = open(file_prefix + ".lg_linked_seq.tsv", "w")
lg_linked_seq_single = open(file_prefix + ".single_lg_linked_seq.tsv", "w")
for lg in sorted(lgs):
#print lg
#print (lgs[lg])
sorted_snps_order = sorted(lgs[lg], key = lambda ele: ele[1])
#print sorted_snps_order
#I want LG, SNP, order, seq, seq len
for snp, order in sorted_snps_order:
if snp in best_ali_snps:
lg_linked_seq.write("%s\t%s\t%s\t%s\t%s\t%d\t%g\n" % (lg, snp, order, best_ali_snps[snp][0][13], best_ali_snps[snp][0][14], best_ali_snps[snp][0][21], best_ali_snps[snp][0][22]))
if best_ali_snps[snp][0][13] not in linked_to_multiple_lgs:
lg_linked_seq_single.write("%s\t%s\t%s\t%s\t%s\t%d\t%g\n" % (lg, snp, order, best_ali_snps[snp][0][13], best_ali_snps[snp][0][14], best_ali_snps[snp][0][21], best_ali_snps[snp][0][22]))
else:
#Should there be an empty " " between tabs?
lg_linked_seq.write("%s\t%s\t%s\t \t \t \t \n" % (lg, snp, order))
#I want LG, SNP, order, seq, seq len, score, identity, start pos, end pos
if ((snp in best_ali_snps) and (best_ali_snps[snp][0][22] >= ident) and ((float(best_ali_snps[snp][0][0])/float(best_ali_snps[snp][0][10])) >= length)):
lg_linked_seq_ident.write("%s\t%s\t%s\t%s\t%s\t%d\t%g\t%s\t%s\n" % (lg, snp, order, best_ali_snps[snp][0][13], best_ali_snps[snp][0][14], best_ali_snps[snp][0][21], best_ali_snps[snp][0][22], best_ali_snps[snp][0][15], best_ali_snps[snp][0][16]))
else:
lg_linked_seq_ident.write("%s\t%s\t%s\t \t \t \t \t \t \n" % (lg, snp, order))
#lg_linked_seq.write("\n")
lg_linked_seq.close()
lg_linked_seq_ident.close()
lg_linked_seq_single.close()
snp_no_lg = open(file_prefix + ".snp_no_lg.tsv", "w")
for snp in ole_snps:
if snp not in lg_snps:
if snp in best_ali_snps:
#SNP, seq, seq len,
snp_no_lg.write("%s\t%s\t%s\t%d\t%g\t%s\t%s\n" % (snp, best_ali_snps[snp][0][13], best_ali_snps[snp][0][14], best_ali_snps[snp][0][21], best_ali_snps[snp][0][22], best_ali_snps[snp][0][15], best_ali_snps[snp][0][16]))
snp_no_lg.close()
ole_and_lg = 0
ole_and_not_lg = 0
for snp in ole_snps:
if snp in lg_snps:
ole_and_lg += 1
"""
for i in lex_snps[snp][0]:
sys.stderr.write("%s\t" % i)
sys.stderr.write("\n")
"""
else:
ole_and_not_lg += 1
snps_in_lg_and_mapped = 0
snps_not_in_lg_and_mapped = 0
for snp in snps:
if snp in lg_snps:
snps_in_lg_and_mapped += 1
"""
for i in lex_snps[snp][0]:
sys.stderr.write("%s\t" % i)
sys.stderr.write("\n")
"""
else:
snps_not_in_lg_and_mapped += 1
# summary stats
print ("Sequences\tCount\tTotal\tlength")
print ("Linked to one or more LG\t%d\t%d" % (count_seq_lg, bases_seq_lg))
print ("With multiple LG SNPs linked to them\t%d\t%d" % (count_seq_multi_lg, bases_seq_multi_lg))
print ("Linked to multiple LG (conflict)\t%d\t%d\n" % (conflict_count_seq_lg, conflict_bases_seq_lg))
print ("SNPs\tLG SNPs\tother SNPs")
print ("All\t%d\tNA" % len(lg_snps))
print ("SNPs mapped\t%d\t%d" % (snps_in_lg_and_mapped, snps_not_in_lg_and_mapped))
print ("SNPs mapped (>= 0.95 flank length)\t%d\t%d" % (ole_and_lg, ole_and_not_lg))
print ("SNPs uniquely mapped\t%d\t%d" % (unique_lg, unique_other))
print ("SNPs with multiple best hits\t%d\t%d" % (multi_lg, multi_no_lg))
#Rewritten from biopython's implementation: http://biopython.org/DIST/docs/api/Bio.SearchIO.BlatIO-pysrc.html
def calc_millibad(qend, qstart, tend, tstart, matches, repmatches, mismatches, qnuminsert):
# calculates millibad
# adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4
size_mul = 1
millibad = 0
qali_size = size_mul * (qend - qstart)
tali_size = tend - tstart
ali_size = min(qali_size, tali_size)
if ali_size <= 0:
return 0
size_dif = qali_size - tali_size
size_dif = 0 if size_dif < 0 else size_dif
total = size_mul * (matches + repmatches + mismatches)
if total != 0:
millibad = (1000 * (mismatches * size_mul + qnuminsert +
round(3 * math.log(1 + size_dif)))) / total
return millibad
def calc_score(matches, repmatches, mismatches, qnuminsert, tnuminsert):
# calculates score
# adapted from http://genome.ucsc.edu/FAQ/FAQblat.html#blat4
size_mul = 1
return (size_mul * (matches + (repmatches >> 1)) - size_mul * mismatches - qnuminsert - tnuminsert)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=
'The blat .psl file.')
parser.add_argument('-p', '--psl', action='store', help='blat psl file', required=True, type=argparse.FileType('r'))
parser.add_argument('-l', '--lg', action='store', help='linkage map file', required=False, type=argparse.FileType('r'))
parser.add_argument('-s', '--score', action='store', help='fraction score', required=False, type=float, default=0.99)
parser.add_argument('-i', '--ident', action='store', help='only output SNPs with higher than this percent identity', required=False, type=float, default=95.0)
parser.add_argument('-e', '--ead', action='store_false', help='BLAT ran with -noHead', required=False, default=True)
parser.add_argument('-g', '--length', action='store', help='alignment length has to be more than this fraction of total flank sequence length', required=False, type=float, default=0.5)
parser.add_argument('-x', '--exclude', action='store', help='exclude these SNPs from LG analysis', required=False, type=argparse.FileType('r'))
parser.add_argument('-m', '--multiple', action='store_true', help='do not use SNPs mapping multiple times for analysis', required=False, default=False)
args = parser.parse_args()
snps = parse_psl_file(args.psl, args.ead)
prefix = '.'.join(args.psl.name.split('.')[0:-1])
exclude_snps = []
if args.exclude:
prefix += "_exclude"
for line in args.exclude:
exclude_snps.append(line.strip())
#9355 SNPs in the linkage map - excluded snps
lg_snps = read_lg(args.lg, exclude_snps)
#print exclude_snps
print ("Number of SNPs in BLAT psl file: %d" % len(snps))
print ("Number of SNPs in linkage map: %d, number excluded: %d" % (len(lg_snps) + len(exclude_snps), len(exclude_snps)))
uniquely_mapping_snps = {}
ambigously_mapping_snps = {}
uniquely_mapping_snps_in_lg = {}
longest_alignment_for_all_snps = {}
#print (lg_snps)
for snp in snps:
#print (snp)
#Maps only once
if (len(snps[snp]) == 1):
uniquely_mapping_snps[snp] = snps[snp]
if (snp in lg_snps):
#print ("Here")
uniquely_mapping_snps_in_lg[snp] = snps[snp]
else:
ambigously_mapping_snps[snp] = snps[snp]
#Filter based on Lex's rule (0.95 alignment length compared to flank sequence length)
total_num_lex_snp_maps = 0
lex_snps = {}
for snp in snps:
for i in snps[snp]:
#0.95 alignment length compared to flank sequence length
if ((float(i[0])/(int(i[10])-1)) >= 0.95):
total_num_lex_snp_maps += 1
if i[9] in lex_snps:
lex_snps[i[9]].append(i)
else:
lex_snps[i[9]] = [i]
#print("Total number of alignments with (alignment/flank sequence) > 0.95 (Lex's rule): %d" % total_num_lex_snp_maps)
for snp in snps:
#sorted based on the fraction of alignment length divided by flanking sequence length
sorted_hit_list = sorted(snps[snp], key = lambda ele: float(ele[0])/(int(ele[10])-1), reverse=True)
#print (sorted_hit_list[0])
longest_alignment_for_all_snps[snp] = [sorted_hit_list[0]]
ole_snps = {}
ole_snps_ident = {}
best_ali_snps = {}
not_multiple_mapping_snps = {}
best_alignments = 0
secondary_alignments = 0
#I want a separate file with all the "best" alignments by score
best_ali = open(prefix + ".best_alignments.tsv", "w")
#My way, best score for all, plus all other alignments within 99 % of that score
for snp in snps:
#for i in snps[snp]:
#print i
#sorted_score_list = sorted(snps[snp], key = lambda ele: ele[21], reverse=True)
#Sort by score first, then identity, then length of target sequence. Should prioritize hits
#against scaffolds instead of contigs
sorted_score_list = sorted(snps[snp], key=itemgetter(21,22,14), reverse=True)
#print sorted_score_list
#Always add the best scoring alignment, and the one against the longest sequence
sorted_score_list[0][23] = True
ole_snps[snp] = [sorted_score_list[0]]
if ((sorted_score_list[0][22] >= args.ident) and ((float(sorted_score_list[0][0])/float(sorted_score_list[0][10])) >= args.length)):
ole_snps_ident[snp] = [sorted_score_list[0]]
not_multiple_mapping_snps[snp] = [sorted_score_list[0]]
best_ali_snps[snp] = [sorted_score_list[0]]
for i in sorted_score_list[0]:
best_ali.write("%s\t" % i)
best_ali.write("\n")
best_alignments += 1
#print ("sorted_score_list[0]")
#print sorted_score_list[0]
for j in sorted_score_list[1:]:
#Not equal, has to be above args.score
if ((float(j[21])/sorted_score_list[0][21]) >= args.score):
#print("j")
#print j
ole_snps[snp].append(j)
secondary_alignments += 1
if (((float(j[21])/sorted_score_list[0][21]) >= args.score) and (j[22] >= args.ident) and ((float(j[0])/float(j[10])) >= args.length)):
not_multiple_mapping_snps.pop(snp, None)
#This should not happen, but apparently does
if snp not in ole_snps_ident:
ole_snps_ident[snp] = [j]
else:
ole_snps_ident[snp].append(j)
best_ali.close()
if args.multiple:
oles_method(prefix, snps, not_multiple_mapping_snps, lg_snps, best_ali_snps, args.ident, args.length)
else:
oles_method(prefix, snps, ole_snps_ident, lg_snps, best_ali_snps, args.ident, args.length)
"""
1. matches - Number of matching bases that aren't repeats.
2. misMatches - Number of bases that don't match.
3. repMatches - Number of matching bases that are part of repeats.
4. nCount - Number of 'N' bases.
5. qNumInsert - Number of inserts in query.
6. qBaseInsert - Number of bases inserted into query.
7. tNumInsert - Number of inserts in target.
8. tBaseInsert - Number of bases inserted into target.
9. strand - defined as + (forward) or - (reverse) for query strand. In mouse, a second '+' or '-' indecates genomic strand.
10. qName - Query sequence name.
11. qSize - Query sequence size.
12. qStart - Alignment start position in query.
13. qEnd - Alignment end position in query.
14. tName - Target sequence name.
15. tSize - Target sequence size.
16. tStart - Alignment start position in query.
17. tEnd - Alignment end position in query.
18. blockCount - Number of blocks in the alignment.
19. blockSizes - Comma-separated list of sizes of each block.
20. qStarts - Comma-separated list of start position of each block in query.
21. tStarts - Comma-separated list of start position of each block in target.
Get score and % identity: http://genome.ucsc.edu/FAQ/FAQblat.html#blat4
And for the data structure in this program
22. Score
23. Identity
24. Best alignment (true or false)
"""