-
Notifications
You must be signed in to change notification settings - Fork 9
/
samcount.pl
executable file
·134 lines (117 loc) · 3.03 KB
/
samcount.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/perl
$usage="
samcount v.0.2 (November 2014):
counts reads mapping to isogrops in SAM files
Arguments:
arg1: SAM file (by cluster, contig, or isotig)
arg2: a table in the form 'reference_seq<tab>gene_ID', giving the correspondence of
reference sequences to genes. With 454-deived transcriptome, the gene_ID would be isogroup;
with Trinity-derived transcriptiome,it would be component.
dup.reads=keep|toss : whether to remove exact sequence-duplicate reads mapping to the
same position in the reference. Default keep (duplicates are supposed to be tossed at the
trimming stage).
aligner=gmapper|bowtie2 : aligner that made the SAM file. Default bowtie2.
bowtie2 is assumed to be used in -k mode.
mult.iso=random|toss : (for aligner=gmapper) if a read maps to multiple isogroups, it is
disregarded by default. Set this option to 'random' if you want to randomly pick an
isogroup to assign a count to.
";
my $t1=shift @ARGV or die $usage;
my $t2=shift @ARGV or die $usage;
my $rmdup="keep";
if ("@ARGV"=~/dup.reads=toss/) {
$rmdup="toss";
}
my $miso="toss";
my $aligner="bowtie2";
if ("@ARGV"=~/aligner=gmapper/) {
$aligner="gmapper";
if ("@ARGV"=~/min.mapq=(\d+)/) { $minmapq=$1; }
}
else {
if ("@ARGV"=~/mult.iso=random/) {
$miso="random";
warn "adding a count to a randomly picked isogroup when a read maps to multiple isogroups\n";
}
else { warn "disregarding reads mapping to multiple isogroups\n"; }
}
open SAM, $t1 or die "cannot open $t1\n";
open C2I, $t2 or die "cannot open $t2\n";
my %c2i={};
my %count={};
my %hit={};
my %refhit={};
my $c="";
my $i="";
my $f="";
my $r="";
my $pos="";
my $seq="";
while (<C2I>) {
chop;
($c,$i)=split(/\s+/,$_);
$c2i{$c}=$i;
}
my $mapq;
my $cigar;
my $flag;
my @rest;
my %bestmatch={};
my %trust={};
my %ciglen;
while (<SAM>) {
if ($_=~/^@/) { next;}
chop;
($r,$flag,$c,$pos,$mapq,$cigar,@rest)=split(/\s/,$_);
my $as;
if ("@rest"=~/AS:i:(\d+)/) {
$as=$1;
}
else { warn "cannot find alignment score in @rest\n" and next; }
$i=$c2i{$c};
if ($i!~/\d+/) { warn "$c has no isogroup designation\n" and next;}
my @sseq=grep(/[ATGCatgc-]{30,}/,@rest);
next if (!$sseq[0]);
if ($aligner eq "bowtie2") {
if ($bestmatch{$r} and $as<$bestmatch{$r}) {
#warn "$r:mapq=$mapq:cig=$cigar:AS=$as:best=$bestmatch{$r}: SKIP\n";
next;
}
else {
#warn "$r:mapq=$mapq:cig=$cigar:AS=$as:best=$bestmatch{$r}: RETAIN\n";
$bestmatch{$r}=$as;
}
}
$seq=$sseq[0];
my $toss=0;
if ($rmdup eq "toss") {
foreach $sr (@{$refhit{$c}{$pos}}) {
if ($sr=~/^$seq/ | $seq=~/^$sr/) {
$toss=1;
last;
}
}
}
if ($toss==0) {
push @{$refhit{$c}{$pos}},$seq;
push @{$hit{$r}},$i unless (" @{$hit{$r}} "=~/ $i /);
}
}
foreach $r (keys %hit){
next if ($r=~/HASH/);
if($#{$hit{$r}}>0) {
#warn "$r:isogroups: @{$hit{$r}}\n";
if($miso eq "random") {
my $pick=${$hit{$r}}[rand @{$hit{$r}}];
$count{$pick}++;
}
else { next; }
}
else {
$count{${$hit{$r}}[0]}++;
}
}
foreach $i (sort keys %count) {
next if ($i=~/HASH/);
print "$i\t$count{$i}\n";
}