-
Notifications
You must be signed in to change notification settings - Fork 0
/
count-read-length-from-sam.pl
179 lines (137 loc) · 5.27 KB
/
count-read-length-from-sam.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/perl
use strict;
use warnings;
=head1 AUTHOR
Tim A. Dahlmann
tim.dahlmann@rub.de
Chair of general and molecular botany
Ruhr-university Bochum
44801 Bochum, Germany
=head2 VERSION
Verion #3 from 08.10.2013
Previous script: "count-read-length.pl"
Verion #2: Add arguments and argument count for optimized usage.
Version #3: The script count_read_length.pl was adapted for the usage with .sam files created by BEDtools.
=head3 DISCRIPTION
Reads a multi-fasta file and calculates the size and the number of nucleotide or amino acid sequences with this size and prints out a tab delimited file.
The output-file contains the calculation in this type: "seqlength" \t "seqnumber"
=head4 USAGE
perl count-read-length-from-sam.pl input.fasta output.txt
=cut
# Bestimmung der Anzahl an Reads mit gleicher Readlaenge und Ausgabe
# check if we have the correct number of arguments
my $num_args = $#ARGV + 1;
if ($num_args != 2) {
print "\nUsage: perl count-read-length-from-sam inputfile.sam outputfile.txt";
exit;
}
my $inputfile=$ARGV[0]; # <= Inputfile
my $outputfile=$ARGV[1]; # <= Outputfile
open INPUT, "< $inputfile" or die {"Can't open inputfile!\n"}; # Open inputfile
open OUTPUT, "> $outputfile"; # Open outputfile
my @daten;
chomp (@daten = <INPUT>); # Get rid of all newlines
my $whole_seq =0;
my $seq_laenge;
my %hashall;
for (my $i=14; $i <= 73; $i++) { # $max is the afore calculated maximal clonality
$hashall{"$i"} =0; # Create a %hash filled with zeros
}
foreach (@daten) {
if ($_ =~ /0\t([ATGCNRYSWKM]+)\t/) {
my $seq = $1;
$whole_seq += length($1);
$seq_laenge = length($1);
$hashall{"$seq_laenge"} +=1;
}
}
print "The size of all sequnces amounts to $whole_seq nt!\n";
#print %hash;
foreach $seq_laenge (sort {$a <=> $b } keys %hashall) { # Numerisch aufsteigend sortieren
print "$seq_laenge\t$hashall{$seq_laenge}\n"; # Ausgabe auf dem Monitor
print OUTPUT "$seq_laenge\t$hashall{$seq_laenge}\n";
}
#foreach $seq_laenge (sort {$a <=> $b } keys %hashall) { # Numerisch aufsteigend sortieren
# print OUTPUT "$seq_laenge\t$hashall{$seq_laenge}\n"; # Ausgabe in OUTPUT
#}
### Loop for 5'-U
print "\nStarting with U\n";
print OUTPUT "\nStarting with U\n";
my $seq_laengeU;
my %hashU;
for (my $i=14; $i <= 73; $i++) { # $max is the afore calculated maximal clonality
$hashU{"$i"} =0; # Create a %hash filled with zeros
}
foreach (@daten) {
if ($_ =~ /0\t(T[ATGCNRYSWKM]+)\t/) { # Identifiziert die Identifier
my $seq = $1;
$seq_laengeU = length($1); # Bestimmung der Sequenzlängen aller Sequenzen
$hashU{"$seq_laengeU"} +=1; # Schreiben der Anzahl der Sequenzlängen in %hash
}
}
foreach $seq_laengeU (sort {$a <=> $b } keys %hashU) { # Numerisch aufsteigend sortieren
print "$seq_laengeU\t$hashU{$seq_laengeU}\n";
print OUTPUT "$seq_laengeU\t$hashU{$seq_laengeU}\n"; # Ausgabe auf dem Monitor
}
### Loop for 5'-A
print "\nStarting with A\n";
print OUTPUT "\nStarting with A\n";
my $seq_laengeA;
my %hashA;
for (my $i=14; $i <= 73; $i++) { # $max is the afore calculated maximal clonality
$hashA{"$i"} =0; # Create a %hash filled with zeros
}
foreach (@daten) {
if ($_ =~ /0\t(A[ATGCNRYSWKM]+)\t/) { # Identifiziert die Identifier
my $seq = $1;
$seq_laengeA = length($1); # Bestimmung der Sequenzlängen aller Sequenzen
$hashA{"$seq_laengeA"} +=1; # Schreiben der Anzahl der Sequenzlängen in %hash
}
}
foreach $seq_laengeA (sort {$a <=> $b } keys %hashA) { # Numerisch aufsteigend sortieren
print "$seq_laengeA\t$hashA{$seq_laengeA}\n";
print OUTPUT "$seq_laengeA\t$hashA{$seq_laengeA}\n"; # Ausgabe auf dem Monitor
}
### Loop for 5'-G
print "\nStarting with G\n";
print OUTPUT "\nStarting with G\n";
my $seq_laengeG;
my %hashG;
for (my $i=14; $i <= 73; $i++) { # $max is the afore calculated maximal clonality
$hashG{"$i"} =0; # Create a %hash filled with zeros
}
foreach (@daten) {
if ($_ =~ /0\t(G[ATGCNRYSWKM]+)\t/) { # Identifiziert die Identifier
my $seq = $1;
$seq_laengeG = length($1); # Bestimmung der Sequenzlängen aller Sequenzen
$hashG{"$seq_laengeG"} +=1; # Schreiben der Anzahl der Sequenzlängen in %hash
}
}
foreach $seq_laengeG (sort {$a <=> $b } keys %hashG) { # Numerisch aufsteigend sortieren
print "$seq_laengeG\t$hashG{$seq_laengeG}\n";
print OUTPUT "$seq_laengeG\t$hashG{$seq_laengeG}\n"; # Ausgabe auf dem Monitor
}
### Loop for 5'-C
print "\nStarting with C\n";
print OUTPUT "\nStarting with C\n";
my $seq_laengeC;
my %hashC;
for (my $i=14; $i <= 73; $i++) { # $max is the afore calculated maximal clonality
$hashC{"$i"} =0; # Create a %hash filled with zeros
}
foreach (@daten) {
if ($_ =~ /0\t(C[ATGCNRYSWKM]+)\t/) { # Identifiziert die Identifier
my $seq = $1;
$seq_laengeC = length($1); # Bestimmung der Sequenzlängen aller Sequenzen
$hashC{"$seq_laengeC"} +=1; # Schreiben der Anzahl der Sequenzlängen in %hash
}
}
foreach $seq_laengeC (sort {$a <=> $b } keys %hashC) { # Numerisch aufsteigend sortieren
print "$seq_laengeC\t$hashC{$seq_laengeC}\n";
print OUTPUT "$seq_laengeC\t$hashC{$seq_laengeC}\n"; # Ausgabe auf dem Monitor
}
#while (($key, $wert) = each %hash) {
# print "Der Wert von $key ist $wert.\n";
#}
close INPUT;
close OUTPUT;