-
Notifications
You must be signed in to change notification settings - Fork 0
/
count-read-length.pl
137 lines (109 loc) · 4.35 KB
/
count-read-length.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/perl
use strict;
use warnings;
=head1 AUTHOR
Dr. Tim A. Dahlmann
tim.dahlmann@rub.de
Chair of General and Molecular Botany
Ruhr-University Bochum
Universitaetsstr. 150
44801 Bochum, Germany
=head2 VERSION
Verion #4 from 08.10.2020
Previous script: "count-read-length.pl"
Version #2: Add arguments and argument count for optimized usage.
Version #3: Add parameters for fastq analysis and count for all sequences.
Version #4: Simplify coding to make it faster and to optimize the the data output.
=head3 DISCRIPTION
This perl script (tested for perl 5.26.2) reads in a multi-fasta or multi-fastq file and calculates the size and
the number of sequences/reads of a specific length in total and of those starting with either U, A, G, or C.
It prints out the results on screen and into a tab-delimited text file. The number of total reads and the sum of U,A,G,C reads
might vary because of ambiguous base calling at 5'-end (N,R,Y,S,W,K,M).
=head4 USAGE
perl count-read-length.pl input.fasta output.txt
=cut
# Identification of reads with the same size and same 5'-nucleotide
my $inputfile=$ARGV[0]; # <= Inputfile
my $outputfile=$ARGV[1]; # <= Outputfile
if ($inputfile =~ /--help/) {
print "\nUsage: perl read-length.pl inputfile.fasta outputfile.txt\n\nThis perl script (tested for perl 5.26.2) reads in a multi-fasta or multi-fastq file (inputfile.fasta) and calculates the size and the number of sequences/reads of a specific length in total and of those starting with either U, A, G, or C.
It prints out the results on screen and into a tab-delimited text file (outputfile.txt).
The number of total reads and the sum of U,A,G,C reads might vary from the number of total reads because of ambiguous nucleotides at the 5'-end (N,R,Y,S,W,K,M).";
exit;
}
# check if we have the correct number of arguments
my $num_args = $#ARGV + 1;
if ($num_args != 2) {
print "\nUsage: perl read-length.pl inputfile.fasta outputfile.txt";
exit;
}
open INPUT, "< $inputfile" or die {"Can't open inputfile!\n"}; # Open the input file
open OUTPUT, "> $outputfile"; # Open the output file
my @daten;
chomp (@daten = <INPUT>); # Get rid of all newlines
my $whole_seq =0;
my $seq_laenge;
my $seq_laengeU;
my $seq_laengeA;
my $seq_laengeG;
my $seq_laengeC;
my %hashall;
my %hashU;
my %hashA;
my %hashG;
my %hashC;
# Identify of the inputfile contains fasta or fastq sequences
if ($daten[0] =~ /^>/) {
print "The input files contains fasta sequences!\n";
}
if ($daten[0] =~ /^@/) {
print "The input file contains fastq sequences!\n";
}
# Borders ($min and $max) of the read length that should be taken into account
my $min = 14;
my $max = 73;
for (my $i=$min; $i <= $max; $i++) { # Create hashes filled with zeros to avoid gaps
$hashall{"$i"} =0;
$hashU{"$i"} =0;
$hashA{"$i"} =0;
$hashG{"$i"} =0;
$hashC{"$i"} =0;
}
foreach (@daten) {
if ($_ =~ /^[ATGCNRYSWKM]+$/) { # identify all sequences
my $seq = $_;
#$whole_seq += length($_); # calculate the sum of all reads length in nt
$seq_laenge = length($_); # calculate read length
$hashall{"$seq_laenge"} +=1; # count reads with the same read length and write it to %hashall
}
if ($_ =~ /^T[ATGCNRYSWKM]+$/) { # 5'-U (DNA: T, RNA: U)
my $seq = $_;
$seq_laengeU = length($_);
$hashU{"$seq_laengeU"} +=1;
}
if ($_ =~ /^A[ATGCNRYSWKM]+$/) { # 5'-A
my $seq = $_;
$seq_laengeA = length($_);
$hashA{"$seq_laengeA"} +=1;
}
if ($_ =~ /^G[ATGCNRYSWKM]+$/) { # 5'-G
my $seq = $_;
$seq_laengeG = length($_);
$hashG{"$seq_laengeG"} +=1;
}
if ($_ =~ /^C[ATGCNRYSWKM]+$/) { # 5'-C
my $seq = $_;
$seq_laengeC = length($_);
$hashC{"$seq_laengeC"} +=1;
}
}
print "size\t#reads\t5'-U\t5'-A\t5'-G\t5'-C\n"; # header for print on screen
print OUTPUT "size\t#reads\t5'-U\t5'-A\t5'-G\t5'-C\n"; # header for OUTPUT file
my $x = $min; # counter starting with the minimal size border $min
foreach $seq_laenge (sort {$a <=> $b } keys %hashall) { # numerical sort
print "$seq_laenge\t$hashall{$seq_laenge}\t$hashU{$x}\t$hashA{$x}\t$hashG{$x}\t$hashC{$x}\n"; # print on screen
print OUTPUT "$seq_laenge\t$hashall{$seq_laenge}\t$hashU{$x}\t$hashA{$x}\t$hashG{$x}\t$hashC{$x}\n"; # print in OUTPUT
$x++; # add +1 to the minimal border, $max will be reached automatically
}
close INPUT;
close OUTPUT;