forked from tanjiti/perl_tools
/
getAlexaAndCat.pl
executable file
·57 lines (48 loc) · 1.47 KB
/
getAlexaAndCat.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;
use feature qw(say);
# Script body.
#
# Usage: getAlexaAndCat.pl <hostlist-file | hostname>
#
# If the single argument names an existing path it is read as a host list
# (one host per line) and one result line per host is written to
# "<argument>_alexaAndType". Otherwise the argument itself is treated as a
# hostname and its result line is printed on STDOUT.

# Put a UTF-8 encoding layer on all three standard streams.
binmode(STDIN, ':encoding(utf8)');
binmode(STDOUT, ':encoding(utf8)');
binmode(STDERR, ':encoding(utf8)');

# Exactly one argument is required. The line break inside the message is
# part of the string literal and is printed as-is.
die "You must specify the file include hostlist for analyze or the hostname
for analyze .\n" if (@ARGV != 1);

my $host = shift;
if (-e $host) {
    my $out = $host."_alexaAndType";
    open my $FH, "<:encoding(UTF-8)", $host or die "cannot open $host for reading $!";
    open my $OUT, ">:encoding(UTF-8)", $out or die "cannot open $out for writing $!";

    # A named loop variable is used instead of $_ so that nothing called
    # from inside the loop can clobber the current line.
    while (my $line = <$FH>) {
        # Strip the line terminator together with any stray "\r" (CRLF
        # files) and surrounding blanks; none of them belong in a host.
        $line =~ s/\A\s+//;
        $line =~ s/\s+\z//;

        # Skip blank lines only. Comparing against '' (rather than testing
        # truth) keeps a line that consists of the single character "0".
        next if $line eq '';

        say $OUT getAlexa($line);
    }
    close $FH;

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close $OUT or die "cannot close $out after writing $!";
}
else {
    say getAlexa($host) if $host ne "";
}
# getAlexa($host)
#
# Query http://data.alexa.com/data for $host and condense the reply into a
# single tab-separated line.
#
# Parameters:
#   $host - the host to look up; one trailing newline is removed.
#
# Returns:
#   "<host>\t<rank>\t<categories>" where
#     <rank>       is the RANK attribute of the first <COUNTRY .../> element
#                  in the reply, or 0 when there is none;
#     <categories> is the TITLE attribute of every <CAT .../> element joined
#                  with ";", or the literal "NONE" when there are none.
#
# A failed request does not die: a warning is written to STDERR and the
# result line carries the "nothing found" values (rank 0, category NONE).
sub getAlexa{
    my $host = shift;
    chomp $host;

    my $UserAgent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0";
    my $browser = LWP::UserAgent->new();
    $browser->agent($UserAgent);

    # Percent-encode the host before it goes into the query string so that
    # characters such as "&", "=", "#", blanks or non-ASCII letters cannot
    # alter or break the request. Letters, digits and "-._~" pass through
    # untouched, so ordinary hostnames produce the same URL as before.
    # The work is done on a copy: the returned line shows the host as given.
    my $escaped = $host;
    utf8::encode($escaped);
    $escaped =~ s/([^A-Za-z0-9\-._~])/sprintf('%%%02X', ord($1))/ge;
    my $uri = 'http://data.alexa.com/data?cli=10&dat=snbamz&url='.$escaped;

    my $response = $browser->get($uri);

    # Only parse the body of a successful reply; an error page is not data.
    # decoded_content may also be undef, so fall back to an empty string to
    # keep the pattern matches below free of "uninitialized" warnings.
    my $content = '';
    if ($response->is_success) {
        $content = $response->decoded_content // '';
    }
    else {
        warn "request for $host failed: ".$response->status_line."\n";
    }

    # NAME accepts anything up to the closing quote, so country names that
    # contain punctuation are matched as well.
    my $alexa = 0;
    $alexa = $1 if $content =~ /<COUNTRY CODE="[A-Z]{2}" NAME="[^"]+" RANK="(\d+)"\/>/;

    my @cats = ($content =~ /<CAT ID="[^"]+" TITLE="([^"]+)" CID="\d+"\/>/g);
    my $catalog = @cats ? join(';', @cats) : "NONE";

    return $host."\t".$alexa."\t".$catalog;
}