/
yen.pl
executable file
·106 lines (89 loc) · 2.12 KB
/
yen.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
use encoding 'utf8';
use WWW::Mechanize;
use Data::Dumper;
use DBI;
use Encode qw(decode);
use CGI qw(:standard);
use lib qw(/home/williamjxj/scraper/lib/);
use config;
use yahoo;
use constant SURL => q{http://search.yahoo.com/};
# Search keyword. Taken from the command line when exactly one argument is
# given, otherwise from the CGI parameter 'q'. Both paths decode the raw
# bytes as UTF-8 so $keyword ends up as a character string.
# Dies with a usage message when neither source supplies a keyword.
our $keyword;
if (@ARGV == 1) {
    $keyword = decode("utf8", $ARGV[0]);
}
else {
    my $q = CGI->new;

    # Scalar assignment keeps param() out of list context.
    my $raw = $q->param('q');
    if (defined $raw) {
        # decode() validates the input (malformed sequences are replaced),
        # whereas the internal Encode::_utf8_on() only flips the UTF-8 flag
        # and can leave a corrupt string behind when the bytes are invalid.
        $keyword = decode("utf8", $raw);
    }
    else {
        die "usage: $0 keyword";
    }
}
# Project scraper helper. Direct method call instead of the indirect-object
# form "new yahoo()", which Perl can mis-parse and newer feature bundles
# disable.
my $yh = yahoo->new();

# Column values shared by every row inserted later. Each one is passed
# through the quote() method of $yh->{'dbh'} (presumably a DBI database
# handle -- confirm in yahoo.pm) so it can be interpolated into SQL text.
my $h = {
    'keyword'   => $yh->{'dbh'}->quote($keyword),
    'source'    => $yh->{'dbh'}->quote(SURL),
    # NOTE(review): get_os_stripname() is defined outside this file; it
    # looks like it derives a short script name from __FILE__ -- confirm.
    'createdby' => $yh->{'dbh'}->quote($yh->get_os_stripname(__FILE__)),
};
# Load the search page, then submit the keyword through its search form.
# Any failed request aborts the script with the HTTP status line.
my $mech = WWW::Mechanize->new();
die unless $mech;
$mech->timeout(20);

$mech->get(SURL);
die $mech->response->status_line unless $mech->success;

# The search form is located by its id ('sf'); 'p' is the query field.
$mech->submit_form(
    form_id => 'sf',
    fields  => { p => $keyword },
);
die $mech->response->status_line unless $mech->success;
# Save the URL of the result page in the 'author' slot; it carries the
# charset, the number of results and similar query information.
# The slot is always assigned: quote(undef) produces SQL NULL, so the insert
# statement built later stays valid even when no URI is available (the
# original left the slot undefined, which broke the generated SQL).
$h->{'author'} = $yh->{'dbh'}->quote(
    $mech->uri ? $mech->uri->as_string : undef
);

# Reduce the page to its result section, then parse it. The loop further
# down reads each parsed entry as an array ref: [0] url, [1] title,
# [2] description.
my $t   = $yh->strip_result( $mech->content );
my $aoh = $yh->parse_result($t);
# Yahoo, surprisingly, offers no related-keyword suggestions!
# Store every parsed hit as one row of the CONTENTS_1 table (a constant
# defined outside this file, presumably exported by the config module).
# "insert ignore" is used, so rows the database rejects as duplicates are
# skipped rather than treated as errors.
my $dbh = $yh->{'dbh'};

foreach my $p (@{$aoh}) {
    $h->{'url'}   = $dbh->quote($p->[0]);
    $h->{'title'} = $dbh->quote($p->[1]);
    $h->{'desc'}  = $dbh->quote($p->[2]);

    # pubdate comes from this host's clock via get_time(); 'created' is
    # filled by the database's now(). The two can differ.
    $h->{'pubdate'} = $dbh->quote($yh->get_time('2'));

    # These three are interpolated unquoted.
    # NOTE(review): assumes generate_random() returns a plain number -- confirm.
    $h->{'clicks'}  = $yh->generate_random();
    $h->{'likes'}   = $yh->generate_random(100);
    $h->{'guanzhu'} = $yh->generate_random(100);

    my $sql = qq{ insert ignore into } . CONTENTS_1 . qq{(
        title,
        url,
        author,
        source,
        pubdate,
        tags,
        clicks,
        likes,
        guanzhu,
        createdby,
        created,
        content
    ) values(
        $h->{'title'},
        $h->{'url'},
        $h->{'author'},
        $h->{'source'},
        $h->{'pubdate'},
        $h->{'keyword'},
        $h->{'clicks'},
        $h->{'likes'},
        $h->{'guanzhu'},
        $h->{'createdby'},
        now(),
        $h->{'desc'}
    )};

    # do() returns undef on failure. Report it and carry on with the next
    # row instead of silently dropping the error.
    $dbh->do($sql)
        or warn "insert failed: " . $dbh->errstr . "\n";
}

$dbh->disconnect();

# NOTE(review): the script ends with status 6 even on success. Kept as is
# because callers may rely on it -- confirm whether 0 was intended.
exit 6;