Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

executable file 99 lines (89 sloc) 1.451 kb
package soso;
use config;
use common;
@ISA = qw(common);
use strict;
# Constructor.
#
# Arguments:
#   $type       - class name to bless the new object into
#   $dbh_handle - value stored under the object's "dbh" key
#
# Returns the blessed hash reference.
sub new {
    my ( $type, $dbh_handle ) = @_;
    return bless { dbh => $dbh_handle }, $type;
}
# Used for yy.pl. yahoo.com can paginate, google.com can't.
# Extract the markup of the result list from a result page.
#
# Arguments:
#   $self - the object the method is called on (not used here)
#   $html - HTML text of the page
#
# Returns the text between the first <ol> that follows <div id="result"
# and the next </ol>. Returns undef when $html is undefined or does not
# contain that structure.
sub strip_result
{
    my ( $self, $html ) = @_;
    my $list_html;
    if ( defined $html ) {
        # Take the capture from the match's list return instead of reading
        # the first capture variable afterwards: after a failed match that
        # variable still holds whatever an earlier successful match
        # (possibly in the caller) left in it.
        ($list_html) = $html =~ m {
            <div\sid="result"
            .*?
            <ol\s*>
            (.*?)       # soso uses ol->li to delimit each record
            </ol>
        }six;
    }
    return $list_html;
}
# Split a result-list fragment into one row per <li> record.
#
# Arguments:
#   $self - the object the method is called on (not used here)
#   $html - HTML fragment containing the <li> records
#
# Returns nothing (bare return) when $html is false. Otherwise returns an
# array reference holding one array reference per matched record:
#   [ link address, title, body text, cite head, cite tail ]
# where "cite head" / "cite tail" are the parts of the <cite> text before
# and after its first whitespace, hyphen, or run of one to three dots.
# Both are undef when the <cite> text contains no such separator.
# NOTE(review): the original comment describes <cite> as "date and URL";
# which of the two lands in head vs. tail depends on the page format -
# confirm against real pages.
sub parse_result
{
    my ( $self, $html ) = @_;
    return unless $html;

    my $record_re = qr{
        <li
        .*?
        href="
        (.*?)           #1. link address
        "
        (?:.*?)
        >
        (.*?)           #2. title
        </a>
        (?:.*?)
        <p\sclass="ds">
        (.*?)           #3. body text
        </p>
        (?:.*?)
        <cite>
        (.*?)           #4. date and URL
        </cite>
    }six;
    my $cite_re = qr/(.*?)(?:\s|-|\.{1,3})(.*)/;

    my @rows;
    while ( $html =~ /$record_re/g ) {
        my ( $link, $title, $body, $cite ) = ( $1, $2, $3, $4 );
        my ( $cite_head, $cite_tail ) = $cite =~ $cite_re;
        push @rows, [ $link, $title, $body, $cite_head, $cite_tail ];
    }
    return \@rows;
}
# Related searches.
# Extract the related-searches fragment from a result page.
#
# Arguments:
#   $self - the object the method is called on (not used here)
#   $html - HTML text of the page
#
# Returns the text between the end of the opening tag of the <div> that
# carries id="rel" and the following <div id="bSearch". Returns undef when
# $html is undefined or does not contain that structure.
sub strip_related_keywords
{
    my ( $self, $html ) = @_;
    my $related_html;
    if ( defined $html ) {
        # Take the capture from the match's list return instead of reading
        # the first capture variable afterwards: after a failed match that
        # variable still holds whatever an earlier successful match
        # (possibly in the caller) left in it.
        ($related_html) = $html =~ m{
            <div
            (?:.*?)
            id="rel"
            .*?
            >
            (.*?)
            <div\sid="bSearch"
        }six;
    }
    return $related_html;
}
# Collect the links found in a related-searches fragment.
#
# Arguments:
#   $self - the object the method is called on (not used here)
#   $html - HTML fragment containing <a ... href="...">...</a> links
#
# Returns nothing (bare return) when $html is false. Otherwise returns an
# array reference holding one [ link address, keyword ] pair per matched
# link; the array is empty when no link matched.
sub get_related_keywords
{
    my ( $self, $html ) = @_;
    return unless $html;

    # Start from an empty array ref so that a fragment without links yields
    # [] rather than undef, the same shape parse_result returns for input
    # without records; callers can then dereference the result safely.
    my $aoh = [];
    while ( $html =~ m{
            <a
            (?:.*?)
            href="
            (.*?)       # link address
            ">
            (.*?)       # keyword
            </a>
        }sgix )
    {
        my ( $link, $keyword ) = ( $1, $2 );
        push @{$aoh}, [ $link, $keyword ];
    }
    return $aoh;
}
1;
Jump to Line
Something went wrong with that request. Please try again.