Permalink
Browse files

craiglist-scrape

  • Loading branch information...
0 parents commit 9c8d8ad1d3ffbb7af5aa30d0588e38a44c613023 william committed Jul 1, 2012
Showing with 3,020 additions and 0 deletions.
  1. +7 −0 .gitignore
  2. +8 −0 bat.sh
  3. +21 −0 bat1.sh
  4. +21 −0 bat2.sh
  5. +22 −0 cacraig.sh
  6. +267 −0 cacraigslist.pl
  7. +219 −0 cajobs.pl
  8. +246 −0 category.pl
  9. +345 −0 city.pl
  10. +12 −0 clear_log.sh
  11. +497 −0 craig.pm
  12. +30 −0 db.sh
  13. +6 −0 env.pl
  14. +220 −0 housing.pl
  15. +245 −0 jobs.pl
  16. +12 −0 mail.sh
  17. +13 −0 mail_resumes.sh
  18. +6 −0 readme
  19. +93 −0 table.sql
  20. +293 −0 uc.pl
  21. +8 −0 usbat.sh
  22. +42 −0 uscraig.sh
  23. +289 −0 uscraigslist.pl
  24. +50 −0 usdata.sh
  25. +27 −0 uuu.sh
  26. +21 −0 uuu_resumes.sh
7 .gitignore
@@ -0,0 +1,7 @@
+bak/
+docs/
+html/
+test/
+mysql/
+*.log
+~
8 bat.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+cd $HOME/craig/
+
+$HOME/craig/cajobs.pl -f | while read city
+do
+ $HOME/craig/cajobs.pl -c "$city" -i jobs
+done
21 bat1.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Manually test.
+
+cajobs.pl -c "medicine hat" -i jobs
+cajobs.pl -c "peace river country" -i jobs
+cajobs.pl -c "red deer" -i jobs
+cajobs.pl -c "comox valley" -i jobs
+cajobs.pl -c "fraser valley" -i jobs
+cajobs.pl -c "kelowna / okanagan" -i jobs
+cajobs.pl -c "peace river country" -i jobs
+cajobs.pl -c "prince george" -i jobs
+cajobs.pl -c "sunshine coast" -i jobs
+cajobs.pl -c "new brunswick" -i jobs
+cajobs.pl -c "st john's, NL" -i jobs
+cajobs.pl -c "niagara region" -i jobs
+cajobs.pl -c "owen sound" -i jobs
+cajobs.pl -c "sault ste marie" -i jobs
+cajobs.pl -c "thunder bay" -i jobs
+cajobs.pl -c "prince edward island" -i jobs
+cajobs.pl -c "quebec city" -i jobs
+cajobs.pl -c "territories" -i jobs
21 bat2.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# ft mcmurray
+# peace river country
+# kelowna / okanagan
+#----------------------------------
+# craigslist_category
+# craigslist_city
+# craigslist_country_state
+# craigslist_item
+# craigslist_topic
+#----------------------------------
+
+# MYSQL="mysql -u craig -pwilliam -D craig"
+# today=`date +'%y%m%d'`
+
+cd $HOME/craig/
+
+$HOME/craig/housing.pl -f | while read city
+do
+ $HOME/craig/housing.pl -c "$city" -i housing
+done
22 cacraig.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+export PATH=.:$PATH
+JOB='cacraigslist.pl'
+jobs='jobs'
+
+cd $HOME/craig/
+
+if [ $# -ne 1 ]; then
+ echo "Which category to download ? 1:jobs; 2:services; 3:gigs"
+ exit;
+fi
+
+$JOB -f | while read city
+do
+ $JOB -j "$jobs" -l | while read item
+ do
+ echo $JOB -j "$jobs" -c "$city" -i "$item"
+ $JOB -j "$jobs" -c "$city" -i "$item"
+ done
+done
+
267 cacraigslist.pl
@@ -0,0 +1,267 @@
+#! /cygdrive/c/Perl/bin/perl.exe
+
+use lib qw(../lib/);
+use craig_config;
+use db;
+use craig;
+
+use warnings;
+use strict;
+use Data::Dumper;
+use FileHandle;
+use WWW::Mechanize;
+use DBI;
+use Getopt::Long;
+
+$| = 1;      # autoflush STDOUT
+undef $/;    # no input record separator: a later readline returns everything
+
+#-----------------------------------
+# 0. initialize:
+#-----------------------------------
+our ( $num, $start_time, $end_time, $end_date ) = ( 0, 0, 0, '' );
+our ( $start_url, $page_url, $todate ) = ( undef, undef, INTERVAL_DATE );
+our ( $mech, $db, $craig, $log );
+our ( $dbh, $sth );
+
+$start_time = time;
+
+my ( $host, $user, $pass, $dsn ) = ( HOST, USER, PASS, DSN );
+$dsn .= ":hostname=$host";
+$db  = db->new( $user, $pass, $dsn );
+$dbh = $db->{dbh};
+
+$craig = craig->new($dbh) or die;
+
+$log = $craig->get_filename(__FILE__);
+$craig->set_log($log);
+$craig->write_log( "[" . $log . "]: start at: [" . localtime() . "]." );
+
+#-----------------------------------
+# Command line.
+#-----------------------------------
+my ( $city, $item, $keywords, $email );
+my ( $first, $help, $version, $list );
+my $jobs = 'jobs';
+
+GetOptions(
+    'jobs=s'     => \$jobs,
+    'first'      => \$first,
+    'list'       => \$list,
+    'todate=s'   => \$todate,
+    'city=s'     => \$city,
+    'item=s'     => \$item,
+    'keywords=s' => \$keywords,
+    'email=s'    => \$email,
+    'help|?'     => \$help,
+    'version'    => \$version
+) or usage();
+
+usage() if $help;
+
+# -f: print the first column of every row from select_ca_cities(), then stop.
+if ($first) {
+    foreach my $row ( @{ $craig->select_ca_cities() } ) {
+        print $row->[0] . "\n";
+    }
+    exit 1;
+}
+
+# Whatever -j says, this script works on 'jobs' and stores everything in
+# a single table (the original if/else set the same values in both arms).
+$jobs = 'jobs';
+my $db_name = 'craigslist_cajobs';
+
+# -l: print the first column of every row from select_items(), then stop.
+if ($list) {
+    foreach my $row ( @{ $craig->select_items($jobs) } ) {
+        print $row->[0] . "\n";
+    }
+    exit 2;
+}
+if ($version) {
+    print <<EOF;
+
+$0: Version 2.0
+EOF
+    exit 2;
+}
+
+# date +'%a %d %b' -d "2 day ago"
+if ($todate) {
+    $end_date = $craig->get_end_date($todate);
+}
+
+# Build the start URL from the city's base URL plus the category path.
+if ( $city && $item ) {
+    my $city_url = $craig->select_city($city);
+    die "No such city: <" . $city . ">, $0 quit." unless ($city_url);
+
+    my $item_path;
+    if ( ( $jobs eq 'resumes' ) && ( $item eq 'resumes' ) ) {
+        $item_path = 'res/';
+    }
+    else {
+        $item_path = $craig->select_category( $item, $jobs );
+        die "No such category: <" . $item . ">, $0 quit." unless ($item_path);
+    }
+
+    $start_url = $city_url . $item_path;
+    $craig->write_log( "URL: <" . $start_url . ">." );
+}
+
+# Neither -c nor -i has a default here, so without both there is nothing
+# to fetch. Say so now instead of failing later on an undefined URL.
+die "Both -c city and -i item are required, $0 quit.\n"
+  unless ( defined $start_url );
+
+#-----------------------------------
+# Scrape: walk the listing pages starting at $start_url, open every item
+# on each page and store one row per item in $db_name.
+# Exit status: 6 when parse_date() finds nothing to process on a page,
+# 8 when the listing runs out of "next page" links.
+#-----------------------------------
+$mech = WWW::Mechanize->new( autocheck => 0 );
+if ( $start_url =~ m/cgi-bin/ ) {
+    $mech->get($start_url);
+    $mech->success or die $mech->response->status_line;
+    $page_url = $craig->parse_cgi_page( $mech->content );
+}
+else {
+    $page_url = $start_url;
+}
+
+# Set when parse_date() returns nothing for a page.
+my $hit_end_date = 0;
+
+PAGE:
+while (1) {
+    $mech->get($page_url);
+    $mech->success or die $mech->response->status_line;
+
+    # Only parse data before $end_date.
+    my $ht = $craig->parse_date( $end_date, $mech->content );
+    unless ($ht) {
+        $hit_end_date = 1;
+        last PAGE;
+    }
+
+    $page_url = $craig->parse_next_page($ht);
+
+  ITEM:
+    foreach my $t ( @{ $craig->parse_item_main($ht) } ) {
+        my $url = $t->[0];
+
+        # $t->[2] is the location; drop the 'img' marker and anything
+        # from the first ')' onwards.
+        $t->[2] = '' if ( $t->[2] eq 'img' );
+        $t->[2] =~ s/\).*$//s;
+
+        $num++;
+
+        # follow_link() returns undef when the link is not on the current
+        # page; nothing was fetched, so there is nothing to go back from.
+        my $response = $mech->follow_link( url => $url );
+        next ITEM unless ($response);
+
+        # A failed response still becomes the current page. Return to the
+        # listing, otherwise no later link on this page can be found.
+        unless ( $mech->success ) {
+            $mech->back();
+            next ITEM;
+        }
+
+        my ( $pdt, $pemail, $phone, $web, $relevant, $email1 );
+        if ( ( $jobs eq 'resumes' ) && ( $item eq 'resumes' ) ) {
+            ( $pdt, $pemail, $phone, $web, $relevant ) =
+              $craig->parse_detail_resumes( $mech->content );
+        }
+        else {
+            ( $pdt, $pemail, $phone, $web, $relevant, $email1 ) =
+              $craig->parse_detail( $mech->content );
+        }
+
+        $pemail = '' unless ($pemail);
+        $pdt    = ' ' unless ($pdt);
+
+        # With a single address, use it for both fields. With two, swap
+        # them when $pemail is not a craigslist.org address.
+        if ( $pemail && !$email1 ) {
+            $email1 = $pemail;
+        }
+        elsif ( $pemail && ( $pemail !~ m/\@craigslist.org/ ) && $email1 ) {
+            ( $pemail, $email1 ) = ( $email1, $pemail );
+        }
+
+        # Every value goes through quote(), including the ones that used
+        # to be pasted between literal apostrophes.
+        my $q_url      = $dbh->quote( $t->[0] );
+        my $q_keywords = $dbh->quote( $t->[1] );
+        my $q_location = $dbh->quote( $t->[2] );
+        my $q_relevant = $dbh->quote($relevant);
+        my $q_item     = $dbh->quote($item);
+        my $q_posted   = $dbh->quote($pdt);
+        my $q_pemail   = $dbh->quote($pemail);
+        my $q_email1   = $dbh->quote($email1);
+        my $q_phone    = $dbh->quote($phone);
+        my $q_web      = $dbh->quote($web);
+        my $q_city     = $dbh->quote($city);      # st john's,NL
+        my $q_category = $dbh->quote($jobs);
+
+        # add column email1 to craigslist_usjobs on July 15, 2010.
+        if ( $jobs eq 'jobs' ) {
+
+            # NOTE(review): column 'email' receives $email1 and column
+            # 'email1' receives $pemail, as in the original - confirm
+            # this crossing is intended.
+            $sth = $dbh->do(
+                qq{ insert ignore into } . $db_name . qq{
+                (url,keywords,relevant,location,item,post_time,email,phone,
+                web,city,category,date, email1)
+                values($q_url,$q_keywords,$q_relevant,$q_location,$q_item,$q_posted,$q_email1,
+                $q_phone, $q_web, $q_city,$q_category,now(), $q_pemail) }
+            );
+        }
+        else {
+            $sth = $dbh->do(
+                qq{ insert ignore into } . $db_name . qq{
+                (url,keywords,relevant,location,item,post_time,email,phone,
+                web,city,category,date)
+                values($q_url,$q_keywords,$q_relevant,$q_location,$q_item,$q_posted,$q_pemail,
+                $q_phone, $q_web, $q_city,$q_category,now())}
+            );
+        }
+
+        $mech->back();
+    }
+
+    last PAGE unless ($page_url);
+}
+
+$dbh->disconnect();
+$end_time = time;
+
+if ($hit_end_date) {
+    $craig->write_log( "Terminated: Total [$todate] days' data (end at: $end_date): [ " . ( $end_time - $start_time ) . " ] seconds used.\n" );
+    $craig->write_log( "[$jobs],[$city],[$item]: There are total [ $num ] records was processed succesfully!\n");
+    $craig->write_log("==============================================\n");
+    $craig->close_log();
+    exit 6;
+}
+
+$craig->write_log( "Total [$todate] days' data (end at: $end_date): [ " . ( $end_time - $start_time ) . " ] seconds used.\n" );
+$craig->write_log( "[$jobs],[$city],[$item]: There are total [ $num ] records was processed succesfully!\n");
+$craig->write_log("----------------------------------------------\n");
+$craig->close_log();
+
+exit 8;
+
+# Print the command-line help on STDOUT and exit with status 3.
+# Lists only invocations this script can carry out: it has no default
+# city or item, and it has no -d option.
+sub usage {
+    print <<HELP;
+Usage:
+ $0 -c city -i category [-t days] [-j jobs]
+ or:
+ $0 -f
+ or:
+ $0 -l [-j jobs]
+ or:
+ $0 -h [-v]
+Description:
+ -t from what date to download? default it's from 2 days before.
+ -c city, which city to scrape?
+ -i category/item, which category/item to scrape?
+ -j category group; only 'jobs' is used.
+ -f list the cities, one per line, and quit
+ -l list the items, one per line, and quit
+ -h this help
+ -v version
+
+Examples:
+ (1) $0 -c vancouver -i services # scrape vancouver's services
+ (2) $0 -c calgary -i 'services'
+ (3) $0 -f # list cities
+ (4) $0 -h # get help
+ (5) $0 -v # get version
+
+HELP
+    exit 3;
+}
+
219 cajobs.pl
@@ -0,0 +1,219 @@
+#! /cygdrive/c/Perl/bin/perl.exe
+
+use lib qw(../lib/);
+use craig_config;
+use db;
+use craig;
+
+use warnings;
+use strict;
+use Data::Dumper;
+use FileHandle;
+use WWW::Mechanize;
+use DBI;
+use Getopt::Long;
+
+$| = 1;      # autoflush STDOUT
+undef $/;    # no input record separator: a later readline returns everything
+
+#-----------------------------------
+# 0. initialize:
+#-----------------------------------
+our ($num, $start_time, $end_time) = (0, 0, 0);
+our ($start_url, $page_url, $todate) = (URL4, undef, INTERVAL_DATE);
+our ($end_date, $today) = ('', []);
+our ($mech, $db, $craig, $log);
+our ($dbh, $sth);
+
+$start_time = time;
+
+my ($host, $user, $pass, $dsn) = (HOST, USER, PASS, DSN);
+$dsn .= ":hostname=$host";
+$db  = db->new($user, $pass, $dsn);
+$dbh = $db->{dbh};
+
+$craig = craig->new($dbh) or die;
+
+$log = $craig->get_filename(__FILE__);
+$craig->set_log($log);
+$craig->write_log("[".$log."]: start at: [".localtime() . "].");
+
+#-----------------------------------
+# Command line. City and item start from the configured defaults.
+#-----------------------------------
+my ($city, $item) = (DEFAULT_CITY, DEFAULT_CATEGORY);
+my ($keywords, $email);
+my ($first, $help, $version);
+
+GetOptions(
+    'first'      => \$first,
+    'todate=s'   => \$todate,
+    'city=s'     => \$city,
+    'item=s'     => \$item,
+    'keywords=s' => \$keywords,
+    'email=s'    => \$email,
+    'help|?'     => \$help,
+    'version'    => \$version
+) or usage();
+
+usage() if $help;
+
+# -f: print every column of every row from select_ca_cities(), one value
+# per line, then stop.
+if ($first) {
+    foreach my $row (@{ $craig->select_ca_cities() }) {
+        print $_ . "\n" foreach @{$row};
+    }
+    exit 1;
+}
+if ($version) {
+    print <<EOF;
+
+$0: Version 2.0
+EOF
+    exit 2;
+}
+# date +'%a %d %b' -d "2 day ago"
+if ($todate) {
+    $end_date = $craig->get_end_date($todate);
+}
+
+# Build the start URL from the city's base URL plus the category path.
+if ($city && $item) {
+    my $city_url = $craig->select_city($city);
+    die "No such city: <".$city.">, $0 quit." unless ($city_url);
+
+    my $item_path = $craig->select_category($item);
+    die "No such category: <".$item.">, $0 quit." unless ($item_path);
+
+    $start_url = $city_url . $item_path;
+    $craig->write_log("URL: <".$start_url.">.");
+    print $start_url . "\n";
+}
+
+# Run the keyword and/or e-mail lookup; the return values are not used here.
+if ($keywords && $email) {
+    $craig->select_keywords_email($keywords, $email);
+}
+elsif ($keywords) {
+    $craig->select_keywords($keywords);
+}
+elsif ($email) {
+    $craig->select_email($email);
+}
+
+$mech = WWW::Mechanize->new( autocheck => 0 );
+
+#-----------------------------------
+# 1. scrape:
+# Walk the listing pages, open every item on each page and store one row
+# per item (only when parse_detail() yields a defined e-mail) in TOPIC.
+# Exit status: 6 when parse_date() finds nothing to process on a page,
+# 8 when the listing runs out of "next page" links.
+#-----------------------------------
+
+$mech->get($start_url);
+$mech->success or die $mech->response->status_line;
+$page_url = $craig->parse_cgi_page($mech->content);
+
+# Set when parse_date() returns nothing for a page.
+my $hit_end_date = 0;
+
+PAGE:
+while (1) {
+    $mech->follow_link(url => $page_url);
+    $mech->success or die $mech->response->status_line;
+
+    # Only parse data before $end_date.
+    my $ht = $craig->parse_date($end_date, $mech->content);
+    unless ($ht) {
+        $hit_end_date = 1;
+        last PAGE;
+    }
+
+    $page_url = $craig->parse_next_page($ht);
+
+  ITEM:
+    foreach my $t (@{ $craig->parse_main($ht) }) {
+        my $url = $t->[0];
+
+        $num++;
+
+        # follow_link() returns undef when the link is not on the current
+        # page; nothing was fetched, so there is nothing to go back from.
+        my $response = $mech->follow_link(url => $url);
+        next ITEM unless ($response);
+
+        # A failed response still becomes the current page. Return to the
+        # listing, otherwise no later link on this page can be found.
+        unless ($mech->success) {
+            $mech->back();
+            next ITEM;
+        }
+
+        my ($pdt, $pemail, $phone, $web, $relevant) = $craig->parse_detail($mech->content);
+        if (defined $pemail) {
+            my $t0 = $dbh->quote($t->[0]);
+            my $t1 = $dbh->quote($t->[1]);
+            my $t2 = $dbh->quote($t->[2]);
+            my $t3 = $dbh->quote($t->[3]);
+            my $t4 = $dbh->quote($t->[4]);
+            $pdt = ' ' unless ($pdt);
+            $phone    = $dbh->quote($phone);
+            $web      = $dbh->quote($web);
+            $relevant = $dbh->quote($relevant);
+
+            my $c1 = $dbh->quote($city);    # st john's,NL
+            $craig->write_log("No: ".($num)." -- [".$t0.", ".$t1.", ".$t2.", ".$t3.", ".$t4.", ".$pdt.", ".$pemail.", ".$phone.", ".$web.", ".$c1.", ".$item."]\n");
+
+            # These three used to be pasted between literal apostrophes;
+            # quote() escapes whatever they contain.
+            my $q_posted = $dbh->quote($pdt);
+            my $q_email  = $dbh->quote($pemail);
+            my $q_item   = $dbh->quote($item);
+
+            $sth = $dbh->do(qq{ insert ignore into }.TOPIC.qq{
+                (url,keywords,relevant,location,item_url,item,post_time,email,phone,
+                web,city,category,date)
+                values($t0,$t1,$relevant,$t2,$t3,$t4,$q_posted,$q_email,
+                $phone, $web, $c1,$q_item,now())});
+        }
+
+        $mech->back();
+    }
+
+    last PAGE unless ($page_url);
+}
+
+$dbh->disconnect();
+$end_time = time;
+
+if ($hit_end_date) {
+    $craig->write_log("$todate dates data: total [ " . ($end_time - $start_time) . " ] seconds used.\n");
+    $craig->write_log( "There are total [ $num ] records was processed succesfully!\n");
+    $craig->close_log();
+    exit 6;
+}
+
+$craig->write_log("$todate dates data: total [ " . ($end_time-$start_time) . " ] seconds used.\n");
+$craig->write_log( "There are total [ $num ] records was processed succesfully!\n");
+$craig->write_log("----------------------------------------------\n");
+$craig->close_log();
+
+exit 8;
+
+
+# Print the command-line help on STDOUT and exit with status 3.
+sub usage
+{
+    print <<HELP;
+Usage:
+ $0
+ or:
+ $0 -c city -i category
+ or:
+ $0 -t 3
+ or:
+ $0 -k keyword -e email
+ or:
+ $0 -f
+ or:
+ $0 -h [-v]
+Description:
+ -t from what date to download? default it's from 2 days before.
+ -c city, which city to scrape?
+ -i category/item, which category/item to scrape?
+ -k search by keywords, what keyword to search?
+ -e search by email, what email to search
+ -f list the cities, one per line, and quit
+ -h this help
+ -v version
+
+Examples:
+ (1) $0 # use default
+ (2) $0 -c vancouver -i jobs # scrape vancouver's jobs
+ (3) $0 -c calgary -i 'software / qa / dba'
+ (4) $0 -k 'php developer' -e 'email\@dummy.com' # search keywords & email
+ (5) $0 -k 'php developer' # search keywords
+ (6) $0 -e 'email\@dummy.com' # search email
+ (7) $0 -f # list cities
+ (8) $0 -h # get help
+ (9) $0 -v # get version
+
+HELP
+    exit 3;
+}
+
246 category.pl
@@ -0,0 +1,246 @@
+#! /cygdrive/c/Perl/bin/perl.exe
+##!/usr/bin/perl -w
+# $Id$
+
+use lib qw(./);
+use config;
+use common;
+# use parse;
+
+use warnings;
+use strict;
+use Data::Dumper;
+use FileHandle;
+use WWW::Mechanize;
+use DBI;
+
+$| = 1;      # autoflush STDOUT
+undef $/;    # no input record separator: a later readline returns everything
+
+#-----------------------------------
+# 0. initialize:
+#-----------------------------------
+our ($html, $aohoa) = ('', []);
+our ($num1, $num2, $start_time, $end_time) = (0,0,0,0);
+my ($mech, $dbh, $sth, @row) = (undef, undef, undef, ());
+
+my $comm = common->new() or die;
+my $log = $comm->get_filename(__FILE__) . '_' . $comm->get_time(1);
+$comm->set_log($log);
+$comm->write_log("\n[".$log."]: start at: [".localtime() . "].");
+
+$start_time = time;
+
+$mech = WWW::Mechanize->new( autocheck => 1 );
+
+my ($host, $user, $pass, $dsn) = (HOST, USER, PASS, DSN);
+$dsn .= ":hostname=$host";
+$dbh = DBI->connect($dsn, $user, $pass, { RaiseError=>1 });
+
+#-----------------------------------
+# 1. scrape:
+#-----------------------------------
+# Scratch list shared with parse_sub_category(), parse_category() and
+# parse_category_personals() further down; it must stay file-scoped.
+my @ary = ();
+
+#. Option A:
+$mech->get (URL2);
+die "Can't even get [" . URL2 . "] page: ",
+$mech->response->status_line unless $mech->success;
+$html = $mech->content;
+
+=comment
+#. Option B:
+my $fd = new FileHandle(CHTML."category.html") or die;
+($html) = (<$fd>);
+# $aohoa = $comm->get_category();
+=cut
+
+#-----------------------------------
+# 2. parse.
+#-----------------------------------
+# <div id="main">...enlish |
+my $h0 = trunc_category($html);
+
+# parse 8 category.
+my $h1 = parse_category($h0);
+
+# parse personals.
+my $h2 = parse_category_personals($h0);
+
+# put all 9 cateogry into array.
+$aohoa = [$h1, $h2];
+
+# $comm->write_log($aohoa, 'category');
+
+#-----------------------------------
+# 3. insert tables.
+#-----------------------------------
+# Each list starts with its 'c' (category) entry, followed by the 's'
+# (sub-category) entries that belong to it.
+my $category = 'dummy';
+foreach my $group (@{$aohoa}) {
+    foreach my $key (keys %$group) {
+        foreach my $t (@{ $group->{$key} }) {
+            my ($name, $url, $kind) = @{$t};
+            my $q_name = $dbh->quote($name);
+
+            # The URL used to be pasted between literal apostrophes;
+            # quote() escapes whatever it contains.
+            my $q_url = $dbh->quote($url);
+
+            $comm->write_log("[".$q_name.", ".$url.", ".$kind."]\n");
+            if ($kind eq 'c') {
+                $category = $q_name;
+                ++ $num1;
+                $sth=$dbh->do(qq{insert into } . CATEGORY . qq{(cname,curl,cdate) values($q_name,$q_url,now())});
+            }
+            elsif ($kind eq 's') {
+                ++ $num2;
+                $sth=$dbh->do(qq{insert into } . ITEM . qq{(iname,iurl,category,idate) values($q_name,$q_url,$category,now())});
+            }
+        }
+    }
+}
+
+#-----------------------------------
+# 4. double check data is stored.
+#-----------------------------------
+# select_category();
+show_results('select * from ' . CATEGORY . ' order by cname');
+show_results('select * from ' . ITEM . ' order by iname');
+
+#-----------------------------------
+# 5. clear up.
+#-----------------------------------
+$dbh->disconnect();
+# $fd->close() if ($fd);
+
+$end_time = time;
+$comm->write_log( "There are total [ $num1 ], [ $num2 ] records was processed succesfully!\n");
+$comm->write_log("Finally, there are total [ " . ($end_time - $start_time) . " ] s were used.\n");
+
+$comm->close_log();
+exit;
+
+#######################################
+
+# Return the part of $text that lies between the opening
+# <table summary="main" id="main"> tag and the last following
+# "english |" marker (case-insensitive), or undef when either is missing.
+# The result is kept in a lexical, so the file-level $html is no longer
+# overwritten as a side effect.
+sub trunc_category
+{
+    my ($text) = @_;
+    my ($main) = $text =~ m!
+        <table\ssummary="main"\sid="main">
+        (.*)
+        english\s+\|
+    !xsi;
+
+    return $main;
+}
+
+# Collect every linked <h4> category heading in $html together with the
+# sub-categories found in the <div> that follows it.
+# Returns a hash reference: category name => list whose first entry is
+# [ name, url, 'c' ] and whose remaining entries are whatever
+# parse_sub_category() left in the shared @ary.
+sub parse_category
+{
+    my ($html) = @_;
+    my %entries_for;
+
+    while ($html =~ m{
+        <h4>
+        (?:(?:&nbsp;)+?)
+        <a\s+href="(.*?)">
+        (.*?)
+        </a>
+        (?:.*?)
+        <div\s+class=(?:.*?)>
+        (.*?)
+        </div>
+        (?:.*?)
+    }sgix) {
+        my ($url, $name, $items_html) = ($1, $2, $3);
+
+        # Fills the shared @ary with the [ name, url, 's' ] entries.
+        parse_sub_category($items_html);
+
+        $url =~ s/&amp;/&/g;    #/cgi-bin/jobs.cgi?&amp;category=trd/
+
+        # Category entry first, then a copy of the collected entries.
+        $entries_for{$name} = [ [ $name, $url, 'c' ], @ary ];
+        @ary = ();
+    }
+    return \%entries_for;
+}
+
+# Find every <li><a href="...">name</a> in $html and append one
+# [ name, url, 's' ] entry per link to the shared @ary ("&amp;" in the
+# URL is turned back into "&").
+# Returns a reference to the list of entries added by this call, so a
+# caller that assigns the result gets something usable; callers that
+# only read @ary are unaffected.
+sub parse_sub_category
+{
+    my ($html) = @_;
+    my $aref = [];
+    while ($html =~ m{
+        <li>
+        <a\s+href="(.*?)">
+        (.*?)
+        </a>
+        (?:.*?)
+    }sgix) {
+        my ($url, $name) = ($1, $2);
+        $url =~ s/&amp;/&/g;
+        # $name =~ s/'/\'/g if ($name =~ m/'/); # 'skill\'d trade',
+        my $entry = [ $name, $url, 's' ];
+        push(@ary, $entry);
+        push(@{$aref}, $entry);
+    }
+    return $aref;
+}
+
+# Same job as parse_category(), for headings that are plain words instead
+# of links: <h4>&nbsp;word&nbsp;</h4> followed by a <div> of
+# sub-categories. The category entry therefore carries an empty URL.
+# Returns a hash reference: heading word => [ [ word, '', 'c' ], ...@ary ].
+sub parse_category_personals
+{
+    my ($html) = @_;
+    my %entries_for;
+
+    while ($html =~ m{
+        <h4>
+        (?:&nbsp;){1,}
+        (\w+)
+        (?:&nbsp;){1,}
+        </h4>
+        (?:.*?)
+        <div\s+class=(?:.*?)>
+        (.*?)
+        </div>
+        (?:.*?)
+    }sgix) {
+        my ($name, $items_html) = ($1, $2);
+
+        # Fills the shared @ary with the [ name, url, 's' ] entries.
+        parse_sub_category($items_html);
+
+        # Category entry first, then a copy of the collected entries.
+        $entries_for{$name} = [ [ $name, '', 'c' ], @ary ];
+        @ary = ();
+    }
+
+    return \%entries_for;
+}
+
+# deprecated, use show_results instead.
+# Dump every row of the CATEGORY table with Data::Dumper.
+# Works through the file-level $sth and @row.
+sub select_category
+{
+    my $sql = qq{select * from } . CATEGORY;
+
+    $sth = $dbh->prepare($sql);
+    $sth->execute();
+    print Dumper (@row) while (@row = $sth->fetchrow_array());
+}
+
+# Run $sql and print every row as "label: value" lines, one column per
+# line, with a blank line between rows. Labels are padded to the widest
+# column name. Ends with the number of columns in the result.
+# Works through the file-level $sth and @row.
+sub show_results
+{
+    my $sql = shift;
+
+    $sth = $dbh->prepare ($sql);
+    $sth->execute();
+
+    # Column names serve as labels; find the widest for alignment.
+    my @label = @{$sth->{NAME}};
+    my $label_width = 0;
+    foreach my $name (@label) {
+        my $len = length ($name);
+        $label_width = $len if $len > $label_width;
+    }
+
+    my $count = 0;    # rows printed so far
+    while (@row = $sth->fetchrow_array ()) {
+        # Separate this row from the previous one.
+        print "\n" if ++$count > 1;
+        foreach my $i (0 .. $sth->{NUM_OF_FIELDS} - 1) {
+            printf "%-*s", $label_width+1, $label[$i] . ":";
+            print " ", $row[$i] if defined ($row[$i]);
+            print "\n";
+        }
+    }
+    print "Total columns: [" . $sth->{NUM_OF_FIELDS} . "]\n";
+    $sth->finish();
+}
345 city.pl
@@ -0,0 +1,345 @@
+#! /cygdrive/c/Perl/bin/perl.exe
+##/usr/bin/perl -w
+# $Id$
+#Bug: manitoba,n brunswick, newf &lab, nova scotia, pei, territories
+
+use lib qw(../lib/);
+use craig_config;
+use common;
+
+use warnings;
+use strict;
+use Data::Dumper;
+use FileHandle;
+use WWW::Mechanize;
+use DBI;
+
+$| = 1;      # autoflush STDOUT
+undef $/;    # no input record separator: a later readline returns everything
+
+#-----------------------------------
+# 0. initialize:
+#-----------------------------------
+our @all = ('united states', 'canada', 'asia', 'americas', 'au/nz', 'africa', 'europe');
+#our @all = ('canada');
+our ($html, $aohoa) = ('', []);
+our ($num1, $num2, $start_time, $end_time) = (0,0,0,0);
+my ($mech, $dbh, $sth, @row) = (undef, undef, undef, ());
+
+my $comm = common->new() or die;
+my $log = $comm->get_filename(__FILE__) . '_'. $comm->get_time(1);
+$comm->set_log($log);
+$comm->write_log("\n[".$log."]: start at: [".localtime() . "].");
+$start_time = time;
+
+$mech = WWW::Mechanize->new( autocheck => 0 );
+
+my ($host, $user, $pass, $dsn) = (HOST, USER, PASS, DSN);
+$dsn .= ":hostname=$host";
+$dbh = DBI->connect($dsn, $user, $pass, { RaiseError=>1 });
+
+#-----------------------------------
+# 1. scrape:
+#-----------------------------------
+
+# Option A:
+$mech->get (URL1);
+die "Can't even get the home page: ",
+$mech->response->status_line unless $mech->success;
+$html = $mech->content;
+
+=comment
+# Option B:
+my $fd = new FileHandle(CHTML."country.html") or die;
+($html) = (<$fd>);
+=cut
+
+#-----------------------------------
+# 2. parse.
+#-----------------------------------
+# Which routine cuts each area's section out of the page.
+my %trunc_for = (
+    'united states' => \&trunc_us,
+    'canada'        => \&trunc_canada,
+    'asia'          => \&trunc_asia,
+    'americas'      => \&trunc_americas,
+    'au/nz'         => \&trunc_aunz,
+    'africa'        => \&trunc_africa,
+    'europe'        => \&trunc_europe,
+);
+
+foreach my $c (@all) {
+    my $trunc = $trunc_for{$c}
+        or die "exit from " . __LINE__ . "\n";
+
+    # Each entry is [ name, url, area ].
+    my $states = parse_country($c, $trunc->($html));
+
+    foreach my $state (@{$states}) {
+        # Every value goes through quote(): a name containing an
+        # apostrophe would otherwise break the statement.
+        my $q_name = $dbh->quote($state->[0]);
+        my $q_url  = $dbh->quote($state->[1]);
+        my $q_area = $dbh->quote($state->[2]);
+        $comm->write_log("[".$state->[0].", ".$q_url.", ".$state->[2]."]\n");
+        ++ $num1;
+
+        $sth=$dbh->do(qq{insert ignore into } . COUNTRY_STATE . qq{(sname,surl,area,sdate) values($q_name,$q_url,$q_area,now()) });
+    }
+
+    foreach my $state (@{$states}) {
+        my ($state_name, $state_url) = @{$state};
+
+        # bug: 2 'new york' in the screen. # $mech->follow_link(text => $state_name);
+        # $mech->follow_link(url => $state_url);
+        # $mech->success or die $mech->response->status_line;
+
+        $mech->get($state_url);
+
+        if ($mech->success) {
+            # Here bugs: no single city parsed, such as newf&lab->st john's,NL
+            # will fix later.
+            # Drop links that are not cities (site chrome, forums, wiki).
+            my @cities =
+                grep { $_->[1] !~ m/(forums|wiki)/ }
+                grep { $_->[0] !~ m/(craigslist|or suggest a new one)/ }
+                @{ parse_html($c, $state_name, $mech->content) };
+
+            # Each entry is [ name, url, state, area ].
+            foreach my $place (@cities) {
+                my $q_name  = $dbh->quote($place->[0]);
+                my $q_url   = $dbh->quote($place->[1]);
+                my $q_area1 = $dbh->quote($place->[2]);
+                my $q_area2 = $dbh->quote($place->[3]);
+                $comm->write_log("[".$q_name.", ".$q_url.", ".$place->[2].", ".$place->[3]."]\n");
+                ++ $num2;
+
+                $sth=$dbh->do(qq{ insert ignore into } . CITY . qq{(cname,curl,area1,area2,cdate) values($q_name,$q_url,$q_area1,$q_area2,now()) });
+            }
+        }
+        else {
+            # Do not mine an error page for city links.
+            $comm->write_log("Skipped <".$state_url.">: ".$mech->response->status_line."\n");
+        }
+
+        $mech->back();
+    }
+}
+
+#-----------------------------------
+# 4. double check data is stored.
+#-----------------------------------
+show_results('select * from '.COUNTRY_STATE.' order by area');
+show_results('select * from '.CITY. ' order by area2, area1');
+
+#-----------------------------------
+# 5. clear up.
+#-----------------------------------
+$dbh->disconnect();
+# $fd->close() if ($fd);
+
+$end_time = time;
+$comm->write_log( "There are total [ $num1 ], [ $num2 ] records was processed succesfully!\n");
+$comm->write_log("Finally, there are total [ " . ($end_time - $start_time) . " ] s were used.\n");
+$comm->close_log();
+
+exit;
+
+
+#######################################
+
+# Shared worker for the trunc_* routines below.
+# Returns the text between the first match of $start_re and the LAST
+# match of $end_re that follows it (the middle part is greedy), or undef
+# when the pair is not found. Both patterns must carry their own flags:
+# an interpolated qr// does not inherit flags from the enclosing match.
+sub _extract_between
+{
+    my ($text, $start_re, $end_re) = @_;
+    my ($extract_html) = $text =~ m!
+        $start_re
+        (.*)
+        $end_re
+    !xsi;
+    return $extract_html;
+}
+
+# Each trunc_* routine cuts one area's block of links out of the page
+# text. The two markers are matched case-insensitively; the result is
+# undef when they are not found.
+
+# United States: after "wash dc</a>", up to "alberta".
+sub trunc_us
+{
+    my ($text) = @_;
+    return _extract_between($text, qr{wash\sdc</a>}si, qr{alberta}si);
+}
+
+# Canada: after "wyoming</a>", up to "ca cities".
+sub trunc_canada
+{
+    my ($text) = @_;
+    return _extract_between($text, qr{wyoming</a>}si, qr{ca\scities}si);
+}
+
+# Canada, alternative end marker: after "wyoming</a>", up to "bangladesh".
+sub trunc_canada_1
+{
+    my ($text) = @_;
+    return _extract_between($text, qr{wyoming</a>}si, qr{bangladesh}si);
+}
+
+# Asia: after "more ..</a>" (the two dots match any character, as
+# before), up to "au/nz".
+sub trunc_asia
+{
+    my ($text) = @_;
+    return _extract_between($text, qr{more\s..</a>}si, qr{au/nz}si);
+}
+
+# Australia / New Zealand: after "au/nz</a>", up to "argentina".
+sub trunc_aunz
+{
+    my ($text) = @_;
+    return _extract_between($text, qr{au/nz</a>}si, qr{argentina}si);
+}
+
+# Americas: after "new zealand</a>", up to "africa".
+sub trunc_americas
+{
+    my ($text) = @_;
+    return _extract_between($text, qr{new\szealand</a>}si, qr{africa}si);
+}
+
+# Africa: after "africa", up to "austria".
+sub trunc_africa
+{
+    my ($text) = @_;
+    return _extract_between($text, qr{africa}si, qr{austria}si);
+}
+
+# Europe: after "tunisia</a>", up to "amsterdam".
+sub trunc_europe
+{
+    my ($text) = @_;
+    return _extract_between($text, qr{tunisia</a>}si, qr{amsterdam}si);
+}
+# Turn every <a href="url">name</a> in $text into [ name, url, $c ],
+# with "&amp;" turned back into "&" in both name and url.
+# Returns a reference to the list, in document order (empty when there
+# are no links).
+sub parse_country
+{
+    my ($c, $text) = @_;
+    my @links;
+
+    while ($text =~ m{
+        <a\s+href="(.*?)">
+        (.*?)
+        </a>
+        (?:.*?)
+    }sgix) {
+        my ($url, $name) = ($1, $2);
+        s/&amp;/&/g for ($url, $name);
+        push (@links, [ $name, $url, $c ]);
+    }
+    return \@links;
+}
+
+# Turn every <a href="url">name</a> in $text into [ name, url, $s, $c ].
+# "&amp;" becomes "&" in name and url; when the name contains <b>, the
+# <b> and </b> tags are removed from it.
+# Returns a reference to the list, or to an empty list when the text
+# contains the "event calendar" / "Ereigniskalender" markers.
+sub parse_html
+{
+    my ($c, $s, $text) = @_;
+
+    # this parse is only used by about/sites and its following for city/country.
+    foreach my $marker (qr#href="cal/">event calendar</a>#, qr#Ereigniskalender#) {    # 2nd: austria
+        return [] if ($text =~ $marker);
+    }
+
+    my @links;
+    while ($text =~ m{
+        <a\s+href="(.*?)">
+        (.*?)
+        </a>
+        (?:.*?)
+    }sgix) {
+        # 'newf &amp; lab' => 'geo.craigslist.org/iso/ca/nl'
+        my ($url, $name) = ($1, $2);
+        s/&amp;/&/g for ($url, $name);
+
+        # Closing tags are only stripped when an opening <b> is present.
+        if (index($name, '<b>') >= 0) {
+            $name =~ s/<b>//g;
+            $name =~ s/<\/b>//g;
+        }
+        push (@links, [ $name, $url, $s, $c ]);
+    }
+    return \@links;
+}
+
+# Dump every row of the COUNTRY_STATE table, ordered by area, with
+# Data::Dumper. Works through the file-level $sth.
+sub select_country_state
+{
+    my $sql = qq{select * from } . COUNTRY_STATE . qq{ order by area};
+
+    $sth = $dbh->prepare($sql);
+    $sth->execute();
+
+    my @row = ();
+    print Dumper (@row) while (@row = $sth->fetchrow_array());
+}
+
+# Dump every row of the CITY table with Data::Dumper.
+# Works through the file-level $sth.
+sub select_city
+{
+    my $sql = qq{select * from }.CITY;
+
+    $sth = $dbh->prepare($sql);
+    $sth->execute();
+
+    my @row = ();
+    print Dumper (@row) while (@row = $sth->fetchrow_array());
+}
+
+# Run $sql and print every row as "label: value" lines, one column per
+# line, with a blank line between rows. Labels are padded to the widest
+# column name. Ends with the number of columns in the result.
+# Works through the file-level $sth.
+sub show_results
+{
+    my $sql = shift;
+
+    $sth = $dbh->prepare ($sql);
+    $sth->execute ();
+
+    # Column names serve as labels; find the widest for alignment.
+    my @label = @{$sth->{NAME}};
+    my $label_width = 0;
+    foreach my $name (@label) {
+        my $len = length ($name);
+        $label_width = $len if $len > $label_width;
+    }
+
+    my $count = 0;    # rows printed so far
+    while (my @values = $sth->fetchrow_array ()) {
+        # Separate this row from the previous one.
+        print "\n" if ++$count > 1;
+        foreach my $i (0 .. $sth->{NUM_OF_FIELDS} - 1) {
+            printf "%-*s", $label_width+1, $label[$i] . ":";
+            print " ", $values[$i] if defined ($values[$i]);
+            print "\n";
+        }
+    }
+    print "Total columns: [" . $sth->{NUM_OF_FIELDS} . "]\n";
+    $sth->finish ();
+}
12 clear_log.sh
@@ -0,0 +1,12 @@
+#! /bin/bash
+
+cd $HOME/craig/logs/
+s1=`date +'%y%m%d'`
+s2=`expr $s1 - 3`
+for i in `ls *$s2*.log 2>/dev/null`
+do
+ /bin/rm -f $i 2>/dev/null
+done
+
+cd $HOME/craig/logs/
+find . -size 0 -mtime +1 -exec rm -f {} \;
497 craig.pm
@@ -0,0 +1,497 @@
+package craig;
+
+# $Id$
+# craig inherit ../lib/common.pm
+
+use lib qw(../lib);
+use craig_config;
+use common;
+@ISA = qw(common);
+use strict;
+our ( $dbh, $sth );
+
+# Constructor.
+#   $type       - class name to bless into
+#   $dbh_handle - database handle, stored under {dbh}
+# Returns the new object; {app} is always 'craig'.
+sub new {
+    my $type       = shift;
+    my $dbh_handle = shift;
+
+    my %fields = (
+        dbh => $dbh_handle,
+        app => 'craig',
+    );
+    return bless {%fields}, $type;
+}
+# <blockquote><p><a href=/web>Continue to web / info design job postings</a></p>
+# Pull the href of the first link found inside a <blockquote>...<p><a ...>
+# construct (the "Continue to ... postings" interstitial page).
+#
+#   $html - page source
+# Returns the href with any double quotes removed, or '' when the page has
+# no such link.
+sub parse_cgi_page {
+    my ( $self, $html ) = @_;
+
+    my ($href) = $html =~ m{
+        <blockquote>
+        (?:.*?)
+        <p>
+        <a(?:.*?)href=(.*?)>
+        (?:.*?)
+        </a>
+        (?:.*?)
+        </blockquote>
+    }six;
+    return '' unless defined $href;
+
+    $href =~ tr/"//d;
+    return $href;
+}
+
+# return 'today' and Only today's data in a page.
+# Return the first <h4> heading of a listing page together with the markup
+# that follows it, up to (not including) the next <h4>, </body> or
+# <p align="center">.
+#
+#   $html - page source
+# Returns [ $heading, $markup ]; both elements are undef when the page has
+# no <h4> section. Later sections never contribute to the result, so only
+# the first match is looked at.
+sub parse_today {
+    my ( $self, $html ) = @_;
+
+    my ( $heading, $markup ) = $html =~ m{
+        <h4>
+        (.*?)
+        </h4>
+        (.*?)
+        (?=<h4>|</body>|<p\salign="center">)
+    }six;
+
+    return [ $heading, $markup ];
+}
+
+# (?:<h4>$t1\s$t2\s$t3</h4>|<p\s+align="center"|</body>)
+# http://auburn.craigslist.org/search/crg?query=+
+# Cut a listing page down to the postings newer than a given day.
+#
+#   $date - cut-off day as three words, e.g. "Thu 25 Mar"; may be empty
+#   $html - page source
+#
+# Returns the markup that starts at the first <h4 class="ban"> and stops just
+# before the first of: the <h4> heading for $date, <div id="footer", </body>.
+# Returns undef when the page contains no such section.
+#
+# The result is captured explicitly: after a failed match $1 still holds the
+# capture of the previous successful match (the first word of $date), which
+# used to be handed back as if it were page content.
+sub parse_date {
+    my ( $self, $date, $html ) = @_;
+    my $chunk;
+    return $chunk unless defined $html;
+
+    # The page footer and the end of the body always terminate the section.
+    my @stop = ( q{<div\sid="footer"}, q{</body>} );
+
+    # The dated heading is only a terminator when a date was really supplied.
+    if ( defined $date && $date =~ m/(\w+)\s(\w+)\s(\w+)/ ) {
+        unshift @stop, "<h4>\Q$1\E\\s\Q$2\E\\s\Q$3\E</h4>";
+    }
+    my $stop_re = join '|', @stop;
+
+    if ( $html =~ m{
+            (?=<h4\sclass="ban">)
+            (.*?)
+            (?:$stop_re)
+        }six )
+    {
+        $chunk = $1;
+    }
+    return $chunk;
+}
+# Return the markup between the first two "<div>sort by" markers of a page.
+#
+#   $html - page source
+# Returns the enclosed markup, or undef when the page does not contain two
+# such markers. The capture is read only after a successful match; a bare
+# `return $1` would leak the capture of some earlier, unrelated match.
+sub parse_gigs_html {
+    my ( $self, $html ) = @_;
+    my $chunk;
+    return $chunk unless defined $html;
+
+    if ( $html =~ m{
+            <div>
+            sort\sby
+            (.*?)
+            <div>
+            sort\sby
+        }six )
+    {
+        $chunk = $1;
+    }
+    return $chunk;
+}
+# Extract every posting row from a listing page.
+#
+#   $html - listing markup
+# Returns an array ref with one
+#   [ posting url, keywords, location, item url, item ]
+# entry per posting, in page order.
+sub parse_main {
+    my ( $self, $html ) = @_;
+    my @rows;
+
+    while (
+        $html =~ m{
+            <p>
+            <a\s+href="(.*?)">     # keywords url.
+            (.*?)                  # keywords.
+            </a>
+            (.*?)                  # location.
+            <small\sclass="gc">
+            <a\s+href="(.*?)">     # item url.
+            (.*?)                  # item
+            </a>
+            (?:.*?)
+            </small>
+        }sgix
+      )
+    {
+        my ( $kw_url, $kw, $loc, $item_url, $item ) = ( $1, $2, $3, $4, $5 );
+
+        # keywords: trimmed, " -" separators dropped
+        $kw = $self->trim($kw);
+        $kw =~ s/\s+-//g;
+
+        # location: tags and the first dash removed, then the surrounding
+        # parentheses
+        $loc =~ s/\<.*?\>//g;
+        $loc =~ s/-//;
+        $loc = $self->trim($loc);
+        $loc =~ s/^\(//;
+        $loc =~ s/\)$//;
+
+        # item: both entities become a plain space
+        $item =~ s/&nbsp;/ /g;
+        $item =~ s/&amp;/ /g;
+
+        push @rows, [ $kw_url, $kw, $loc, $item_url, $item ];
+    }
+    return \@rows;
+}
+
+# Extract the postings of an item listing (<p> ... <a href>title</a> ... </p>).
+#
+#   $html - listing markup; an empty/false value yields an empty list
+# Returns an array ref of [ url, keywords, location ] entries.
+sub parse_item_main {
+    my ( $self, $html ) = @_;
+    return [] unless $html;
+
+    my @rows;
+    while (
+        $html =~ m{
+            <p>
+            (?:.*?)                # for gigs: <p>Jun 9 - <a href="...">
+            <a\s+href="(.*?)">     # keywords url.
+            (.*?)                  # keywords.
+            </a>
+            (.*?)
+            </p>
+        }sgix
+      )
+    {
+        my ( $url, $keywords, $location ) = ( $1, $2, $3 );
+
+        $keywords = $self->trim($keywords);
+
+        # strip tags, the first dash and the enclosing parentheses
+        $location =~ s/\<.*?\>//g;
+        $location =~ s/-//;
+        $location = $self->trim($location);
+        $location =~ s/^\(//;
+        $location =~ s/\)$//;
+
+        push @rows, [ $url, $keywords, $location ];
+    }
+    return \@rows;
+}
+
+# Find the "next page" link of a listing: the first <a href> nested directly
+# in <p align="center"><font size="4">.
+#
+#   $html - listing markup
+# Returns the href, or '' when $html is empty or has no such link.
+sub parse_next_page {
+    my ( $self, $html ) = @_;
+    return '' unless $html;
+
+    my ($next) = $html =~ m{
+        <p\salign="center">
+        <font\ssize="4">
+        <a\shref="(.*?)">
+        (?:.*?)
+        </a>
+        </font>
+    }six;
+
+    return defined $next ? $next : '';
+}
+
+# <a href="http://www.hostelcareers.ca/job-search/hi-banff-alpine-centre/activities-coordinator/263" rel="nofollow">
+# www.hostelcareers.ca<a rel="nofollow">
+# Parse a posting page ("Date: ... Reply to: ... <div id="userbody">").
+#
+#   $html - posting markup
+# Returns a flat list with six values per matched posting:
+#   ( date, reply-to email, phone, web, relevant text, email found in body )
+# and an empty list when the page does not match.
+sub parse_detail {
+    my ( $self, $html ) = @_;
+    my @fields;
+
+    while (
+        $html =~ m{
+            Date:
+            (.*?)                  # Date
+            <br
+            (?:.*?)
+            Reply\s+to:
+            (.*?)                  # Email
+            <div\sid="userbody">
+            (.*?)                  # Content
+            </div>
+        }sgix
+      )
+    {
+        my ( $date, $reply_to, $body ) = ( $1, $2, $3 );
+
+        # trim, drop the comma, drop the trailing word
+        for ($date) {
+            s/^\s+//;
+            s/\s+$//;
+            s/,\s+/ /;
+            s/ \w+$//;
+        }
+
+        my $email    = $self->get_email($reply_to);
+        my $web      = $self->get_web($body);
+        my $phone    = $self->get_phone($body);
+        my $stripped = $self->strip_craig_userbody($body);
+        my $email1   = $self->get_email_1($stripped);
+        my $relevant = $self->get_relevant($stripped);
+
+        push @fields, $date, $email, $phone, $web, $relevant, $email1;
+    }
+    return @fields;
+}
+
+# How to get SUPER::get_email of common.pm ?
+# Find the first thing in $html that looks like an e-mail address
+# (word/dot/dash characters on both sides of an "@").
+#
+# Returns '' for empty input, the address when one is found, undef otherwise.
+sub get_email_1
+{
+    my ($self, $html) = @_;
+    return '' unless $html;
+
+    my $found;
+    if ($html =~ m{\b([\w\.\-]+@[\w\.\-]+)\b}s) {
+        $found = $1;
+    }
+    return $found;
+}
+
+# Reduce the "Reply to:" fragment of a posting to the bare address text.
+#
+#   $str - markup following "Reply to:"
+# Returns '' unless the fragment contains an "@" (this covers the
+# "see below" case too); otherwise the text between the first <a ...> tag and
+# its </a>, trimmed.
+sub get_email {
+    my ( $self, $str ) = @_;
+    return '' unless $str;
+    return '' unless $str =~ m/\@/;
+
+    $str =~ s/\<a.*?>//s;
+    $str =~ s/<\/a>.*$//s;    # </a>
+    my $address = $self->trim($str);
+    return $address;
+}
+
+# Deprecated from Jun 07,2010.
+# Older posting-page parser that expects the reply address inside an <a> tag.
+#
+#   $html - posting markup
+# Returns a flat list with five values per matched posting:
+#   ( date, email, phone, web, relevant text )
+sub parse_detail_1 {
+    my ( $self, $html ) = @_;
+    my @fields;
+
+    while (
+        $html =~ m{
+            Date:
+            (.*?)                  # Date
+            <br
+            (?:.*?)
+            Reply\s+to:
+            (?:.*?)
+            <a\s(?:.*?)>
+            (.*?)                  # email
+            </a>
+            (?:.*?)
+            <div\sid="userbody">
+            (.*?)                  # content
+            </div>
+        }sgix
+      )
+    {
+        my ( $date, $email, $body ) = ( $1, $2, $3 );
+
+        # trim, drop the comma, drop the trailing word
+        for ($date) {
+            s/^\s+//;
+            s/\s+$//;
+            s/,\s+/ /;
+            s/ \w+$//;
+        }
+
+        my $web      = $self->get_web($body);
+        my $phone    = $self->get_phone($body);
+        my $stripped = $self->strip_craig_userbody($body);
+        my $relevant = $self->get_relevant($stripped);
+
+        push @fields, $date, $email, $phone, $web, $relevant;
+    }
+    return @fields;
+}
+# Parse a resume posting, which has no "Reply to:" header: the address is
+# looked up in the cleaned body text instead.
+#
+#   $html - posting markup
+# Returns a flat list with five values per matched posting:
+#   ( date, email, phone, web, relevant text )
+sub parse_detail_resumes {
+    my ( $self, $html ) = @_;
+    my @fields;
+
+    while (
+        $html =~ m{
+            Date:
+            (.*?)                  # Date
+            <br
+            (?:.*?)
+            <div\sid="userbody">
+            (.*?)                  # content
+            </div>
+        }sgix
+      )
+    {
+        my ( $date, $body ) = ( $1, $2 );
+
+        # trim, drop the comma, drop the trailing word
+        for ($date) {
+            s/^\s+//;
+            s/\s+$//;
+            s/,\s+/ /;
+            s/ \w+$//;
+        }
+
+        my $web      = $self->get_web($body);
+        my $phone    = $self->get_phone($body);
+        my $stripped = $self->strip_craig_userbody($body);
+        my $relevant = $self->get_relevant($stripped);
+        my $email    = $self->get_email_1($relevant);
+
+        push @fields, $date, $email, $phone, $web, $relevant;
+    }
+    return @fields;
+}
+
+# Names of all cities stored with area2='canada'.
+# Returns the fetchall_arrayref() result: one single-column row per city.
+sub select_ca_cities {
+    my ($self) = @_;
+
+    my $sql = q{select cname from } . CITY . qq{ where area2='canada'};
+    $sth = $self->{dbh}->prepare($sql);
+    $sth->execute();
+    my $rows = $sth->fetchall_arrayref();
+    $sth->finish();
+
+    return $rows;
+}
+
+# Look up the URL (curl) of a Canadian city by name.
+#
+#   $city - city name; escaped with $dbh->quote before it enters the SQL
+# Returns the first column of the first matching row, undef if none.
+sub select_city {
+    my ( $self, $city ) = @_;
+
+    my $quoted = $self->{dbh}->quote($city);
+    my $sql    = q{ select curl from } . CITY
+      . qq{ where cname=$quoted and area2='canada' };
+
+    $sth = $self->{dbh}->prepare($sql);
+    $sth->execute();
+    my ($curl) = $sth->fetchrow_array();
+    $sth->finish();
+
+    return $curl;
+}
+
+# Names of all cities stored with area2='united states'.
+# Returns the fetchall_arrayref() result: one single-column row per city.
+sub select_us_cities {
+    my ($self) = @_;
+
+    my $sql = q{select cname from } . CITY . qq{ where area2='united states'};
+    $sth = $self->{dbh}->prepare($sql);
+    $sth->execute();
+    my $rows = $sth->fetchall_arrayref();
+    $sth->finish();
+
+    return $rows;
+}
+
+# Names of the selected ('Y') items of a category, ordered by name.
+#
+#   $category - category name; 'resumes' is answered without a query
+# Returns an array ref of single-column rows ([ ['resumes'] ] for resumes).
+sub select_items {
+    my ( $self, $category ) = @_;
+
+    return [ ['resumes'] ] if $category eq 'resumes';
+
+    my $quoted = $self->{dbh}->quote($category);
+    my $sql    = q{select iname from } . ITEM
+      . qq{ where category=$quoted and selected='Y' order by iname};
+
+    $sth = $self->{dbh}->prepare($sql);
+    $sth->execute();
+    my $rows = $sth->fetchall_arrayref();
+    $sth->finish();
+
+    return $rows;
+}
+
+# Look up the URL (curl) of a US city by name.
+#
+#   $city - city name; escaped with $dbh->quote before it enters the SQL
+# Returns the first column of the first matching row, undef if none.
+sub select_us_city {
+    my ( $self, $city ) = @_;
+
+    my $quoted = $self->{dbh}->quote($city);
+    my $sql    = q{ select curl from } . CITY
+      . qq{ where cname=$quoted and area2='united states' };
+
+    $sth = $self->{dbh}->prepare($sql);
+    $sth->execute();
+    my ($curl) = $sth->fetchrow_array();
+    $sth->finish();
+
+    return $curl;
+}
+
+# Resolve a name to its URL fragment.
+#
+#   $item     - category or item name
+#   $category - category the item belongs to (only used by the fallback)
+#
+# First looks for a category called $item and returns its curl. If there is
+# none, returns the iurl of the selected ('Y') item $item in $category, or
+# undef when that does not exist either.
+#
+# Both names are bound as placeholders: interpolated into the statement, a
+# name containing a quote (e.g. "st john's") broke the SQL.
+sub select_category {
+    my ( $self, $item, $category ) = @_;
+    my $db = $self->{dbh};
+
+    $sth = $db->prepare( q{ select curl from } . CATEGORY . q{ where cname=? } );
+    $sth->execute($item);
+    my ($curl) = $sth->fetchrow_array();
+    $sth->finish();
+    return $curl if $curl;
+
+    # A missing category used to be compared as the empty string; keep that.
+    $category = '' unless defined $category;
+
+    $sth = $db->prepare( q{ select iurl from } . ITEM
+          . q{ where iname=? and category = ? and selected='Y' } );
+    $sth->execute( $item, $category );
+    my ($iurl) = $sth->fetchrow_array();
+    $sth->finish();
+    return $iurl;
+}
+
+# Show the TOPIC rows whose keywords contain $k and whose email contains $e.
+#
+#   $k - keyword fragment
+#   $e - e-mail fragment
+#
+# The LIKE patterns go through $dbh->quote so that quotes in the search
+# terms cannot break out of the string literal.
+sub select_keywords_email {
+    my ( $self, $k, $e ) = @_;
+    my $db = $self->{dbh};
+
+    my $sql =
+        "select * from "
+      . TOPIC
+      . " where keywords like "
+      . $db->quote( '%' . $k . '%' )
+      . " and email like "
+      . $db->quote( '%' . $e . '%' );
+    $self->show_results($sql);
+}
+
+# Show the TOPIC rows whose keywords contain $k.
+# The LIKE pattern goes through $dbh->quote so that quotes in the search
+# term cannot break out of the string literal.
+sub select_keywords {
+    my ( $self, $k ) = @_;
+    my $sql = "select * from " . TOPIC . " where keywords like "
+      . $self->{dbh}->quote( '%' . $k . '%' );
+    $self->show_results($sql);
+}
+
+# Show the TOPIC rows whose email contains $e.
+# The LIKE pattern goes through $dbh->quote so that quotes in the search
+# term cannot break out of the string literal.
+sub select_email {
+    my ( $self, $e ) = @_;
+    my $sql = "select * from " . TOPIC . " where email like "
+      . $self->{dbh}->quote( '%' . $e . '%' );
+    $self->show_results($sql);
+}
+
+# Flatten the HTML body of a posting into a single line of text.
+#
+#   $html - posting body markup
+# Returns nothing for empty input, otherwise the cleaned text. The order of
+# the substitutions matters: block tags first become line breaks, then
+# leading non-word characters are removed per line, and only then are the
+# lines joined.
+sub get_relevant {
+    my ( $self, $html ) = @_;
+
+    return unless $html;
+
+    for ($html) {
+        # everything from the craigslist tag block onwards is noise
+        s/<!-- START CLTAGS -->.*$//si;
+
+        # block-level markup: turn into line breaks or drop
+        s/(<br>|<br\s*\/>)/\n/g;
+        s/<div.*?>//g;
+        s/<p>/\n/g;
+        s/<ul.*?>/\n/g;
+        s/<\/div>//g;
+        s/<\/p>//g;
+        s/<\/ul>//g;
+        s/<li>//g;
+        s/<\/li>/;/g;
+
+        # images, <font> and <a> elements go away together with their content
+        s/<img.*?>//sg;
+        s/\<font.*?\<\/font\>//sg;
+        s/\<a\s.*?\<\/a\>//sg;
+        s/<br>//sg;
+        s/<b>//sg;
+
+        # per-line cleanup, then collapse into one line
+        s/^\W+//mg;
+        s/\n/ /g;
+        s/&bull;/ /g;
+        s/&sdot;/ /g;
+        s/\s{2,}/ /g;
+    }
+    return $html;
+
+    # Disabled alternative kept for reference: cap the text at 252
+    # characters and append '...' when it is longer than 253.
+}
+
+# Cut a posting body at the craigslist "GeographicArea" tag comment.
+#
+#   $html - posting body markup
+# Returns $html without the "<!-- CLTAG GeographicArea=..." comment and
+# everything after it; input without that tag (or undef) is returned as is.
+#
+# The area code is no longer hard-wired: the guard already accepted any
+# GeographicArea tag, but the substitution only removed "=NW", so bodies from
+# every other area kept their tag block.
+sub strip_craig_userbody {
+    my ( $self, $html ) = @_;
+    return $html unless defined $html;
+
+    $html =~ s/<!-- CLTAG GeographicArea=.*$//si;
+    return $html;
+}
+
+# Thu 25 Mar
+# Ask the database for the date $todate days ago in the Canadian listing
+# format "%a %d %b" (e.g. "Thu 25 Mar").
+#
+#   $todate - number of days to go back; must be a non-negative integer
+# Returns the formatted date, or undef (with a warning) when $todate is not
+# a plain number. The value comes from the command line and is placed inside
+# the statement, so anything but digits is refused.
+sub get_end_date {
+    my ( $self, $todate ) = @_;
+
+    unless ( defined $todate && $todate =~ m/\A\s*(\d+)\s*\z/ ) {
+        warn "get_end_date: day count must be a non-negative integer\n";
+        return undef;
+    }
+    my $days = $1;
+
+    my $sth =
+      $self->{dbh}->prepare( qq{ select date_format(date_sub(now(), interval }
+          . $days
+          . qq{ day), '%a %d %b' ) } );
+    $sth->execute();
+    my @row = $sth->fetchrow_array();
+    $sth->finish();
+    return $row[0];
+}
+# US uses: Thu Jun 03; while Canada uses: Mon 07 Jun
+# Ask the database for the date $todate days ago in the US listing format
+# "%a %b %d" (e.g. "Thu Jun 03").
+#
+#   $todate - number of days to go back; must be a non-negative integer
+# Returns the formatted date, or undef (with a warning) when $todate is not
+# a plain number. The value comes from the command line and is placed inside
+# the statement, so anything but digits is refused.
+sub get_us_end_date {
+    my ( $self, $todate ) = @_;
+
+    unless ( defined $todate && $todate =~ m/\A\s*(\d+)\s*\z/ ) {
+        warn "get_us_end_date: day count must be a non-negative integer\n";
+        return undef;
+    }
+    my $days = $1;
+
+    my $sth =
+      $self->{dbh}->prepare( qq{ select date_format(date_sub(now(), interval }
+          . $days
+          . qq{ day), '%a %b %d' ) } );
+    $sth->execute();
+    my @row = $sth->fetchrow_array();
+    $sth->finish();
+    return $row[0];
+}
+
+1;
30 db.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# $Id$
+
+MYSQL="mysql -u craig -pwilliam -D craig"
+
+echo "1. craigslist_item:"
+$MYSQL <<"EOT"
+select count(*) "TOTAL RECORDS: " from craigslist_item;
+EOT
+
+echo "2. craigslist_category:"
+$MYSQL <<"EOT"
+select count(*) "TOTAL RECORDS: " from craigslist_category;
+EOT
+
+echo "3. craigslist_country_state:"
+$MYSQL <<"EOT"
+select count(*) "TOTAL RECORDS: " from craigslist_country_state;
+EOT
+
+echo "4. craigslist_city:"
+$MYSQL <<"EOT"
+select count(*) "TOTAL RECORDS: " from craigslist_city;
+EOT
+
+echo "5. craigslist_topic:"
+$MYSQL <<"EOT"
+select count(*) "TOTAL RECORDS: " from craigslist_topic;
+EOT
+
6 env.pl
@@ -0,0 +1,6 @@
+#! /usr/bin/perl -w
+
+# Dump the process environment, one "NAME => value" line per variable.
+print "$_ => $ENV{$_}\n" for keys %ENV;
+
+
220 housing.pl
@@ -0,0 +1,220 @@
+#! /cygdrive/c/Perl/bin/perl.exe
+
+use lib qw(../lib/);
+use craig_config;
+use db;
+use craig;
+
+use warnings;
+use strict;
+use Data::Dumper;
+use FileHandle;
+use WWW::Mechanize;
+use DBI;
+use Getopt::Long;
+
+# Unbuffered output; undefined $/ makes every file read a slurp.
+local($|) = 1;
+undef $/;
+
+#-----------------------------------
+# 0. initialize:
+#-----------------------------------
+# Counters and timing.
+our ($num, $start_time, $end_time) = (0,0,0);
+# Listing URL to start from, URL of the page being processed, days to go back.
+our ($start_url, $page_url, $todate) = (URL6, undef, INTERVAL_DATE);
+our ($end_date, $today) = ('', []);
+our ($mech, $db, $craig, $log) = (undef, undef);
+my ($dbh, $sth);
+
+$start_time = time;
+
+# Database connection; the craig object shares the handle.
+my ($host, $user, $pass, $dsn) = (HOST, USER, PASS, DSN);
+$dsn .= ":hostname=$host";
+$db = db->new($user, $pass, $dsn) or die;
+$dbh = $db->{dbh};
+
+$craig = craig->new($db->{dbh}) or die;
+
+# Log file named after this script.
+$log = $craig->get_filename(__FILE__);
+$craig->set_log($log);
+$craig->write_log("\n[" . $log . "]: start at: [" . localtime() . "].");
+
+# Option defaults.
+my ($city, $item) = (DEFAULT_CITY, 'housing');
+my ($keywords, $email);
+my ($first, $help, $version);
+
+# Parse the command line; any unknown switch prints the usage text.
+# --version is a plain flag: declared as 'version=s' it demanded an argument,
+# so the documented bare "-v" was rejected and ended in usage().
+usage() unless (GetOptions(
+    'first'      => \$first,
+    'todate=s'   => \$todate,
+    'city=s'     => \$city,
+    'item=s'     => \$item,
+    'keywords=s' => \$keywords,
+    'email=s'    => \$email,
+    'version'    => \$version,
+    'help|?'     => \$help
+));
+
+usage() if $help;
+
+# -f: list the Canadian cities known to the database, one per line, and stop.
+if ($first) {
+    for my $row (@{ $craig->select_ca_cities() }) {
+        print $_ . "\n" for @{$row};
+    }
+    exit 1;
+}
+
+# -v: print the version banner and stop.
+if ($version) {
+    print "\n$0: Version 2.0\n";
+    exit 2;
+}
+
+# date +'%a %d %b' -d "2 day ago"
+$end_date = $craig->get_end_date($todate) if $todate;
+
+# Build the start URL from the city URL and the category/item URL.
+if ($city && $item) {
+    my $city_url = $craig->select_city($city);
+    die "No such city: <".$city.">, $0 quit." unless ($city_url);
+
+    my $cat_url = $craig->select_category($item);
+    die "No such category: <".$item.">, $0 quit." unless ($cat_url);
+
+    $start_url = $city_url . $cat_url;
+    $craig->write_log("URL: <".$start_url.">.");
+    print $start_url . "\n";
+}
+
+# Optional look-ups in the data already stored.
+if ($keywords) {
+    if ($email) {
+        $craig->select_keywords_email($keywords, $email);
+    }
+    else {
+        $craig->select_keywords($keywords);
+    }
+}
+elsif ($email) {
+    $craig->select_email($email);
+}
+
+# autocheck off: failed requests are checked by hand via $mech->success.
+$mech = WWW::Mechanize->new( autocheck => 0 );
+
+$page_url = $start_url;
+
+# Scrape loop: fetch a listing page, keep the part newer than $end_date,
+# visit every posting on it and store the extracted data in TOPIC, then go on
+# with the next listing page until there is none.
+#
+# Changes from the goto-based version:
+#  * $num was incremented twice for every stored posting (once before the
+#    request and once in the log line); it now counts stored postings only.
+#  * post_time, email and category were pasted into the INSERT unescaped;
+#    every value is now bound through a placeholder.
+#  * follow_link() returns undef when the link is not on the page; this is
+#    now detected instead of parsing the listing page and calling back() on
+#    it. A request that was made but failed is undone with back() so the
+#    next posting is looked up on the listing page again.
+my $insert_sql = qq{ insert ignore into }.TOPIC.qq{
+    (url,keywords,relevant,location,item_url,item,post_time,email,phone,
+    web,city,category,date)
+    values(?,?,?,?,?,?,?,?,?,?,?,?,now())};
+
+PAGE:
+while (1) {
+    $mech->get($page_url);
+    $mech->success or die $mech->response->status_line;
+    my $html = $mech->content;
+
+    # Only parse data before $end_date.
+    my $ht = $craig->parse_date($end_date, $html);
+    unless ($ht) {
+        $dbh->disconnect();
+        $end_time = time;
+        $craig->write_log("$todate dates data: total [ " . ($end_time - $start_time) . " ] seconds used.\n");
+        $craig->write_log( "There are total [ $num ] records was processed succesfully!\n");
+        $craig->close_log();
+        exit 6;
+    }
+
+    $page_url = $craig->parse_next_page($ht);
+    $page_url = $start_url . $page_url if ($page_url);
+
+    my $aoh = $craig->parse_main($ht);
+
+  POSTING:
+    foreach my $t (@{$aoh}) {
+        my $response = $mech->follow_link(url => $t->[0]);
+        next POSTING unless $response;    # link not found: nothing fetched
+        unless ($mech->success) {
+            $mech->back();                # leave the error page
+            next POSTING;
+        }
+
+        my ($pdt,$pemail,$phone,$web,$relevant) = $craig->parse_detail($mech->content);
+        if (defined $pemail) {
+            $pdt = ' ' unless ($pdt);
+
+            # The log keeps its previous format: SQL-quoted values.
+            my @q = map { $dbh->quote($_) }
+              @{$t}[0 .. 4], $phone, $web, $city, $relevant;   # st john's,NL
+            $craig->write_log("No: ".(++$num)." -- ["
+                . join(", ", @q[0 .. 4], $pdt, $pemail, @q[5 .. 7], $item)
+                . "], [".$q[8]."]\n");
+
+            $sth = $dbh->do($insert_sql, undef,
+                $t->[0], $t->[1], $relevant, $t->[2], $t->[3], $t->[4],
+                $pdt, $pemail, $phone, $web, $city, $item);
+        }
+
+        $mech->back();
+    }
+
+    last PAGE unless $page_url;
+}
+
+
+# Regular end of run: close the database, log the totals, close the log.
+# $sth->finish();
+$dbh->disconnect();
+
+$end_time = time;
+my $elapsed = $end_time - $start_time;
+$craig->write_log("$todate days data: total [ $elapsed ] seconds used.\n");
+$craig->write_log("There are total [ $num ] records was processed succesfully!\n");
+$craig->write_log("----------------------------------------------\n");
+$craig->close_log();
+
+exit 8;
+
+
+# Print the command-line help and terminate with exit status 3.
+# The text lists each switch once and only switches that GetOptions knows;
+# the earlier text had two conflicting option lists and an example built
+# from options that do not exist.
+sub usage
+{
+print <<HELP;
+Usage:
+    $0
+  or:
+    $0 -c city -i category
+  or:
+    $0 -t 3
+  or:
+    $0 -k keyword -e email
+  or:
+    $0 -f
+  or:
+    $0 -h | -v
+
+Description:
+    -f  list all Canadian cities in the database and exit
+    -t  from what date to download? default it's from 2 days before.
+    -c  city, which city to scrape?
+    -i  category/item, which category/item to scrape?
+    -k  search by keywords, what keyword to search?
+    -e  search by email, what email to search
+    -h  this help
+    -v  version
+
+Example:
+    $0 -c vancouver -i housing -t 3
+
+HELP
+exit 3;
+}
+
245 jobs.pl
@@ -0,0 +1,245 @@
+#! /cygdrive/c/Perl/bin/perl.exe
+# $Id$
+# $0 -c vancouver -i jobs
+# http://www.tutorialspoint.com/mysql/mysql-handling-duplicates.htm
+# Issues:
+# 1. follow_link(): html doesn't exists.
+# 2. incremently scraping test.
+
+use lib qw(../lib/);
+use craig_config;
+use craig;
+use db;
+
+use warnings;
+use strict;
+use Data::Dumper;
+use FileHandle;
+use WWW::Mechanize;
+use DBI;
+use Getopt::Long;
+our $dbh;
+
+# Unbuffered output; undefined $/ makes every file read a slurp.
+local($|) = 1;
+undef $/;
+
+#-----------------------------------
+# 0. initialize:
+#-----------------------------------
+our ($num, $start_time, $end_time) = (0,0,0);
+our ($html, $page_url, $today) = (undef, undef, []);
+my ($mech, $sth) = (undef, undef);
+
+my ($host, $user, $pass, $dsn) = (HOST, USER, PASS, DSN);
+$dsn .= ":hostname=$host";
+my $db = db->new($user, $pass, $dsn);
+$dbh = $db->{dbh};
+
+# NOTE(review): craig::new stores its argument as the database handle;
+# confirm that CRAIG (and not $dbh) is really what should be passed here.
+my $craig = craig->new(CRAIG) or die;
+my $log = $craig->get_filename(__FILE__);
+$craig->set_log($log);
+$start_time = time;
+
+my $start_url = URL4;
+
+# Option variables. All of them are declared before their first use:
+# $todate used to be read above its `my`, and $first / $date were never
+# declared at all, which `use strict` rejects at compile time.
+my ($city,$item) = (DEFAULT_CITY, DEFAULT_CATEGORY);
+my ($help,$keywords,$email,$version,$todate)
+    =(undef,undef,undef,undef,undef);
+my ($first, $date) = (undef, undef);
+
+usage() unless (GetOptions(
+    'first'      => \$first,
+    'date=s'     => \$date,
+    'city=s'     => \$city,
+    'item=s'     => \$item,
+    'keywords=s' => \$keywords,
+    'email=s'    => \$email,
+    'help|?'     => \$help,
+    'version'    => \$version
+));
+
+# Computed after option parsing and only when a day count exists.
+# TODO(review): no option assigns $todate yet, and $date is not read
+# anywhere in the visible part of this script - decide which one -d feeds.
+my $end_date = defined $todate ? $craig->get_end_date($todate) : undef;
+
+# -h: print the help text (usage() exits).
+$help && usage();
+
+# -v: print the version banner and exit with status 1.
+if ($version) {
+ print <<EOF;
+
+$0: Version 1.0
+EOF
+ exit 1;
+}
+
+# -f: print every city name returned by select_ca_cities(), one per line,
+# and exit with status 2.
+if ($first) {
+ my $ca1 = $craig->select_ca_cities();
+ foreach my $ca2 (@$ca1) {
+ foreach my $ca3 (@{$ca2}) {
+ print $ca3 . "\n";
+ }
+ }
+ exit 2;
+}
+
+# Build the start URL by concatenating the city result and the category
+# result; either lookup returning a false value aborts the script.
+# NOTE(review): select_city/select_category are called here as plain
+# functions, not as $craig methods, and no definition is visible in this
+# script - confirm where they come from.
+if ($city && $item) {
+ my ($r1, $r2) = ('', '');
+
+ $r1 = select_city($city);
+ if ($r1) {
+ print "City: <".$r1.">...\n";
+ $craig->write_log("City: <".$r1.">.");
+ }
+ else {
+ die "No such city: <".$city.">, $0 quit.";
+ }
+
+ $r2 = select_category($item);
+ if ($r2) {
+ print "Category: <".$r2.">...\n";
+ $craig->write_log("Category: <".$r2.">.");
+ }
+ else {
+ die "No such category: <".$item.">, $0 quit.";
+ }
+ $start_url = $r1 . $r2 if ($city && $item);
+ $craig->write_log("URL: <".$start_url.">.");
+ print $start_url . "\n";
+}
+# Optional searches by keyword and/or e-mail.
+# NOTE(review): these are also plain function calls with no definition in
+# view; presumably the $craig methods of the same name are meant - verify.
+if ($keywords && $email) {
+ select_keywords_email($keywords, $email);
+}
+elsif ($keywords) {
+ select_keywords($keywords);
+}
+elsif ($email) {
+ select_email($email);
+}
+
+$craig->write_log("[".$log."]: start at: [".localtime() . "].");
+
+# autocheck => 0: request failures are checked by hand via $mech->success.
+$mech = WWW::Mechanize->new( autocheck => 0 );
+
+
+# Fetch the start page and take the first listing URL from it.
+# NOTE(review): parse_cgi_page is called as a plain function here - confirm.
+$mech->get($start_url);
+$mech->success or die $mech->response->status_line;
+$page_url = parse_cgi_page($mech->content);
+
+# Scrape loop: open a listing page, extract its postings, visit each one and
+# store the extracted data in TOPIC, then continue with the next listing
+# page until there is none.
+#
+# Changes from the goto-based version:
+#  * post_time, email and category were pasted into the INSERT unescaped;
+#    every value is now bound through a placeholder.
+#  * follow_link() returns undef when the link is not on the page; this is
+#    now detected instead of parsing the listing page and calling back() on
+#    it. A request that was made but failed is undone with back().
+#
+# NOTE(review): parse_date, parse_jobs, parse_next_page and parse_jobs2 are
+# called as plain functions and are not defined in the visible part of this
+# script - confirm where they come from.
+my $insert_sql = qq{ insert ignore into }.TOPIC.qq{
+    (url,keywords,relevant,location,item_url,item,post_time,email,phone,
+    web,city,category,date)
+    values(?,?,?,?,?,?,?,?,?,?,?,?,now())};
+
+PAGE:
+while (1) {
+    $mech->follow_link(url => $page_url);
+    $mech->success or die $mech->response->status_line;
+    $html = $mech->content;
+
+    my $todates = parse_date($todate, $mech->content);
+    unless ($todates->[1]) {
+        print "No Data can be extracted.\n";
+        exit;
+    }
+    my $aoh = parse_jobs($todates->[1]);
+
+    $page_url = parse_next_page($mech->content);
+
+  POSTING:
+    foreach my $t (@{$aoh}) {
+        my $response = $mech->follow_link(url => $t->[0]);
+        next POSTING unless $response;    # link not found: nothing fetched
+        unless ($mech->success) {
+            $mech->back();                # leave the error page
+            next POSTING;
+        }
+
+        my ($pdt,$pemail,$phone,$web,$relevant) = parse_jobs2($mech->content);
+        if (defined $pemail) {
+            $pdt = ' ' unless ($pdt);
+
+            # The log keeps its previous format: SQL-quoted values.
+            my @q = map { $dbh->quote($_) }
+              @{$t}[0 .. 4], $phone, $web, $city;    # st john's,NL
+            $craig->write_log("No: ".(++$num)." -- ["
+                . join(", ", @q[0 .. 4], $pdt, $pemail, @q[5 .. 7], $item)
+                . "]\n");
+
+            $sth = $dbh->do($insert_sql, undef,
+                $t->[0], $t->[1], $relevant, $t->[2], $t->[3], $t->[4],
+                $pdt, $pemail, $phone, $web, $city, $item);
+        }
+
+        $mech->back();
+    }
+
+    last PAGE unless $page_url;
+}
+
+
+
+# Regular end of run: close the database, log the totals, close the log.
+$dbh->disconnect();
+# $fd->close() if ($fd);
+
+$end_time = time;
+my $elapsed = $end_time - $start_time;
+
+$craig->write_log("There are total [ $num ] records was processed succesfully!\n");
+
+# $craig->write_log("<$todates->[0]>: Finally, there are total [ " . ($end_time - $start_time) . " ] seconds used.\n");
+
+$craig->write_log("Finally, there are total [ $elapsed ] seconds used.\n");
+$craig->close_log();
+
+exit;
+
+
+# Print the command-line help and terminate with exit status 1.
+# Spelling mistakes in the text are corrected and -f, which GetOptions
+# accepts, is now listed.
+sub usage
+{
+print <<HELP;
+Usage:
+    $0 [-d]
+  or:
+    $0 -c city -i category
+  or:
+    $0 -k keyword -e email
+  or:
+    $0 -f
+  or:
+    $0 -h [-v]
+Description:
+    -c  city, which city to scrape?
+    -i  category/item, which category/item to scrape?
+    -k  search by keywords, what keyword to search?
+    -e  search by email, what email to search
+    -f  list all Canadian cities in the database and exit
+    -h  this help
+    -v  version
+
+Examples:
+    (1) $0                      # use default
+    (2) $0 -d                   # such as `date +'%a %d %b' -d "2 day ago"`
+    (3) $0 -c vancouver -i jobs # scrape vancouver's jobs
+    (4) $0 -c calgary -i 'software / qa / dba'
+    (5) $0 -k 'php developer' -e 'email\@dummy.com' # search keywords & email
+    (6) $0 -k 'php developer'   # search keywords
+    (7) $0 -e 'email\@dummy.com' # search email
+    (8) $0 -h                   # get help
+    (9) $0 -v                   # get version
+
+HELP
+exit 1;
+}
+
+# Detach from the invoking shell: ignore INT, QUIT, TERM, PIPE and CHLD,
+# fork, let the parent exit with status 4 and continue in the child with
+# unbuffered STDOUT. Dies with the system error if fork fails.
+sub init_env
+{
+    $SIG{$_} = 'IGNORE' for qw(INT QUIT TERM PIPE CHLD);
+
+    my $pid = fork();
+    die "$!" unless defined $pid;
+    exit 4 if $pid;    # parent leaves, child carries on
+
+    select(STDOUT);
+    $| = 1;
+}
+
+# Pin the command search path at compile time, before anything else runs.
+BEGIN {
+    $ENV{PATH} = '/usr/bin/:/bin/:.';
+}
12 mail.sh
@@ -0,0 +1,12 @@
+#! /bin/bash
+#uuu.sh $YESTERDAY >$HOME/craig/email.txt
+
+email="$HOME/craig/email.txt"
+
+cd $HOME/craig/
+
+YESTERDAY=`date --date=yesterday +'%F'`
+$HOME/craig/uuu.sh $YESTERDAY >$email
+
+/usr/bin/mail -s "This is an auto generated email for craigslist data in $YESTERDAY." "kevin@alumni.sfu.ca,shawnleslie@gmail.com,jxjwilliam@gmail.com" < $email
+
13 mail_resumes.sh
@@ -0,0 +1,13 @@
+#! /bin/bash
+#uuu.sh $YESTERDAY >$HOME/craig/email.txt
+
+email="$HOME/craig/email_resumes.txt"
+
+cd $HOME/craig/
+
+#YESTERDAY=`date --date=today +'%F'`
+YESTERDAY=`date --date=yesterday +'%F'`
+uuu_resumes.sh $YESTERDAY >$email
+
+/usr/bin/mail -s "This is an auto generated email for craigslist resumes section data in $YESTERDAY." "info@jasonwettstein.com,shawnleslie@gmail.com,jxjwilliam@gmail.com" < $email
+
6 readme
@@ -0,0 +1,6 @@
+running:
+$ ./uscraig.sh 1
+
+1. xampp->start webserver and DB server
+
+2. http://localhost/jaabuu/
93 table.sql
@@ -0,0 +1,93 @@
+use craig;
+
+/*
+cid INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
+sid int UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
+cid INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
+iid INT UNSIGNED AUTO_INCREMENT NOT NULL PRIMARY KEY,
+*/
+
+-- Country/state table; (re)created from scratch, so existing rows are lost.
+-- Rows are identified by surl. selected is a Y/N flag that defaults to 'N'.
+-- Presumably sname is the display name and area the enclosing region;
+-- verify against the script that loads this table.
+DROP TABLE IF EXISTS craigslist_country_state;
+CREATE TABLE craigslist_country_state
+(
+ sname VARCHAR(100) NOT NULL,
+ surl VARCHAR(255) NOT NULL,
+ area VARCHAR(100) NULL,
+ selected enum('Y', 'N') not null default 'N',
+ sdate DATETIME,
+ PRIMARY KEY (surl)
+);
+
+-- City table; (re)created from scratch, so existing rows are lost.
+-- Rows are identified by curl. selected is a Y/N flag that defaults to 'N'.
+-- Presumably area2 holds the country ('canada', 'united states') that the
+-- city look-ups in craig.pm filter on; confirm that its CITY constant names
+-- this table.
+DROP TABLE IF EXISTS craigslist_city;
+CREATE TABLE craigslist_city
+(
+ cname VARCHAR(100) NOT NULL,
+ curl VARCHAR(255) NOT NULL,
+ area1 VARCHAR(100) NULL,
+ area2 VARCHAR(100) NULL,
+ selected enum('Y', 'N') not null default 'N',
+ cdate DATETIME,
+ PRIMARY KEY (curl)
+);
+
+-- Category table; (re)created from scratch, so existing rows are lost.
+-- Rows are identified by curl; cname is the category name.
+-- selected is a Y/N flag that defaults to 'N'.
+DROP TABLE IF EXISTS craigslist_category;
+CREATE TABLE craigslist_category
+(
+ cname VARCHAR(100) NOT NULL,
+ curl VARCHAR(255) NOT NULL,
+ selected enum('Y', 'N') not null default 'N',
+ cdate DATETIME,
+ PRIMARY KEY (curl)
+);
+
+-- Item table; (re)created from scratch, so existing rows are lost.
+-- Rows are identified by iurl; category names the category an item belongs
+-- to. selected is a Y/N flag that defaults to 'N'.
+DROP TABLE IF EXISTS craigslist_item;
+CREATE TABLE craigslist_item
+(
+ iname VARCHAR(100) NOT NULL,
+ iurl VARCHAR(255) NOT NULL,
+ category VARCHAR(100) NOT NULL,
+ selected enum('Y', 'N') not null default 'N',
+ idate DATETIME,
+ PRIMARY KEY (iurl)
+);
+
+
+-- Posting table; (re)created from scratch, so existing rows are lost.
+-- Rows are identified by url, so each posting URL can be stored only once.
+-- NOTE(review): relevant is limited to 200 characters and url/item_url to
+-- 100; check that the scraped values fit.
+DROP TABLE IF EXISTS craigslist_topic;
+CREATE TABLE craigslist_topic
+(
+ url VARCHAR(100) NOT NULL,
+ keywords VARCHAR(255) NOT NULL,
+ relevant VARCHAR(200) NOT NULL,
+ location VARCHAR(100) NULL,
+ item_url varchar(100) NOT NULL,
+ item VARCHAR(100) NOT NULL,
+ post_time DATETIME,
+ email VARCHAR(255) NOT NULL,
+ phone VARCHAR(30) NULL,
+ web VARCHAR(255) NULL,
+ city VARCHAR(100) NOT NULL,
+ category VARCHAR(100) NOT NULL,
+ date DATETIME,
+ primary key (url)
+);
+
+DROP TABLE IF EXISTS craigslist_usjobs;
+CREATE TABLE craigslist_usjobs
+(
+ id int unsigned not null auto_increment primary key,
+ url VARCHAR(100) NOT NULL,
+ keywords VARCHAR(255) NOT NULL,
+ relevant VARCHAR(200) NOT NULL,
+ location VARCHAR(100) NULL,
+ item_url varchar(100) NOT NULL,
+ item VARCHAR(100) NOT NULL,
+ post_time DATETIME,
+ email VARCHAR(255) NOT NULL,
+ phone VARCHAR(30) NULL,
+ web VARCHAR(255) NULL,
+ city VARCHAR(100) NOT NULL,
+ category VARCHAR(100) NOT NULL,
+ date DATETIME,
+ unique (url)