From db0f41d6fc0b0b20c8ee739b6c3607d254db6fc9 Mon Sep 17 00:00:00 2001 From: Olaf Alders Date: Wed, 5 Jan 2011 00:37:45 -0500 Subject: [PATCH 1/7] Adds META.yml to dist info. --- lib/MetaCPAN/Dist.pm | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/lib/MetaCPAN/Dist.pm b/lib/MetaCPAN/Dist.pm index ff7cddf68..eea832d06 100644 --- a/lib/MetaCPAN/Dist.pm +++ b/lib/MetaCPAN/Dist.pm @@ -2,13 +2,15 @@ package MetaCPAN::Dist; use Archive::Tar; use Archive::Tar::Wrapper; +use Data::Dump qw( dump ); use Devel::SimpleTrace; use File::Slurp; use Moose; use MooseX::Getopt; use Modern::Perl; -use Data::Dump qw( dump ); use Try::Tiny; +use WWW::Mechanize::Cached; +use YAML; use MetaCPAN::Pod::XHTML; @@ -34,6 +36,7 @@ has 'files' => ( lazy_build => 1, ); +has 'mech' => ( is => 'rw', lazy_build => 1 ); has 'module' => ( is => 'rw', isa => 'MetaCPAN::Schema::Result::Module' ); has 'module_rs' => ( is => 'rw' ); @@ -266,8 +269,17 @@ sub index_dist { my $module = $self->module; my $dist_name = $module->distvname; $dist_name =~ s{\-\d.*}{}g; - + my $data = { name => $dist_name, author => $module->pauseid }; + + my $res = $self->mech->get( $self->source_url('META.yml') ); + + if ( $res->code == 200 ) { + # wrap this in some flavour of eval? + my $meta_yml = Load( $res->content ); + $data->{meta_yml} = $meta_yml; + } + my @cols = ( 'download_url', 'archive', 'release_date', 'version', 'distvname' ); @@ -296,8 +308,7 @@ sub index_module { my $dist_name = $module->distvname; $dist_name =~ s{\-\d.*}{}g; - my $src_url = sprintf( 'http://search.metacpan.org/source/%s/%s/%s', - $module->pauseid, $module->distvname, $module->file ); + my $src_url = $self->source_url( $module->file ); my $data = { name => $module->name, @@ -390,6 +401,13 @@ sub _build_files { } +sub _build_mech { + + my $self = shift; + return WWW::Mechanize::Cached->new( autocheck => 0 ); + +} + sub _build_metadata { my $self = shift; @@ -479,6 +497,15 @@ sub set_archive_parent { } +sub source_url { + + my $self = shift; + my $file = shift; + return sprintf( 'http://search.metacpan.org/source/%s/%s/%s', + $self->module->pauseid, $self->module->distvname, $file ); + +} + 1; =pod From 16263b05f9812e179effb2408b6051a172786e45 Mon Sep 17 00:00:00 2001 From: Olaf Alders Date: Wed, 5 Jan 2011 00:46:21 -0500 Subject: [PATCH 2/7] Adds abstract to module info. --- lib/MetaCPAN/Dist.pm | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/lib/MetaCPAN/Dist.pm b/lib/MetaCPAN/Dist.pm index eea832d06..873aee663 100644 --- a/lib/MetaCPAN/Dist.pm +++ b/lib/MetaCPAN/Dist.pm @@ -8,6 +8,7 @@ use File::Slurp; use Moose; use MooseX::Getopt; use Modern::Perl; +use Pod::POM; use Try::Tiny; use WWW::Mechanize::Cached; use YAML; @@ -164,6 +165,24 @@ sub process_cookbooks { } +sub get_abstract { + + my $self = shift; + my $parser = Pod::POM->new; + my $pom = $parser->parse_text( shift ) || return; + + foreach my $s ( @{ $pom->head1 } ) { + if ( $s->title eq 'NAME' ) { + my $content = $s->content; + $content =~ s{\A.*\-\s}{}; + $content =~ s{\s*\z}{}; + return $content; + } + } + + return; +} + sub get_content { my $self = shift; @@ -251,7 +270,8 @@ sub index_pod { #my %cols = $module->get_columns; #say dump( \%cols ); - $self->index_module( $file ); + my $abstract = $self->get_abstract( $content ); + $self->index_module( $file, $abstract ); push @{ $self->es_inserts }, \%pod_insert; @@ -304,6 +324,7 @@ sub index_module { my $self = shift; my $file = shift; + my $abstract = shift; my $module = $self->module; my $dist_name = $module->distvname; $dist_name =~ s{\-\d.*}{}g; @@ -323,6 +344,8 @@ sub index_module { foreach my $col ( @cols ) { $data->{$col} = $module->$col; } + + $data->{abstract} = $abstract if $abstract; my %es_insert = ( index => { @@ -333,7 +356,7 @@ sub index_module { } ); - #say dump( \%es_insert ); + say dump( \%es_insert ); push @{ $self->es_inserts }, \%es_insert; } From bc8bfecc3cd7fc5823e6e10895c30f0146b9508e Mon Sep 17 00:00:00 2001 From: Olaf Alders Date: Wed, 5 Jan 2011 22:41:29 -0600 Subject: [PATCH 3/7] Fixes uninitialized warning. --- lib/MetaCPAN/Author.pm | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/MetaCPAN/Author.pm b/lib/MetaCPAN/Author.pm index ab2f57360..6c9926928 100755 --- a/lib/MetaCPAN/Author.pm +++ b/lib/MetaCPAN/Author.pm @@ -100,7 +100,11 @@ sub author_config { # uncomment this when search.metacpan can deal with lists in values my @lists = qw( website email books blog_url blog_feed cats dogs ); foreach my $key ( @lists ) { - if ( exists $conf->{$key} && reftype( $conf->{$key} ) ne 'ARRAY' ) { + if (exists $conf->{$key} + && ( !reftype( $conf->{$key} ) + || reftype( $conf->{$key} ) ne 'ARRAY' ) + ) + { $conf->{$key} = [ $conf->{$key} ]; } } @@ -109,6 +113,8 @@ sub author_config { } + + sub _build_author_fh { my $self = shift; From be4090e373412a31bc7cbc20d1b7f51546d09bd3 Mon Sep 17 00:00:00 2001 From: Olaf Alders Date: Wed, 5 Jan 2011 22:42:09 -0600 Subject: [PATCH 4/7] Adds abstract to module mapping. --- elasticsearch/map_modules.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/elasticsearch/map_modules.pl b/elasticsearch/map_modules.pl index c69477003..caf5e6973 100755 --- a/elasticsearch/map_modules.pl +++ b/elasticsearch/map_modules.pl @@ -30,6 +30,7 @@ sub put_mapping { #_source => { compress => 1 }, properties => { + abstract => { type => "string" }, archive => { type => "string" }, author => { type => "string" }, distname => { type => "string" }, From 185af6db44c53c3d5ede8db724e9d1b2867be10d Mon Sep 17 00:00:00 2001 From: Olaf Alders Date: Wed, 5 Jan 2011 22:42:45 -0600 Subject: [PATCH 5/7] Extends timeout on ES indexing --- lib/MetaCPAN/Role/Common.pm | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/MetaCPAN/Role/Common.pm b/lib/MetaCPAN/Role/Common.pm index a75a2217d..fe78d6e4b 100644 --- a/lib/MetaCPAN/Role/Common.pm +++ b/lib/MetaCPAN/Role/Common.pm @@ -49,7 +49,8 @@ sub _build_es { my $e = ElasticSearch->new( servers => 'localhost:9200', - transport => 'http', # default 'http' + transport => 'httplite', # default 'http' + timeout => 300, #trace_calls => 'log_file', ); From d00f1da7ac5f00b864eecd651f7dfb21ff3cfe89 Mon Sep 17 00:00:00 2001 From: Olaf Alders Date: Wed, 5 Jan 2011 22:43:06 -0600 Subject: [PATCH 6/7] Stringifies vstrings created when META.yml files are loaded. Indexing now skips already indexed dists by default. --- elasticsearch/index_dists.pl | 38 +++++++------- lib/MetaCPAN/Dist.pm | 97 ++++++++++++++++++++++++++++++------ 2 files changed, 102 insertions(+), 33 deletions(-) diff --git a/elasticsearch/index_dists.pl b/elasticsearch/index_dists.pl index 57ea991b7..f76a2d584 100755 --- a/elasticsearch/index_dists.pl +++ b/elasticsearch/index_dists.pl @@ -24,7 +24,8 @@ if ( $cpan->dist_like ) { say "searching for dists like: " . $cpan->dist_like; - $dists = search_dists( { dist => { like => $cpan->dist_like, '!=' => undef, } } ); + $dists = search_dists( + { dist => { like => $cpan->dist_like, '!=' => undef, } } ); } elsif ( $cpan->dist_name ) { @@ -37,7 +38,9 @@ $dists = search_dists(); } -foreach my $dist ( @{$dists} ) { +my %reverse = reverse %{$dists}; + +foreach my $dist ( sort values %{$dists} ) { process_dist( $dist ); } @@ -51,22 +54,28 @@ sub process_dist { say '+' x 20 . " DIST: $distvname" if $cpan->debug; - my $dist = MetaCPAN::Dist->new( distvname => $distvname, module_rs => $cpan->module_rs ); + my $dist = MetaCPAN::Dist->new( + distvname => $distvname, + dist_name => $reverse{$distvname}, + module_rs => $cpan->module_rs + ); + $dist->process; say "Found " . scalar @{ $dist->processed } . " modules in dist"; - $dist->tar->clear if $dist->tar; + + #$dist->tar->clear if $dist->tar; $dist = undef; - + ++$attempts; - + # diagnostics if ( every( $every ) ) { my $iter_time = tv_interval( $t0, [gettimeofday] ); my $elapsed = tv_interval( $t_begin, [gettimeofday] ); say '#' x 78; - + say "$distvname"; # if $icpan->debug; say "$iter_time to process dist"; say "$elapsed so far... ($attempts dists out of $total_dists)"; @@ -81,7 +90,6 @@ sub process_dist { } - return; } @@ -92,23 +100,19 @@ sub search_dists { my $search = $cpan->module_rs->search( $constraints, - { +select => ['distvname', 'dist',], + { +select => [ 'distvname', 'dist', ], distinct => 1, order_by => 'distvname ASC', } ); - - my %dist = ( ); + + my %dist = (); while ( my $row = $search->next ) { $dist{ $row->dist } = $row->distvname; } + say "found " . scalar (keys %dist) . " dists"; - my @dists = sort values %dist; - - $total_dists = scalar @dists; - say "found $total_dists distros"; - - return \@dists; + return \%dist; } diff --git a/lib/MetaCPAN/Dist.pm b/lib/MetaCPAN/Dist.pm index 873aee663..da13a2d27 100644 --- a/lib/MetaCPAN/Dist.pm +++ b/lib/MetaCPAN/Dist.pm @@ -20,6 +20,8 @@ with 'MetaCPAN::Role::DB'; has 'archive_parent' => ( is => 'rw', ); +has 'dist_name' => ( is => 'rw', ); + has 'distvname' => ( is => 'rw', required => 1, @@ -37,6 +39,7 @@ has 'files' => ( lazy_build => 1, ); +has 'max_bulk' => ( is => 'rw', default => 10 ); has 'mech' => ( is => 'rw', lazy_build => 1 ); has 'module' => ( is => 'rw', isa => 'MetaCPAN::Schema::Result::Module' ); @@ -68,6 +71,7 @@ has 'tar_wrapper' => ( lazy_build => 1, ); +has 'update_only' => ( is => 'rw', default => 1 ); sub archive_path { @@ -80,6 +84,13 @@ sub process { my $self = shift; my $success = 0; + + # skip dists already in the index + if ( $self->update_only && $self->is_indexed ) { + say '-'x200 . 'skipped: ' . $self->distvname; + return; + } + my $module_rs = $self->module_rs->search({ distvname => $self->distvname }); my @modules = (); @@ -117,12 +128,13 @@ MODULE: } - $self->index_dist; $self->process_cookbooks; + $self->index_dist; if ( $self->es_inserts ) { + #$self->es->transport->JSON->convert_blessed(1); my $result = $self->es->bulk( $self->es_inserts ); - #say dump( $self->es_inserts ); + #say dump( $result ); } elsif ( $self->debug ) { @@ -165,6 +177,22 @@ sub process_cookbooks { } +sub push_inserts { + + my $self = shift; + my $inserts = shift; + + push @{$self->es_inserts}, @{$inserts}; + if ( scalar @{$self->es_inserts } > $self->max_bulk ) { + my $result = $self->es->bulk( $self->es_inserts ); + say dump( $result ); + $self->es_inserts([]); + } + + return; + +} + sub get_abstract { my $self = shift; @@ -273,7 +301,7 @@ sub index_pod { my $abstract = $self->get_abstract( $content ); $self->index_module( $file, $abstract ); - push @{ $self->es_inserts }, \%pod_insert; + $self->push_inserts([ \%pod_insert ]); # if this line is uncommented some pod, like Dancer docs gets skipped delete $self->files->{$file}; @@ -287,19 +315,29 @@ sub index_dist { my $self = shift; my $module = $self->module; - my $dist_name = $module->distvname; - $dist_name =~ s{\-\d.*}{}g; - - my $data = { name => $dist_name, author => $module->pauseid }; - my $res = $self->mech->get( $self->source_url('META.yml') ); - + my $data = { name => $self->dist_name, author => $module->pauseid }; + my $res = $self->mech->get( $self->source_url( 'META.yml' ) ); + if ( $res->code == 200 ) { - # wrap this in some flavour of eval? - my $meta_yml = Load( $res->content ); - $data->{meta_yml} = $meta_yml; + + # some meta files are missing a trailing newline + my $meta_yml = try { Load( $res->content . "\n" ) } catch {undef}; + + if ( exists $meta_yml->{provides} ) { + foreach my $key ( keys %{ $meta_yml->{provides} } ) { + if ( exists $meta_yml->{provides}->{$key}->{version} ) { + $meta_yml->{provides}->{$key}->{version} .= ''; + } + } + } + if ( exists $meta_yml->{version} ) { + $meta_yml->{version} .= ''; + } + $data->{meta_yml} = $meta_yml if $meta_yml; + } - + my @cols = ( 'download_url', 'archive', 'release_date', 'version', 'distvname' ); @@ -307,19 +345,24 @@ sub index_dist { $data->{$col} = $module->$col; } + #say dump( $data ); + my %es_insert = ( index => { index => 'cpan', type => 'dist', - id => $dist_name, + id => $self->dist_name, data => $data, } ); - push @{ $self->es_inserts }, \%es_insert; + $self->push_inserts( [ \%es_insert ] ); + + return; } + sub index_module { my $self = shift; @@ -357,7 +400,7 @@ sub index_module { ); say dump( \%es_insert ); - push @{ $self->es_inserts }, \%es_insert; + $self->push_inserts([ \%es_insert ]); } @@ -388,6 +431,28 @@ sub get_files { } +sub is_indexed { + + my $self = shift; + my $success = 0; + say "looking for " . $self->dist_name; + my $get = try { + $self->es->get( + index => 'cpan', + type => 'dist', + id => $self->dist_name, + ); + }; + + if ( $get->{_source}->{distvname} eq $self->distvname ) { + return 1; + } + #say dump( $get ); + + return $success; + +} + sub _build_files { my $self = shift; From e5668f3981e907437f35266cdf4855cbc11f68a6 Mon Sep 17 00:00:00 2001 From: Olaf Alders Date: Wed, 5 Jan 2011 22:44:50 -0600 Subject: [PATCH 7/7] Adds a script which can be used to create and index a new instance of ES. --- elasticsearch/start_fresh.sh | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 elasticsearch/start_fresh.sh diff --git a/elasticsearch/start_fresh.sh b/elasticsearch/start_fresh.sh new file mode 100644 index 000000000..232939782 --- /dev/null +++ b/elasticsearch/start_fresh.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +perl delete_index.pl cpan +perl create_index.pl cpan +perl map_modules.pl +perl index_authors.pl +perl index_cpanratings.pl +perl index_dists.pl --refresh_db +#cd /home/cpan/elasticsearch && zip -r /home/cpan/data_snapshot.zip data