Permalink
Browse files

fix calls to knownWork, tidy up bib parsing. Closes #33

  • Loading branch information...
1 parent a2ff855 commit e3ba741873bcb78237e88740a0417f04b1ab79fd @wo committed Aug 6, 2011
Showing with 71 additions and 74 deletions.
  1. +54 −50 Extractor.pm
  2. +13 −20 rules/Bib_Features.pm
  3. +1 −1 rules/Helper.pm
  4. +3 −2 rules/Keywords.pm
  5. +0 −1 rules/Line_Features.pm
View
@@ -2,9 +2,9 @@ package Extractor;
use strict;
use warnings;
use Memoize;
-use List::Util qw/min max reduce/;
+use List::Util qw/min max reduce first/;
use Statistics::Lite 'stddev';
-use Text::Names 'samePerson';
+use Text::Names qw/samePerson parseNames reverseName/;
use Cwd 'abs_path';
use File::Basename;
use Encode;
@@ -990,28 +990,25 @@ sub extract_bibliography {
say(5, "Quality: ", $parsing->{quality});
push @parsings, $parsing;
}
- say(5, "end of parsings");
@parsings = sort { $b->{quality} <=> $a->{quality} } @parsings;
my $parsing = shift @parsings;
say(3, "best parsing", $parsing->{text});
foreach my $block (@{$parsing->{blocks}}) {
- my $entry = $self->parsebib($block);
- if ($entry) {
- if (@{$entry->{authors}} && $entry->{authors}->[0] eq '-'
- && @{$self->{bibliography}}) {
- $entry->{authors} = $self->{bibliography}->[-1]->{authors};
- }
- push @{$self->{bibliography}}, $entry;
- }
+ # need to pass previous authors in case author field is '--':
+ my @last_authors = @{$self->{bibliography}} ?
+ @{$self->{bibliography}->[-1]->{authors}} : ();
+ my $entry = $self->parsebib($block, @last_authors);
+ push(@{$self->{bibliography}}, $entry) if $entry;
}
}
sub parsebib {
my $self = shift;
my $entry = shift;
+ my @last_authors = @_;
say(3, "\nparsing bib entry: ", $entry->{text});
$entry->{text} = tidy_text($entry->{text});
@@ -1070,22 +1067,16 @@ sub parsebib {
say(5, "evaluating parsing $counter (sat $satisfaction)");
my @blocks;
my $mkblock = make_block(' ');
- my ($author, $title);
+ my %fields;
for (my $i=0; $i < @$chunks; $i++) {
my $chunk = $chunks->[$i];
my $is = $chunk->{label};
- say(5, " $i: ",($is->{TITLE} ? 'TITLE ' : ''),
- ($is->{AUTHOR} ? 'AUTHOR ' : ''),
- ($is->{YEAR} ? 'YEAR ' : ''),
- '| ', $chunk->{text});
- if ($is->{TITLE} && $title || $is->{AUTHOR} && $author) {
+ my $label = first { $is->{$_} } @labels;
+ say(5, " $i: $label | ", $chunk->{text});
+ if (grep($label, ('AUTHOR', 'TITLE')) && $fields{$label}) {
say(5, "double title or author");
next PARSING;
}
- my $label = '';
- foreach (@labels) {
- $label = $_ if $is->{$_};
- }
my @block_chunks = ($chunk);
while ($chunks->[$i+1]
&& $chunks->[$i+1]->{label}->{$label}) {
@@ -1094,10 +1085,13 @@ sub parsebib {
my $block = $mkblock->(@block_chunks);
$block->{id} = scalar @blocks;
pushlink @blocks, $block;
- $title = $block if $is->{TITLE};
- $author = $block if $is->{AUTHOR};
+ $fields{$label} = $block;
}
my $parsing = parsing(\@blocks);
+ if ($fields{'AUTHORDASH'}) {
+ $parsing->{dash_authors} = \@last_authors;
+ }
+ $parsing->{bib} = bib_from_parsing($parsing);
my $quality = $evaluator->($parsing);
say(5, "Quality: $quality");
if ($quality > 0.5) {
@@ -1107,37 +1101,47 @@ sub parsebib {
}
}
- my $res = { authors => [] };
+ return { authors => [] } unless @parsings;
- if (@parsings) {
- @parsings = sort { $b->{quality} <=> $a->{quality} } @parsings;
+ @parsings = sort { $b->{quality} <=> $a->{quality} } @parsings;
- my $parsing = shift @parsings;
- say(3, "best parsing", $parsing->{text});
- foreach my $block (@{$parsing->{blocks}}) {
- if ($block->{label}->{TITLE}) {
- $res->{title} = tidy_text($block->{text});
- say(3, "title: $res->{title}");
- }
- elsif ($block->{label}->{AUTHORDASH}) {
- $res->{authors} = ['-'];
- say(3, 'authors: -');
- }
- elsif ($block->{label}->{AUTHOR}) {
- my @authors = Text::Names::parseNames($block->{text});
- @authors = map { Text::Names::reverseName($_) } @authors;
- $res->{authors} = \@authors;
- say(3, 'authors: '.join('; ', @authors));
- }
- elsif ($block->{label}->{YEAR} && !$res->{year}) {
- $res->{year} = $block->{text};
- $res->{year} =~ s/.*(\d{4}(?:$re_dash\d{2,4})?).*/$1/;
- say(3, "year: $res->{year}");
- }
+ my $parsing = shift @parsings;
+ say(3, "best parsing", $parsing->{text});
+ my $res = $parsing->{bib};
+ say(3, "authors: ", @{$res->{authors}}, "; ",
+ "title: $res->{title}; year: $res->{year}; ",
+ "known id: $res->{known_id}");
+ return $res;
+
+}
+
+sub bib_from_parsing {
+ my $parsing = shift;
+ my %fields;
+ foreach my $bl (@{$parsing->{blocks}}) {
+ foreach my $label (keys %{$bl->{label}}) {
+ $fields{$label} ||= $bl->{text};
}
- $res->{id} = $parsing->{known_id};
}
-
+ my $res;
+ $res->{title} = tidy_text($fields{TITLE} || '');
+ $res->{title} =~ s/[\.,]$//;
+ $res->{year} = $fields{YEAR} || '';
+ $res->{year} =~ s/.*(\d{4}(?:$re_dash\d{2,4})?).*/$1/;
+ $res->{authors} = [];
+ if ($fields{AUTHOR}) {
+ my @authors = Text::Names::parseNames($fields{AUTHOR});
+ @authors = map { Text::Names::reverseName($_) } @authors;
+ $res->{authors} = \@authors;
+ }
+ if ($parsing->{dash_authors}) {
+ $res->{authors} = [@{$parsing->{dash_authors}}, @{$res->{authors}}];
+ }
+ if ($verbosity > 4) {
+ say(5, 'tidied-up bib:');
+ use Data::Dumper;
+ print Dumper $res;
+ }
return $res;
}
View
@@ -31,6 +31,7 @@ $fragment_features{AUTHOR} = [
['early in entry', [0.2, -0.4]],
['in quotes', [-0.7, 0.05]],
['italic', [-0.4, 0.05]],
+ ['publication status', [-0.7, 0]],
['after year string', [-0.4, 0.1]],
['after parenthesis', [-0.3, 0.05]],
['continues author', [0.6, -0.6], 2],
@@ -57,6 +58,7 @@ $fragment_features{TITLE} = [
['after italics', [-0.4, 0]],
['after quote', [-0.4, 0.05]],
['contains editor string', [-0.5, 0]],
+ ['publication status', [-0.7, 0]],
['probable AUTHOR', [-0.2, 0.05], 2],
['part of best author sequence', [-0.3, 0.3], 2],
['contains journal or publisher word', [-0.2, 0], 2],
@@ -68,8 +70,8 @@ $fragment_features{TITLE} = [
];
$fragment_features{YEAR} = [
- ['year', [0.9, -0.8]],
- ['publication status', [0.9, -0.1]],
+ [$or->('year', 'publication status'), [0.8, -0.8]],
+ ['year', [0.9, -0.1]],
];
$fragment_features{OTHER} = [
@@ -164,14 +166,14 @@ my $re_year = '(?<!\d)[1-2]\d{3}(?!\d)';
$f{'after year string'} = memoize(sub {
my $w = $_[0];
while ($w = $w->{prev}) {
- return 1 if ($w->{text} =~ /$re_year|$re_year_words/);
+ return 1 if ($w->{text} =~ /$re_year|$re_year_words/i);
}
return 0;
});
$f{'follows year'} = sub {
if ($_[0]->{prev} &&
- $_[0]->{text} =~ /$re_year|$re_year_words/) {
+ $_[0]->{text} =~ /$re_year|$re_year_words/i) {
return 1;
}
return 0;
@@ -536,22 +538,13 @@ $f{'lengthy OTHER block before title'} = sub {
our $known_work = \&known_work;
$f{'is known work'} = sub {
- my (@authors, $title, $year);
- foreach my $bl (@{$_[0]->{blocks}}) {
- if ($bl->{label}->{TITLE}) {
- $title = $bl->{text};
- }
- elsif ($bl->{label}->{AUTHOR}) {
- push @authors, $bl->{text};
- }
- }
- return 0 unless (@authors && $title);
- # TODO: tidy up authors and title?
- my $id = $known_work->(authors => \@authors,
- title => $title,
- year => $year);
- $_[0]->{known_id} = $id if $id;
- return 1;
+ my $bib = $_[0]->{bib};
+ return 0 unless (@{$bib->{authors}} && $bib->{title});
+ my $id = $known_work->(authors => $bib->{authors},
+ title => $bib->{title},
+ year => $bib->{year});
+ $bib->{known_id} = $id if $id;
+ return $id ? 1 : 0;
};
View
@@ -208,7 +208,7 @@ sub known_work {
my $work = {
authors => [ split(/, /, $authors) ],
title => $title,
- date => undef
+ date => ''
};
if (sameWork(\%args, $work)) {
return $id;
View
@@ -216,8 +216,9 @@ our $re_lquote = '["“`‘¨‘‛‟„‵‶‷❛❝]';
our $re_rquote = '["¨´’’‛”′″‴⁗❜❞]';
-our $re_year_words =
- '\b(?:forthcoming|manuscript|unpublished|typescript|draft)\b';
+our $re_year_words = qr/\b(?:
+ forthcoming|manuscript|unpublished|typescript|draft
+ )/ix;
my $re_bad_abstract = qr/(?:
^\s*<sup>|
View
@@ -5,7 +5,6 @@ use File::Basename;
use Cwd 'abs_path';
use String::Approx 'amatch';
use Memoize;
-use Text::Names;
use List::Util qw/min max/;
use util::Functools qw/someof allof/;
use util::String;

0 comments on commit e3ba741

Please sign in to comment.