Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[VUFIND-1630] Alphabrowse: new normalizer for titles based on SolrMarc titleSortLower #3024

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
920ba46
[VUFIND-1630] Alphabrowse: new normalizer for titles based on SolrMar…
damien-git Aug 4, 2023
862a820
[VUFIND-1630] SOLR_JAR_PATH env + classpath fix
damien-git Aug 10, 2023
230f790
[VUFIND-1630] Classpath optimization
damien-git Aug 10, 2023
6eac2f8
[VUFIND-1630] Removed duplicate solrmarc_core.jar, updated browse-han…
damien-git Aug 10, 2023
44c8488
[VUFIND-1630] Updated browse-indexing.jar
damien-git Aug 14, 2023
b2e92f5
[VUFIND-1630] Added check for single solrmarc jar
damien-git Aug 18, 2023
81110e0
Merge branch 'dev' into fix_alphabrowse_title_normalizer
demiankatz Sep 7, 2023
0c19788
Add titleSortLower helper to XSLT code.
demiankatz Sep 7, 2023
81ba97c
Use new helper in XSLT examples.
demiankatz Sep 7, 2023
b915e4e
Fix whitespace glitch.
demiankatz Sep 7, 2023
d641363
Add another test case.
demiankatz Sep 7, 2023
0acbc25
[VUFIND-1630] Removed Log4j classpath as import/lib/reload4j-*.jar wi…
damien-git Sep 12, 2023
eca24b9
Merge branch 'dev' into fix_alphabrowse_title_normalizer
demiankatz Sep 12, 2023
beff9a5
Add cleanData logic.
demiankatz Sep 12, 2023
6818df3
php-cs-fixer.
demiankatz Sep 12, 2023
b9e80dd
Fix phpstan issues.
demiankatz Sep 12, 2023
d8fa924
Further phpstan refinement.
demiankatz Sep 12, 2023
b5bb9ea
More regex fixes.
demiankatz Sep 12, 2023
e895ce0
Add SOLR_JAR_PATH support and comment.
demiankatz Sep 13, 2023
61db918
Bare minimum SOLRMARC_CLASSPATH support.
demiankatz Sep 13, 2023
72c220d
Add counting/validation logic for jar detection.
demiankatz Sep 13, 2023
2f715f3
Merge branch 'dev' into fix_alphabrowse_title_normalizer
demiankatz Sep 14, 2023
2cacb9b
Expand/simplify AlphabrowseTest.
demiankatz Sep 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Binary file modified import/browse-indexing.jar
Binary file not shown.
2 changes: 2 additions & 0 deletions import/marc.properties
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ title_auth = 245ab, first
title_alt = 100t:130adfgklnpst:240a:246abnp:505t:700t:710t:711t:730adfgklnpst:740a
title_old = 780ast
title_new = 785ast
# If titleSortLower is changed in title_sort, the normalizer in solr/vufind/biblio/conf/solrconfig.xml,
# index-alphabetic-browse.sh and index-alphabetic-browse.bat should be changed accordingly.
title_sort = 245abkp,titleSortLower,first
series = 440ap:800abcdfpqt:830ap
series2 = 490a
Expand Down
2 changes: 1 addition & 1 deletion import/xsl/archivesspace.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@
<xsl:value-of select="dc:title[normalize-space()]"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(dc:title[normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(dc:title[normalize-space()])))"/>
</field>
</xsl:if>

Expand Down
2 changes: 1 addition & 1 deletion import/xsl/doaj.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@
<xsl:value-of select="oai_doaj:title[normalize-space()]"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(oai_doaj:title[normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(oai_doaj:title[normalize-space()])))"/>
</field>
</xsl:if>

Expand Down
2 changes: 1 addition & 1 deletion import/xsl/dspace-dim.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@
<xsl:value-of select="dim:field[@element='title']"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(dim:field[@element='title'][normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(dim:field[@element='title'][normalize-space()])))"/>
</field>
</xsl:if>

Expand Down
2 changes: 1 addition & 1 deletion import/xsl/dspace.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@
<xsl:value-of select="dc:title[normalize-space()]"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(dc:title[normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(dc:title[normalize-space()])))"/>
</field>
</xsl:if>

Expand Down
2 changes: 1 addition & 1 deletion import/xsl/gsdl.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@
<xsl:value-of select="dc:title[normalize-space()]"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(dc:title[normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(dc:title[normalize-space()])))"/>
</field>
</xsl:if>
<!-- PUBLISHER -->
Expand Down
2 changes: 1 addition & 1 deletion import/xsl/intech.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@
<xsl:value-of select="dc:title[normalize-space()]"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(dc:title[normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(dc:title[normalize-space()])))"/>
</field>
</xsl:if>

Expand Down
2 changes: 1 addition & 1 deletion import/xsl/ndltd.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@
<xsl:value-of select="dc:title[normalize-space()]"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(dc:title[normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(dc:title[normalize-space()])))"/>
</field>
</xsl:if>

Expand Down
2 changes: 1 addition & 1 deletion import/xsl/nlm_ojs.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@
<xsl:value-of select="//nlm:article-title[normalize-space()]"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(//nlm:article-title[normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(//nlm:article-title[normalize-space()])))"/>
</field>
<field name="title_alt">
<xsl:value-of select="//nlm:trans-title[normalize-space()]"/>
Expand Down
4 changes: 2 additions & 2 deletions import/xsl/ojs.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@
<xsl:value-of select="dc:title[@xml:lang=$preferred_lang][normalize-space()]"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(dc:title[@xml:lang=$preferred_lang][normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(dc:title[@xml:lang=$preferred_lang][normalize-space()])))"/>
</field>
<xsl:for-each select="dc:title[@xml:lang!=$preferred_lang][normalize-space()]">
<field name="title_alt">
Expand All @@ -137,7 +137,7 @@
<xsl:value-of select="."/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(.))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(.)))"/>
</field>
</xsl:if>
<xsl:if test="position()>1">
Expand Down
2 changes: 1 addition & 1 deletion import/xsl/subjectsplus.xsl
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@
<xsl:value-of select="dc:title[normalize-space()]"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string(dc:title[normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string(dc:title[normalize-space()])))"/>
</field>
</xsl:if>

Expand Down
2 changes: 1 addition & 1 deletion import/xsl/vudl_FOXML.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -455,7 +455,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
<xsl:value-of select="$DC//dc:title[normalize-space()]"/>
</field>
<field name="title_sort">
<xsl:value-of select="php:function('VuFind::stripArticles', string($DC//dc:title[normalize-space()]))"/>
<xsl:value-of select="php:function('VuFind::titleSortLower', php:function('VuFind::stripArticles', string($DC//dc:title[normalize-space()])))"/>
</field>

<!-- title_alt / dc:title[gt 1] -->
Expand Down
21 changes: 19 additions & 2 deletions index-alphabetic-browse.bat
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,23 @@ set SOLR_JAR_PATH=%SOLR_HOME%\..\vendor
:solrjarpathfound

cd %VUFIND_HOME%\import
SET CLASSPATH="browse-indexing.jar;%VUFIND_HOME%\import\lib\*;%SOLR_HOME%\jars\*;%SOLR_JAR_PATH%\modules\analysis-extras\lib\*;%SOLR_JAR_PATH%\server\solr-webapp\webapp\WEB-INF\lib\*"
rem Locate the SolrMarc core jar in the current (import) directory.
rem Delayed expansion is needed so the counter can be updated inside the loop.
setlocal enabledelayedexpansion
rem SOLRMARC_MATCHCOUNT starts as "x" and gains one "x" per matching file, so
rem "x" means no match, "xx" exactly one match, anything longer several matches.
set SOLRMARC_MATCHCOUNT=x
for %%a in (solrmarc_core*.jar) do (
  set SOLRMARC_CLASSPATH=%%a
  set SOLRMARC_MATCHCOUNT=!SOLRMARC_MATCHCOUNT!x
)
setlocal disabledelayedexpansion
rem Make sure we found one, and only one, SolrMarc jar file
if "%SOLRMARC_MATCHCOUNT%"=="xx" goto onesolrmarcfound
if "%SOLRMARC_MATCHCOUNT%"=="x" goto nosolrmarcfound
echo Error: more than one solrmarc_core*.jar in import; exiting.
goto end
:nosolrmarcfound
echo Error: could not find solrmarc_core*.jar in import; exiting.
goto end
:onesolrmarcfound
rem The single SolrMarc jar goes on the classpath right after browse-indexing.jar.
SET CLASSPATH="browse-indexing.jar;%SOLRMARC_CLASSPATH%;%VUFIND_HOME%\import\lib\*;%SOLR_HOME%\jars\*;%SOLR_JAR_PATH%\modules\analysis-extras\lib\*;%SOLR_JAR_PATH%\server\solr-webapp\webapp\WEB-INF\lib\*"

SET bib_index=%SOLR_HOME%\biblio\index
SET auth_index=%SOLR_HOME%\authority\index
Expand All @@ -67,8 +83,9 @@ if exist %index_dir% goto nomakeindexdir
mkdir "%index_dir%"
:nomakeindexdir

rem These parameters should match the ones in solr/vufind/biblio/conf/solrconfig.xml - BrowseRequestHandler
call %VUFIND_HOME%\index-alphabetic-browse.bat build_browse hierarchy hierarchy_browse
call %VUFIND_HOME%\index-alphabetic-browse.bat build_browse title title_fullStr 1 "-Dbibleech=StoredFieldLeech -Dsortfield=title_sort -Dvaluefield=title_fullStr"
call %VUFIND_HOME%\index-alphabetic-browse.bat build_browse title title_fullStr 1 "-Dbibleech=StoredFieldLeech -Dsortfield=title_sort -Dvaluefield=title_fullStr -Dbrowse.normalizer=org.vufind.util.TitleNormalizer"
call %VUFIND_HOME%\index-alphabetic-browse.bat build_browse topic topic_browse
call %VUFIND_HOME%\index-alphabetic-browse.bat build_browse author author_browse
call %VUFIND_HOME%\index-alphabetic-browse.bat build_browse lcc callnumber-raw 1 "-Dbrowse.normalizer=org.vufind.util.LCCallNormalizer"
Expand Down
11 changes: 9 additions & 2 deletions index-alphabetic-browse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,13 @@ set -e
set -x

cd "`dirname $0`/import"
CLASSPATH="browse-indexing.jar:${VUFIND_HOME}/import/lib/*:${SOLR_HOME}/jars/*:${SOLR_JAR_PATH}/modules/analysis-extras/lib/*:${SOLR_JAR_PATH}/server/solr-webapp/webapp/WEB-INF/lib/*"
# Locate the SolrMarc core jar in the current (import) directory. Collecting the
# glob matches in an array counts each file exactly once, even if a file name
# contains whitespace.
SOLRMARC_JARS=(solrmarc_core*.jar)
# An unmatched glob is left as the literal pattern (or yields nothing when
# nullglob is enabled), so check that the first entry really exists.
if [[ ! -e "${SOLRMARC_JARS[0]}" ]]
then
  echo "Error: could not find solrmarc_core*.jar in import/; exiting."
  exit 1
fi
# Make sure we found one, and only one, SolrMarc jar file
if [[ ${#SOLRMARC_JARS[@]} -gt 1 ]]
then
  echo "Error: more than one solrmarc_core*.jar in import/; exiting."
  exit 1
fi
SOLRMARC_CLASSPATH="${SOLRMARC_JARS[0]}"

# make index work with replicated index
# current index is stored in the last line of index.properties
Expand Down Expand Up @@ -93,8 +99,9 @@ function build_browse
mv "${browse}_browse.db" "$index_dir/${browse}_browse.db-updated"
touch "$index_dir/${browse}_browse.db-ready"
}
# These parameters should match the ones in solr/vufind/biblio/conf/solrconfig.xml - BrowseRequestHandler
build_browse "hierarchy" "hierarchy_browse"
build_browse "title" "title_fullStr" 1 "-Dbibleech=StoredFieldLeech -Dsortfield=title_sort -Dvaluefield=title_fullStr"
build_browse "title" "title_fullStr" 1 "-Dbibleech=StoredFieldLeech -Dsortfield=title_sort -Dvaluefield=title_fullStr -Dbrowse.normalizer=org.vufind.util.TitleNormalizer"
build_browse "topic" "topic_browse"
build_browse "author" "author_browse"
build_browse "lcc" "callnumber-raw" 1 "-Dbrowse.normalizer=org.vufind.util.LCCallNormalizer"
Expand Down
131 changes: 131 additions & 0 deletions module/VuFind/src/VuFind/XSLT/Import/VuFind.php
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,137 @@ public static function stripArticles($in)
return $text;
}

/**
 * Strip accents from a string by transliterating Latin characters to ASCII
 * using ICU's "Latin-ASCII" transform.
 *
 * @param string $str String to process.
 *
 * @return string Processed string; the input is returned unchanged when the
 * transliterator cannot be created or the transliteration itself fails.
 */
public static function stripAccents(string $str): string
{
    // This helper may be called many times during an import, so build the
    // transliterator once and reuse it on later calls.
    static $transliterator = null;
    $transliterator ??= \Transliterator::create('Latin-ASCII;');

    // create() returns null on failure and transliterate() returns false on
    // failure; neither may leak out through the declared string return type.
    $result = $transliterator?->transliterate($str);
    return is_string($result) ? $result : $str;
}

/**
 * Replace punctuation in a string with spaces.
 *
 * Every run of consecutive punctuation and/or whitespace characters is
 * collapsed into one single space, mirroring what SolrMarc does.
 *
 * @param string $str String to process.
 *
 * @return string Processed string.
 */
public static function stripPunctuation(string $str): string
{
    // One or more adjacent POSIX punctuation or whitespace characters...
    $punctuationOrSpaceRun = '/[[:punct:]\s]+/';
    // ...are swapped for exactly one space.
    $singleSpace = ' ';
    return preg_replace($punctuationOrSpaceRun, $singleSpace, $str);
}

/**
 * Drop a leading '[' and/or a trailing ']' from a (trimmed) string, but only
 * when those are the sole square brackets present: either one matched pair
 * wrapping the whole text, or a single unmatched bracket at one end.
 *
 * Ported from SolrMarc's DataUtil class.
 *
 * @param string $str Text string with possible enclosing brackets
 *
 * @return string Trimmed string with the outer bracket(s) removed; strings with
 * any other bracket arrangement are only trimmed.
 */
public static function removeOuterBrackets(string $str): string
{
    $trimmed = trim($str);
    if ($trimmed === '') {
        return $trimmed;
    }

    $leftCount = substr_count($trimmed, '[');
    $rightCount = substr_count($trimmed, ']');
    $startsOpen = $trimmed[0] === '[';
    $endsClosed = substr($trimmed, -1) === ']';

    // Exactly one pair, wrapping the whole string: strip both ends.
    if ($startsOpen && $endsClosed && $leftCount === 1 && $rightCount === 1) {
        return substr($trimmed, 1, -1);
    }
    // Leading '[' and no ']' anywhere: strip the opening bracket.
    if ($startsOpen && $rightCount === 0) {
        return substr($trimmed, 1);
    }
    // Trailing ']' and no '[' anywhere: strip the closing bracket.
    if ($endsClosed && $leftCount === 0) {
        return substr($trimmed, 0, -1);
    }
    return $trimmed;
}

/**
 * Port of logic from SolrMarc's DataUtil::cleanData method.
 *
 * Repeatedly trims the string, removes one trailing ',', '/', ';' or ':'
 * (together with any whitespace before it), removes a trailing period when
 * the text before it qualifies (see the closure below) and removes outer
 * square brackets, until a pass no longer changes anything.
 *
 * @param string $str String to process.
 *
 * @return string Processed string.
 */
public static function solrMarcStyleCleanData(string $str): string
{
    // Decide whether the final period of a string should be removed.
    $needsPeriodStripping = static function (string $strToCheck): bool {
        $noStrippingRegex = [
            '/.*[JS]r\.$/', // don't strip period off of Jr. or Sr.
        ];
        $strippingRegex = [
            '/.*\w\w\.$/',
            // The "u" modifier is required so that \p{L} is matched against
            // whole UTF-8 characters rather than individual bytes; without it
            // a period following multi-byte letters (e.g. "café.") is kept.
            '/.*\p{L}\p{L}\.$/u',
            // The following regex is unsupported by PHP but retained for reference:
            //'/.*\w\p{InCombiningDiacriticalMarks}?\w\p{InCombiningDiacriticalMarks}?\.$/u',
            '/.*\p{P}\.$/u',
        ];
        // Exceptions take precedence over the stripping rules.
        foreach ($noStrippingRegex as $regex) {
            if (preg_match($regex, $strToCheck)) {
                return false;
            }
        }
        foreach ($strippingRegex as $regex) {
            if (preg_match($regex, $strToCheck)) {
                return true;
            }
        }
        return false;
    };

    $current = $str;
    // Each pass can expose new trailing characters (e.g. "title. /"), so keep
    // going until the string stops changing.
    do {
        $previous = $current;
        $current = trim($current);
        $current = preg_replace('|\s*([,/;:])$|', '', $current);
        if (str_ends_with($current, '.')) {
            if ($needsPeriodStripping($current)) {
                $current = mb_substr($current, 0, mb_strlen($current, 'UTF-8') - 1, 'UTF-8');
            }
        }
        $current = static::removeOuterBrackets($current);
        if (strlen($current) === 0) {
            return $current;
        }
    } while ($current !== $previous);
    return $current;
}

/**
 * Build a title_sort value in roughly the same way as SolrMarc's
 * titleSortLower feature, so that titles are indexed consistently.
 *
 * The steps are applied in this order: accent stripping, punctuation
 * stripping, SolrMarc-style data cleaning, lowercasing.
 *
 * @param string $str String to process.
 *
 * @return string Processed string.
 */
public static function titleSortLower(string $str): string
{
    $withoutAccents = static::stripAccents($str);
    $withoutPunctuation = static::stripPunctuation($withoutAccents);
    $cleaned = static::solrMarcStyleCleanData($withoutPunctuation);
    return mb_strtolower($cleaned, 'UTF-8');
}

/**
* Convert provided nodes into XML and return as text. This is useful for
* populating the fullrecord field with the raw input XML.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,21 +41,59 @@
*/
class AlphabrowseTest extends \VuFindTest\Integration\MinkTestCase
{
/**
 * Data provider for testTitleSearchNormalization
 *
 * Each named data set is a pair: the alphabrowse query to submit and the
 * title expected at the top of the result list.
 *
 * Declared static because newer PHPUnit versions require static data
 * providers (older versions accept them as well).
 *
 * @return array
 */
public static function titleSearchNormalizationProvider(): array
{
    return [
        'bracket stripping' => ['[arithmetic facts]', 'Arithmetic Facts'],
        'multi-bracket stripping' => ['[[[[[arithmetic facts]]]]]', 'Arithmetic Facts'],
        'accent stripping' => ['arithmétic facts', 'Arithmetic Facts'],
        'punctuation collapsing' => ['arithmetic facts /:/:', 'Arithmetic Facts'],
        // The query must contain a run of several spaces, otherwise there is
        // nothing to collapse and the case does not test anything.
        'whitespace collapsing' => ['arithmetic      facts', 'Arithmetic Facts'],
    ];
}

/**
 * Check that a title alphabrowse query is normalized before lookup: submit
 * the query through the browse form and compare the first title shown.
 *
 * @param string $query              Alphabrowse query to perform
 * @param string $expectedFirstTitle Expected first title in result list
 *
 * @return void
 *
 * @dataProvider titleSearchNormalizationProvider
 */
public function testTitleSearchNormalization($query, $expectedFirstTitle): void
{
    $mink = $this->getMinkSession();
    $mink->visit($this->getVuFindUrl() . '/Alphabrowse/Home');
    $browsePage = $mink->getPage();

    // Fill in the browse form: source first, then the starting point.
    $formValues = [
        '#alphaBrowseForm_source' => 'title',
        '#alphaBrowseForm_from' => $query,
    ];
    foreach ($formValues as $selector => $value) {
        $this->findCssAndSetValue($browsePage, $selector, $value);
    }

    // Submit and wait for the result list.
    $this->clickCss($browsePage, '#alphaBrowseForm .btn-primary');
    $this->waitForPageLoad($browsePage);

    $firstTitle = $this->findCss($browsePage, 'table.alphabrowse td.title')->getText();
    $this->assertEquals($expectedFirstTitle, $firstTitle);
}

/**
 * Test that extra attributes are escaped correctly.
 *
 * Opens an LCC alphabrowse page starting at call number
 * "PS3552.R878 T47 2011", reads the text of the cell located via the
 * 'table.alphabrowse td.lcc ~ td' selector and checks that it contains the
 * literal text '<HTML> The Basics'.
 *
 * @return void
 */
public function testExtraAttributeEscaping()
public function testExtraAttributeEscaping(): void
{
$session = $this->getMinkSession();
$session->visit($this->getVuFindUrl() . '/Alphabrowse/Home?source=lcc&from=PS3552.R878+T47+2011');
$page = $session->getPage();
$extras = $this->findCss($page, 'table.alphabrowse td.lcc ~ td');
$text = $extras->getText();
$this->assertTrue(
str_contains($text, '<HTML> The Basics'),
"Could not find '<HTML> The Basics' in '$text'"
);
$this->assertStringContainsString('<HTML> The Basics', $text);
}
}