Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor ItemsRule on merging #521

Merged
merged 17 commits into from
Apr 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,20 @@ trait WorksGenerators
/** Creates an unidentified Sierra work by calling
  * `createUnidentifiedSierraWorkWith`, passing a one-element items list
  * containing the result of `createPhysicalItem`.
  */
def createSierraPhysicalWork: UnidentifiedWork =
createUnidentifiedSierraWorkWith(items = List(createPhysicalItem))

/** Creates a physical Sierra work together with a second Sierra work that
  * plays the role of its digitised copy.
  *
  * The physical work's merge candidates are replaced with a single
  * `MergeCandidate` whose identifier is the second work's source identifier
  * and whose reason is "Physical/digitised Sierra work".
  *
  * @return a pair of (physical work carrying the merge candidate,
  *         the work the merge candidate points at)
  */
def createSierraWorkWithDigitisedMergeCandidate
  : (UnidentifiedWork, UnidentifiedWork) = {
  val physicalSierraWork = createSierraPhysicalWork
  val digitisedCopyOfSierraWork = createUnidentifiedSierraWork

  // The link from the physical work to its digitised counterpart.
  val digitisedMergeCandidate = MergeCandidate(
    identifier = digitisedCopyOfSierraWork.sourceIdentifier,
    reason = Some("Physical/digitised Sierra work")
  )

  val physicalSierraWorkWithMergeCandidate = physicalSierraWork.copy(
    data = physicalSierraWork.data.copy(
      mergeCandidates = List(digitisedMergeCandidate)))

  (physicalSierraWorkWithMergeCandidate, digitisedCopyOfSierraWork)
}

/** Convenience wrapper: calls `createSierraDigitalWorkWith()` without
  * passing any explicit arguments.
  */
def createSierraDigitalWork: UnidentifiedWork =
createSierraDigitalWorkWith()

Expand Down
Binary file added docs/.gitbook/assets/merger_linking_works.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 2 additions & 1 deletion docs/SUMMARY.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,6 @@
- [Fetching records from Sierra](adapters/fetching_records_from_sierra.md)
- [Sierra](sierra/README.md)
- [Sierra IDs](sierra/sierra_ids.md)
- [Pipeline](pipeline.md)
- [Pipeline](pipeline/README.md)
- [Merging](pipeline/merging.md)
- [APM](apm.md)
23 changes: 23 additions & 0 deletions docs/pipeline/merging.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Merging

## The process
Merging works is split up into 2 steps:

### 1. Linking works

Using features from the source data to calculate which works are linked.

e.g.:
* the `BNumber` from a Calm record
* MARC field `776$w` linking a Sierra record to its digitised counterpart

This is carried out by the transformers for the respective sources' data.

#### How are works currently linked?
![How works are currently linked](../.gitbook/assets/merger_linking_works.png)
https://excalidraw.com/#json=5964037271584768,ojtCECzrMrgSJuBp3VCvHw


### 2. Merging linked works
TBD.

Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@ package uk.ac.wellcome.platform.merger.rules
import uk.ac.wellcome.models.work.internal._
import uk.ac.wellcome.platform.merger.logging.MergerLogging
import uk.ac.wellcome.platform.merger.models.FieldMergeResult
import uk.ac.wellcome.platform.merger.rules.WorkPredicates.{
WorkPredicate,
WorkPredicateOps
}

import uk.ac.wellcome.platform.merger.rules.WorkPredicates.WorkPredicate
import cats.data.NonEmptyList

/*
Expand All @@ -23,28 +19,50 @@ object ItemsRule extends FieldMergeRule with MergerLogging {

override def merge(
target: UnidentifiedWork,
sources: Seq[TransformedBaseWork]): FieldMergeResult[FieldData] =
sources: Seq[TransformedBaseWork]): FieldMergeResult[FieldData] = {
val rules =
List(mergeCalmItems, mergeMetsItems, mergeMiroItems)

val items = mergeCalmItems(target, sources)
.orElse(mergeMetsItems(target, sources))
.orElse(mergeMiroItems(target, sources))
.getOrElse(target.data.items)

val mergedSources = sources.filter { source =>
rules.exists(_(target, source).isDefined)
} ++ getDigitisedCopiesOfSierraWork(target, sources)

FieldMergeResult(
data = mergeItems(target, sources),
sources = sources.filter { source =>
(mergeMetsItems(target, source) orElse mergeMiroPhysicalAndDigitalItems(
target,
source)).isDefined
}
data = items,
sources = mergedSources
)
}

private def mergeItems(target: UnidentifiedWork,
sources: Seq[TransformedBaseWork]): FieldData = {
// TODO: the merging behaviour here is temporary until jtweed confirms the
// exact rules
val mergedTarget = mergeCalmItems(target, sources)
.map(items => target.withData(_.copy(items = items)))
.getOrElse(target)
mergeMetsItems(mergedTarget, sources)
.orElse(mergeMiroPhysicalAndDigitalItems(mergedTarget, sources))
.getOrElse(mergedTarget.data.items)
/** Finds the sources that are digitised copies of a physical Sierra target.
  *
  * The link is recorded on the target as a merge candidate with the reason
  * "Physical/digitised Sierra work"; see
  * uk.ac.wellcome.platform.transformer.sierra.transformers.SierraMergeCandidates.get776mergeCandidates
  *
  * For each such merge candidate, in order, every source whose
  * `sourceIdentifier` equals the candidate's identifier is returned.
  */
private def getDigitisedCopiesOfSierraWork(
  target: UnidentifiedWork,
  sources: Seq[TransformedBaseWork]): Seq[TransformedBaseWork] =
  for {
    candidate <- target.data.mergeCandidates
    if candidate.reason.contains("Physical/digitised Sierra work")
    digitisedSource <- sources
    if digitisedSource.sourceIdentifier == candidate.identifier
  } yield digitisedSource

/**
* If there is 1 Sierra item, we replace the `PhysicalLocation` with
* the one from the Calm record.
*
* Otherwise we add the item to the items list.
*
* This logic is going to be removed soon as we make the Calm
* record the `target`.
*/
private val mergeCalmItems = new PartialRule {
val isDefinedForTarget: WorkPredicate = WorkPredicates.sierraWork
val isDefinedForSource: WorkPredicate = WorkPredicates.calmWork
Expand All @@ -61,16 +79,24 @@ object ItemsRule extends FieldMergeRule with MergerLogging {
case List(sierraItem) =>
List(
sierraItem.copy(
locations = calmLocation :: sierraItem.locations.collect {
case location: DigitalLocation => location
}
// We remove any `PhysicalLocation`s and add the calmLocation
// as the only `PhysicalLocation`
locations = calmLocation :: sierraItem.locations.filter(_ match {
case _: PhysicalLocation => false
case _ => true
})
)
)
case items => calmItem :: items
case multipleItems => calmItem :: multipleItems
}
}
}

/**
* If there is 1 Sierra item, we add the location from the METS record
* else we just append METS items as we wouldn't know which item to
* augment.
*/
private val mergeMetsItems = new PartialRule {
val isDefinedForTarget: WorkPredicate = WorkPredicates.sierraWork
val isDefinedForSource: WorkPredicate = WorkPredicates.singleItemDigitalMets
Expand All @@ -79,51 +105,46 @@ object ItemsRule extends FieldMergeRule with MergerLogging {
sources: NonEmptyList[TransformedBaseWork]): FieldData = {
val sierraItems = target.data.items
val metsItems = sources.toList.flatMap(_.data.items)
val metsUrls = metsItems.flatMap(_.locations).collect {
case DigitalLocation(url, _, _, _, _, _) => url
}

debug(s"Merging METS items from ${describeWorks(sources)}")
sierraItems match {
case List(sierraItem) =>
List(
sierraItem.copy(
locations = sierraItem.locations.filterNot(hasUrl(metsUrls)) ++
metsItems.flatMap(_.locations)
locations = sierraItem.locations ++ metsItems.flatMap(_.locations)
)
)
case _ =>
sierraItems.filterNot(_.locations.exists(hasUrl(metsUrls))) ++
metsItems
case multipleItems =>
multipleItems ++ metsItems
}
}
}

private val mergeMiroPhysicalAndDigitalItems = new PartialRule {
val isDefinedForTarget: WorkPredicate = WorkPredicates.sierraWork
val isDefinedForSource
: WorkPredicate = WorkPredicates.miroWork or WorkPredicates.digitalSierra
/** We merge the Miro location to the Sierra work with a single item
* as we assume that this is the only item that the Miro image
* could be associated with.
*
* If we have multiple items, we assume it is definitely associated with
* one of the Sierra items, but unsure of which. We thus don't append it
* to the Sierra items to avoid certain duplication, and leave the works
* unmerged.
*/
private val mergeMiroItems = new PartialRule {
val isDefinedForTarget: WorkPredicate = WorkPredicates.singleItemSierra
val isDefinedForSource: WorkPredicate = WorkPredicates.miroWork

def rule(target: UnidentifiedWork,
sources: NonEmptyList[TransformedBaseWork]): FieldData =
(target.data.items, sources.toList.partition(WorkPredicates.miroWork)) match {
case (List(sierraSingleItem), (miroSources, sierraSources)) =>
List(
sierraSingleItem.copy(
locations = sierraSingleItem.locations ++
(sierraSources ++ miroSources).flatMap(
_.data.items.flatMap(_.locations)
)
)
)
case (multipleSierraItems, (_, sierraSources)) =>
multipleSierraItems ++ sierraSources.flatMap(_.data.items)
}
}
sources: NonEmptyList[TransformedBaseWork]): FieldData = {
// This is safe due to the `singleItemSierra` predicate
val sierraItem = target.data.items.head
val miroItems = sources.toList.flatMap(_.data.items)

private def hasUrl(matchUrls: Seq[String])(location: Location) =
location match {
case DigitalLocation(url, _, _, _, _, _) if matchUrls.contains(url) =>
true
case _ => false
debug(s"Merging Miro items from ${describeWorks(sources)}")

List(
sierraItem.copy(
locations = sierraItem.locations ++ miroItems.flatMap(_.locations)
))
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ object OtherIdentifiersRule extends FieldMergeRule with MergerLogging {

private val physicalDigitalIdsRule = new PartialRule {
val isDefinedForTarget: WorkPredicate = WorkPredicates.physicalSierra
val isDefinedForSource: WorkPredicate = WorkPredicates.digitalSierra
val isDefinedForSource: WorkPredicate = WorkPredicates.sierraWork

override def rule(
target: UnidentifiedWork,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,6 @@ object WorkPredicates {
val physicalSierra: WorkPredicate =
satisfiesAll(sierraWork, physicalLocationExists)

val digitalSierra: WorkPredicate =
satisfiesAll(sierraWork, singleItem, allDigitalLocations)

val sierraPicture: WorkPredicate =
satisfiesAll(sierraWork, workType(WorkType.Pictures))

Expand Down
Loading