Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename all instances of versus #20

Merged
Merged 1 commit on Feb 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
26 changes: 13 additions & 13 deletions src/sutta_processor/application/check_service/bd_reference.py
Expand Up @@ -13,10 +13,10 @@
)
from sutta_processor.application.domain_models.base import BaseFileAggregate
from sutta_processor.application.domain_models.bilara_concordance.root import (
ConcordanceVersus,
ConcordanceVerses,
)
from sutta_processor.application.domain_models.bilara_reference.root import (
ReferenceVersus,
ReferenceVerses,
)
from sutta_processor.application.value_objects import UID, BaseTextKey, MsId
from sutta_processor.shared.config import Config
Expand Down Expand Up @@ -182,28 +182,28 @@ def get_surplus_ref(c: Counter) -> dict:
@classmethod
def get_references_stem(cls, reference: BilaraReferenceAggregate) -> list:
stems = set()
for verse in reference.index.values(): # type: ReferenceVersus
for verse in reference.index.values(): # type: ReferenceVerses
stems.update((v.reference_root for v in verse.references))
reference_stems = list(sorted(stems))
log.error("Reference stems: %s", reference_stems)
return reference_stems

def get_wrong_pts_cs_no(self, reference: BilaraReferenceAggregate):
ignore = {"pts-cs75", "pts-cs1.10", "pts-cs7", "pts-cs8", "pts-cs12"}
for versus in reference.index.values(): # type: ReferenceVersus
if not versus.uid.key.raw.startswith("dn"):
for verses in reference.index.values(): # type: ReferenceVerses
if not verses.uid.key.raw.startswith("dn"):
continue
elif not versus.references.pts_cs:
elif not verses.references.pts_cs:
# No pts_cs reference in the reference list so skip it
continue
elif versus.references.pts_cs in ignore:
elif verses.references.pts_cs in ignore:
continue
if not versus.uid.key.seq.raw.startswith(versus.references.pts_cs.pts_no):
if not verses.uid.key.seq.raw.startswith(verses.references.pts_cs.pts_no):
log.error(
"[%s] wrong uid '%s' for pts_cs number: %s",
self.name,
versus.uid,
versus.references.pts_cs,
verses.uid,
verses.references.pts_cs,
)

def get_missing_ms_id_from_reference(self, aggregate: YuttaAggregate):
Expand Down Expand Up @@ -264,7 +264,7 @@ def update_ref_based_on_html_uids(
# continue
# else:
# log.error("Empty index for file: %s", f_aggr.f_pth)
# f_aggr.index[concordance_verse.uid] = ReferenceVersus(
# f_aggr.index[concordance_verse.uid] = ReferenceVerses(
# raw_uid=concordance_verse.raw_uid,
# verse=",".join(concordance_verse.references),
# )
Expand Down Expand Up @@ -362,8 +362,8 @@ def match_uid():
@classmethod
def get_wrong_segments_based_on_nya(cls, reference: BilaraReferenceAggregate):
wrong_keys = set()
for uid, ref_versus in reference.index.items():
nya_id = ref_versus.references.nya
for uid, ref_verses in reference.index.items():
nya_id = ref_verses.references.nya
if not nya_id:
continue
is_uid_ok = nya_id == f"nya{uid.key.seq[0]}"
Expand Down
42 changes: 21 additions & 21 deletions src/sutta_processor/application/check_service/check.py
Expand Up @@ -12,7 +12,7 @@
BilaraTranslationAggregate,
BilaraVariantAggregate,
)
from sutta_processor.application.domain_models.base import BaseRootAggregate, BaseVersus
from sutta_processor.application.domain_models.base import BaseRootAggregate, BaseVerses
from sutta_processor.application.value_objects.uid import UID, UidKey
from sutta_processor.shared.config import Config

Expand Down Expand Up @@ -85,10 +85,10 @@ def is_ignored(uid: UID) -> bool:
def is_0_in_header_uid(self, aggregate: BilaraHtmlAggregate) -> Set[UID]:
error_uids = set()
prog = re.compile(r"<h\d")
for uid, versus in aggregate.index.items():
for uid, verses in aggregate.index.items():
if uid in self.cfg.exclude.headers_without_0:
continue
elif prog.match(versus.verse) and 0 not in uid.key.seq:
elif prog.match(verses.verse) and 0 not in uid.key.seq:
omg = "[%s] Possible header not starting the section: '%s'"
log.error(omg, self.name, uid)
error_uids.add(uid)
Expand Down Expand Up @@ -138,8 +138,8 @@ def get_wrong_uid_with_arrow(
) -> Set[UID]:
missing_word_keys = set()

for uid, versus in aggregate.index.items():
word, *rest = versus.verse.split("→")
for uid, verses in aggregate.index.items():
word, *rest = verses.verse.split("→")
if not rest:
continue
word, *_ = word.split('…')
Expand All @@ -165,8 +165,8 @@ def get_wrong_uid_with_arrow(

def get_unknown_variants(self, aggregate: BilaraVariantAggregate) -> Set[UID]:
unknown_keys = set()
for uid, versus in aggregate.index.items():
word, *rest = versus.verse.split("→")
for uid, verses in aggregate.index.items():
word, *rest = verses.verse.split("→")
if rest or uid in self.cfg.exclude.get_unknown_variants:
continue
unknown_keys.add(uid)
Expand Down Expand Up @@ -269,25 +269,25 @@ def check_uid_sequence_in_file(self, aggregate: BilaraRootAggregate):
msg = "[%s] There are '%s' sequence key errors: %s"
log.error(msg, self.name, len(error_keys), error_keys)

def get_duplicated_versus_next_to_each_other(
def get_duplicated_verses_next_to_each_other(
self, aggregate: BilaraRootAggregate
) -> set:
error_keys = set()
prev_versus = ""
for uid, versus in aggregate.index.items(): # type: UID, BaseVersus
verse = versus.verse.strip()
prev_verses = ""
for uid, verses in aggregate.index.items(): # type: UID, BaseVerses
verse = verses.verse.strip()
if not verse:
continue
if (
verse == prev_versus
and uid not in self.cfg.exclude.get_duplicated_versus_next_to_each_other
verse == prev_verses
and uid not in self.cfg.exclude.get_duplicated_verses_next_to_each_other
):
error_keys.add(uid)
msg = "[%s] Same versus next to each other. '%s': '%s'"
msg = "[%s] Same verses next to each other. '%s': '%s'"
log.error(msg, self.name, uid, verse)
prev_versus = verse
prev_verses = verse
if error_keys:
msg = "[%s] There are '%s' duplicated versus error"
msg = "[%s] There are '%s' duplicated verses error"
log.error(msg, self.name, len(error_keys))
msg = "[%s] dupes UIDs: %s"
log.error(msg, self.name, sorted(error_keys))
Expand All @@ -297,15 +297,15 @@ def get_empty_verses(self, aggregate: BilaraRootAggregate) -> set:
error_keys = set()
pattern = r"(\(\s\)|^\s$)"
prog = re.compile(pattern)
for uid, versus in aggregate.index.items(): # type: UID, BaseVersus
result = prog.match(versus.verse)
for uid, verses in aggregate.index.items(): # type: UID, BaseVerses
result = prog.match(verses.verse)
if result:
error_keys.add(uid)
msg = "[%s] Key has blank value: '%s': '%s'"
log.error(msg, self.name, uid, versus.verse)
log.error(msg, self.name, uid, verses.verse)

if error_keys:
msg = "[%s] There are '%s' blank versus error"
msg = "[%s] There are '%s' blank verses error"
log.error(msg, self.name, len(error_keys))
msg = "[%s] blank UIDs: %s"
log.error(msg, self.name, sorted(error_keys))
Expand All @@ -331,7 +331,7 @@ def get_unordered_segments(self, aggregate: BaseRootAggregate):


class SequenceCheck(ServiceBase):
def get_unordered_segments(self, index: Dict[UID, BaseVersus]) -> Set[UID]:
def get_unordered_segments(self, index: Dict[UID, BaseVerses]) -> Set[UID]:
wrong_uid = set()
previous = UidKey(":0-0")
for uid in index:
Expand Down
50 changes: 25 additions & 25 deletions src/sutta_processor/application/check_service/text_check.py
Expand Up @@ -8,8 +8,8 @@
BilaraRootAggregate,
YuttaAggregate,
)
from sutta_processor.application.domain_models.base import BaseVersus
from sutta_processor.application.domain_models.ms_yuttadhammo.base import YuttaVersus
from sutta_processor.application.domain_models.base import BaseVerses
from sutta_processor.application.domain_models.ms_yuttadhammo.base import YuttaVerses
from sutta_processor.application.value_objects import (
UID,
BaseUID,
Expand All @@ -20,7 +20,7 @@
)
from sutta_processor.shared.exceptions import NoTokensError

from ..domain_models.bilara_root.root import Versus
from ..domain_models.bilara_root.root import Verses
from .base import ServiceBase
from .bd_reference import SCReferenceService
from .tokenizer import VersetTokenizer
Expand All @@ -43,16 +43,16 @@ class TextMatcher:

def __init__(self, root: BilaraRootAggregate, pali: YuttaAggregate):
def get_unmatched_root_index() -> Dict[RootUID, RootUidTokens]:
index_combined: Dict[RootUID, List[Versus]] = defaultdict(list)
index_combined: Dict[RootUID, List[Verses]] = defaultdict(list)
index_text_combined: Dict[RootUID, RootUidTokens] = {}
for uid, versus in root.index.items():
for uid, verses in root.index.items():
if 0 in uid.key.seq:
continue
index_combined[uid.root].append(versus)
index_combined[uid.root].append(verses)

for root_uid, versus_list in index_combined.items():
for root_uid, verses_list in index_combined.items():
try:
txt = " ".join((versus.verse for versus in versus_list))
txt = " ".join((verses.verse for verses in verses_list))
tokens = VersetTokenizer.get_tokens(txt)
index_text_combined[root_uid] = RootUidTokens(root_uid, tokens)
except NoTokensError as e:
Expand All @@ -70,41 +70,41 @@ def get_unmatched_root_index() -> Dict[RootUID, RootUidTokens]:

def get_missing_root_text_from_ms(self) -> set:

for i, versus in enumerate(self.pali.index.values()):
# ms_id, versus = item # type: MsId, YuttaVersus
for i, verses in enumerate(self.pali.index.values()):
# ms_id, verses = item # type: MsId, YuttaVerses
# if "ms25Cn_738" not in uids:
# continue
if i > 30:
break
try:
self.process_yutta_verse(i=i, versus=versus)
self.process_yutta_verse(i=i, verses=verses)
except Exception as e:
log.exception(e)
log.error("-" * 80)
log.error("-" * 80)
return self.wrong_keys

def process_yutta_verse(self, i: int, versus: YuttaVersus):
def process_yutta_verse(self, i: int, verses: YuttaVerses):
def get_ratio():
ratio_map = {}
matcher = SequenceMatcher()
matcher.set_seq1(versus.verse.tokens)
matcher.set_seq1(verses.verse.tokens)
for root_tokens in self.roots_uid_tokens_index.values():
matcher.set_seq2(root_tokens.tokens)
ratio = matcher.quick_ratio()
if ratio > 0.7:
omg = "Ratio for yt: '%s' root: '%s', ratio: %s"
log.error(omg, versus.ms_id, root_tokens.uid, ratio)
log.error(omg, verses.ms_id, root_tokens.uid, ratio)
ratio_map[root_tokens.uid] = ratio
if not ratio_map:
omg = "Couldn't find a match ms_uid: '%s' tokes: %s"
log.error(omg, versus.ms_id, versus.verse.tokens)
log.error(omg, verses.ms_id, verses.verse.tokens)
return ratio_map

self.ratios: Dict[MsId, Dict[RootUID, float]] = defaultdict(dict)
self.c["all"] += 1
try:
self.ratios[versus.ms_id] = get_ratio()
self.ratios[verses.ms_id] = get_ratio()
except Exception as e:
log.exception(e)
return
Expand Down Expand Up @@ -161,13 +161,13 @@ def is_key_missing(uid_: UID) -> bool:
return True
return False

for uid, versus in root.index.items():
for uid, verses in root.index.items():
if is_skipped(uid) or is_key_missing(uid):
continue

c["all"] += 1
ms_id = self.reference.reference_engine.uid_index[uid]
root_tokens = versus.verse.tokens
root_tokens = verses.verse.tokens
pali_tokens = pali.index[ms_id].verse.tokens
if root_tokens != pali_tokens:
omg = (
Expand All @@ -178,7 +178,7 @@ def is_key_missing(uid_: UID) -> bool:
omg,
self.name,
uid,
versus.verse,
verses.verse,
pali.index[ms_id].verse,
root_tokens,
pali_tokens,
Expand Down Expand Up @@ -221,15 +221,15 @@ def get_missing_text_ms_source(
) -> Set[UID]:
wrong_keys = set()

def get_root_versus(ms_id) -> Optional[BaseVersus]:
def get_root_verses(ms_id) -> Optional[BaseVerses]:
root_ids: set = self.reference.reference_engine.ms_id_index.get(ms_id)
if not root_ids:
c["missing_in_reference"] += 1
return None
if len(root_ids) != 1:
omg = f"MsId '{ms_id}' referencing more than one segment id:{root_ids}"
raise RuntimeError(omg)
root_vers: BaseVersus = root.index[root_ids.pop()]
root_vers: BaseVerses = root.index[root_ids.pop()]
return root_vers

c: Counter = Counter(
Expand All @@ -251,8 +251,8 @@ def get_root_versus(ms_id) -> Optional[BaseVersus]:
# TODO: Make multi id compilant
ms_id = uids.pop()
try:
root_versus = get_root_versus(ms_id=ms_id)
if root_versus and "EMPTY" in root_versus.verse.tokens.head_key:
root_verses = get_root_verses(ms_id=ms_id)
if root_verses and "EMPTY" in root_verses.verse.tokens.head_key:
# TODO: Handle empty tokens (do more validation with that)
c["all"] -= 1
continue
Expand All @@ -278,12 +278,12 @@ def print_verse_details(
self, ms_id: MsId, root: BilaraRootAggregate, pali: YuttaAggregate,
):
def print_details(uid_):
root_vers: BaseVersus = root.index[uid_]
root_vers: BaseVerses = root.index[uid_]
log.error("Root verset: '%s'", root_vers.verse)
log.error("Root tokens: '%s'", root_vers.verse.tokens.head_key)

log.error(f"{'=' * 40} %s {'=' * 40}", ms_id)
pali_vers: YuttaVersus = pali.index[ms_id]
pali_vers: YuttaVerses = pali.index[ms_id]
log.error("Pali verset: '%s'", pali_vers.verse)
log.error("Pali tokens: '%s'", pali_vers.verse.tokens.head_key)

Expand Down