Skip to content

Commit

Permalink
Add Wikidata language qualified string hack.
Browse files Browse the repository at this point in the history
  • Loading branch information
CraigMiloRogers committed Sep 2, 2020
1 parent 3b506aa commit 6f50cbb
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 3 deletions.
11 changes: 8 additions & 3 deletions kgtk/value/kgtkvalue.py
Original file line number Diff line number Diff line change
Expand Up @@ -1081,6 +1081,7 @@ def is_boolean(self, validate: bool = False)->bool:
# with a country code or dialect name suffix after the language code.
lax_language_qualified_string_re: typing.Pattern = re.compile(r"^'(?P<text>.*)'@(?P<lang_suffix>(?P<lang>[a-zA-Z]{2,3})(?P<suffix>-[a-zA-Z0-9]+)?)$")
# Strict form: same language code and suffix rules as the lax pattern, but
# the text may not contain a bare single quote or a trailing lone backslash;
# each backslash must be followed by exactly one escaped character.
strict_language_qualified_string_re: typing.Pattern = re.compile(r"^'(?P<text>(?:[^'\\]|\\.)*)'@(?P<lang_suffix>(?P<lang>[a-zA-Z]{2,3})(?P<suffix>-[a-zA-Z0-9]+)?)$")
# Wikidata form: same text rules as the strict pattern, but the language code
# may be two or more letters (no upper bound) and the suffix may itself
# contain additional hyphens after the leading one.
wikidata_language_qualified_string_re: typing.Pattern = re.compile(r"^'(?P<text>(?:[^'\\]|\\.)*)'@(?P<lang_suffix>(?P<lang>[a-zA-Z]{2,})(?P<suffix>-[-a-zA-Z0-9]+)?)$")

def is_language_qualified_string(self, validate: bool=False)->bool:
"""
Expand All @@ -1103,13 +1104,17 @@ def is_language_qualified_string(self, validate: bool=False)->bool:
# Validate the language qualified string.
# print("checking %s" % self.value)
m: typing.Optional[typing.Match]
if self.options.allow_lax_lq_strings:
if self.options.allow_wikidata_lq_strings:
m = KgtkValue.wikidata_language_qualified_string_re.match(self.value)
elif self.options.allow_lax_lq_strings:
m = KgtkValue.lax_language_qualified_string_re.match(self.value)
else:
m = KgtkValue.strict_language_qualified_string_re.match(self.value)
if m is None:
if self.verbose:
if self.options.allow_lax_lq_strings:
if self.options.allow_wikidata_lq_strings:
print("KgtkValue.wikidata_language_qualified_string_re.match failed for %s" % self.value, file=self.error_file, flush=True)
elif self.options.allow_lax_lq_strings:
print("KgtkValue.lax_language_qualified_string_re.match failed for %s" % self.value, file=self.error_file, flush=True)
else:
print("KgtkValue.strict_language_qualified_string_re.match failed for %s" % self.value, file=self.error_file, flush=True)
Expand All @@ -1121,7 +1126,7 @@ def is_language_qualified_string(self, validate: bool=False)->bool:
# print("lang_and_suffix: %s" % lang_and_suffix)

# Validate the language code:
if not LanguageValidator.validate(lang_and_suffix.lower(), options=self.options):
if not self.options.allow_wikidata_lq_strings and not LanguageValidator.validate(lang_and_suffix.lower(), options=self.options):
if self.verbose:
print("language validation failed for %s" % self.value, file=self.error_file, flush=True)
self.valid = False
Expand Down
9 changes: 9 additions & 0 deletions kgtk/value/kgtkvalueoptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ class KgtkValueOptions:
# check if internal single quotes are escaped by backslash.
allow_lax_lq_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)

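# Permit Wikidata language qualifier extensions: language codes longer than
# three letters and suffixes with multiple hyphen-separated parts. When set,
# this takes precedence over allow_lax_lq_strings and the language code is
# not checked by LanguageValidator.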
allow_wikidata_lq_strings: bool = attr.ib(validator=attr.validators.instance_of(bool), default=False)

allow_language_suffixes: bool = attr.ib(validator=attr.validators.instance_of(bool), default=True)

# If this list gets long, we may want to turn it into a map to make lookup
Expand Down Expand Up @@ -172,6 +175,10 @@ def d(default: typing.Any)->typing.Mapping[str, typing.Any]:
help=h(prefix3 + "Do not check if single quotes are backslashed inside language qualified strings. (default=%(default)s)."),
type=optional_bool, nargs='?', const=True, **d(default=False))

vgroup.add_argument( prefix1 + "allow-wikidata-lq-strings", dest=prefix2 + "allow_wikidata_lq_strings",
help=h(prefix3 + "Allow Wikidata language qualifiers. (default=%(default)s)."),
type=optional_bool, nargs='?', const=True, **d(default=False))

vgroup.add_argument( prefix1 + "require-iso8601-extended", dest=prefix2 + "require_iso8601_extended",
help=h(prefix3 + "Require colon(:) and hyphen(-) in dates and times. (default=%(default)s)."),
type=optional_bool, nargs='?', const=True, **d(default=False))
Expand Down Expand Up @@ -282,6 +289,7 @@ def from_dict(cls, d: dict, who: str = "")->'KgtkValueOptions':
allow_language_suffixes=d.get(prefix + "allow_language_suffixes", True),
allow_lax_strings=d.get(prefix + "allow_lax_strings", False),
allow_lax_lq_strings=d.get(prefix + "allow_lax_lq_strings", False),
allow_wikidata_lq_strings=d.get(prefix + "allow_wikidata_lq_strings", False),
additional_language_codes=d.get(prefix + "additional_language_codes", None),

require_iso8601_extended=d.get(prefix + "require_iso8601_extended", False),
Expand Down Expand Up @@ -324,6 +332,7 @@ def show(self, who: str="", out: typing.TextIO=sys.stderr):
print("%sallow-language-suffixes=%s" % (prefix, str(self.allow_language_suffixes)), file=out)
print("%sallow-lax-strings=%s" % (prefix, str(self.allow_lax_strings)), file=out)
print("%sallow-lax-lq-strings=%s" % (prefix, str(self.allow_lax_lq_strings)), file=out)
print("%sallow-wikidata-lq-strings=%s" % (prefix, str(self.allow_wikidata_lq_strings)), file=out)
if self.additional_language_codes is not None:
print("%sadditional-language-codes=%s" % (prefix, " ".join(self.additional_language_codes)), file=out)

Expand Down

0 comments on commit 6f50cbb

Please sign in to comment.