diff --git a/src/somajo/sentence_splitter.py b/src/somajo/sentence_splitter.py index 0b95a72..9db5fd1 100644 --- a/src/somajo/sentence_splitter.py +++ b/src/somajo/sentence_splitter.py @@ -21,11 +21,11 @@ def __init__(self, is_tuple=False, language="de_CMC"): self.closing_punct = re.compile(r"^(?:['\"\p{Pf}\p{Pe}])$") # International quotes: «» “” ‹› ‘’ # German quotes: »« „“ ›‹ ‚‘ - self.problematic_quotes = set(['"']) + self.problematic_quotes = {'"'} if language == "de" or language == "de_CMC": # German opening quotes [»›] have category Pf # German closing quotes [“‘«‹] have category Pi - self.problematic_quotes = set(['"', "»", "«", "›", "‹", "“", "‘"]) + self.problematic_quotes = {'"', "»", "«", "›", "‹", "“", "‘"} self.eos_abbreviations = utils.read_abbreviation_file("eos_abbreviations.txt") # We match these via regular expressions because users could # call the split or split_xml methods with pretokenized input diff --git a/src/somajo/somajo.py b/src/somajo/somajo.py index bd41211..70df21c 100644 --- a/src/somajo/somajo.py +++ b/src/somajo/somajo.py @@ -31,9 +31,9 @@ class SoMaJo: """ - supported_languages = set(["de_CMC", "en_PTB"]) + supported_languages = {"de_CMC", "en_PTB"} _default_language = "de_CMC" - paragraph_separators = set(["empty_lines", "single_newlines"]) + paragraph_separators = {"empty_lines", "single_newlines"} _default_parsep = "empty_lines" def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None): diff --git a/src/somajo/token.py b/src/somajo/token.py index 1307cf3..0f57a46 100644 --- a/src/somajo/token.py +++ b/src/somajo/token.py @@ -29,7 +29,7 @@ class Token: """ - token_classes = set([ + token_classes = { "URL", "XML_entity", "XML_tag", @@ -48,7 +48,7 @@ class Token: "semester", "symbol", "time", - ]) + } def __init__(self, text, *, markup=False, markup_class=None, markup_eos=None, locked=False, token_class=None, space_after=True, original_spelling=None, first_in_sentence=False, last_in_sentence=False): self.text = text diff --git a/src/somajo/tokenizer.py b/src/somajo/tokenizer.py index 0ebc4e7..0c72981 100644 --- a/src/somajo/tokenizer.py +++ b/src/somajo/tokenizer.py @@ -14,7 +14,7 @@ class Tokenizer(): - _supported_languages = set(["de", "de_CMC", "en", "en_PTB"]) + _supported_languages = {"de", "de_CMC", "en", "en_PTB"} _default_language = "de_CMC" def __init__(self, split_camel_case=False, token_classes=False, extra_info=False, language="de_CMC"): @@ -131,53 +131,52 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False );""", re.VERBOSE | re.IGNORECASE) # EMOTICONS - emoticon_set = set(["(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:", - "(-:", ")=", ")o:", ")x", ":'C", ":/", - ":<", ":C", ":[", "=(", "=)", "=D", "=P", - ">:", "\\:", "]:", "x(", "^^", "o.O", - "\\O/", "\\m/", ":;))", "_))", "*_*", - "._.", ">_<", "*<:-)", ":!:", ":;-))", - "x'D", ":^)", "(>_<)", ":->", "\\o/", - "B-)", ":-$", "O:-)", "=-O", ":O", ":-!", - ":-x", ":-|", ":-\\", ":-[", ">:-(", - "^.^"]) + emoticon_set = {"(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:", + "(-:", ")=", ")o:", ")x", ":'C", ":/", ":<", + ":C", ":[", "=(", "=)", "=D", "=P", ">:", + "\\:", "]:", "x(", "^^", "o.O", "\\O/", + "\\m/", ":;))", "_))", "*_*", "._.", ">_<", + "*<:-)", ":!:", ":;-))", "x'D", ":^)", + "(>_<)", ":->", "\\o/", "B-)", ":-$", "O:-)", + "=-O", ":O", ":-!", ":-x", ":-|", ":-\\", + ":-[", ">:-(", "^.^"} # From https://textfac.es/ - textfaces_space = set(['⚆ _ ⚆', '˙ ͜ʟ˙', '◔ ⌣ ◔', '( ゚ヮ゚)', '(• ε •)', - '(づ ̄ ³ ̄)づ', '♪~ ᕕ(ᐛ)ᕗ', '\\ (•◡•) /', '( ಠ ͜ʖರೃ)', - '( ⚆ _ ⚆ )', '(▀̿Ĺ̯▀̿ ̿)', '༼ つ ◕_◕ ༽つ', '༼ つ ಥ_ಥ ༽つ', - '( ͡° ͜ʖ ͡°)', '( ͡°╭͜ʖ╮͡° )', '(╯°□°)╯︵ ┻━┻', - '( ͡ᵔ ͜ʖ ͡ᵔ )', '┬──┬ ノ( ゜-゜ノ)', '┬─┬ノ( º _ ºノ)', - '(ง ͠° ͟ل͜ ͡°)ง', '(͡ ͡° ͜ つ ͡͡°)', "﴾͡๏̯͡๏﴿ O'RLY?", - '(╯°□°)╯︵( .o.)', '(° ͡ ͜ ͡ʖ ͡ °)', '┬─┬ ︵ /(.□. )', - '(/) (°,,°) (/)', '| (• ◡•)| (❍ᴥ❍ʋ)', - '༼ つ ͡° ͜ʖ ͡° ༽つ', '(╯°□°)╯︵ ʞooqǝɔɐɟ', '┻━┻ ︵ヽ(`Д´)ノ︵ ┻━┻', - '┬┴┬┴┤ ͜ʖ ͡°) ├┬┴┬┴', '(ó ì_í)=óò=(ì_í ò)', - '(•_•) ( •_•)>⌐■-■ (⌐■_■)', '(ノ◕ヮ◕)ノ*:・゚✧ ✧゚・: *ヽ(◕ヮ◕ヽ)', - '[̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]', '/╲/\\╭( ͡° ͡° ͜ʖ ͡° ͡°)╮/\\╱\\', - '( ͡°( ͡° ͜ʖ( ͡° ͜ʖ ͡°)ʖ ͡°) ͡°)', '(._.) ( l: ) ( .-. ) ( :l ) (._.)', - "̿ ̿ ̿'̿'\\̵͇̿̿\\з=(•_•)=ε/̵͇̿̿/'̿'̿ ̿", '༼ ºل͟º ༼ ºل͟º ༼ ºل͟º ༽ ºل͟º ༽ ºل͟º ༽', - "̿'̿'\\̵͇̿̿\\з=( ͠° ͟ʖ ͡°)=ε/̵͇̿̿/'̿̿ ̿ ̿ ̿ ̿ ̿", - "̿̿ ̿̿ ̿̿ ̿'̿'\\̵͇̿̿\\з= ( ▀ ͜͞ʖ▀) =ε/̵͇̿̿/’̿’̿ ̿ ̿̿ ̿̿ ̿̿", - # From Signal: - "ヽ(°◇° )ノ", "■-■¬ <(•_•)"]) - textfaces_emoji = set(['♥‿♥', '☼.☼', '≧☉_☉≦', '(°ロ°)☝', '(☞゚∀゚)☞', '☜(˚▽˚)☞', '☜(⌒▽⌒)☞', - '(☞ຈل͜ຈ)☞', 'ヾ(⌐■_■)ノ♪', '(☞゚ヮ゚)☞', '☜(゚ヮ゚☜)']) - textfaces_wo_emoji = set(['=U', 'ಠ_ಠ', '◉_◉', 'ಥ_ಥ', ":')", 'ಠ⌣ಠ', 'ಠ~ಠ', 'ಠ_ಥ', 'ಠ‿↼', - 'ʘ‿ʘ', 'ಠoಠ', 'ರ_ರ', '◔̯◔', '¬_¬', 'ب_ب', '°Д°', '^̮^', '^̮^', '^̮^', - '>_>', '^̮^', '^̮^', 'ಠ╭╮ಠ', '(>ლ)', 'ʕ•ᴥ•ʔ', '(ಥ﹏ಥ)', '(ᵔᴥᵔ)', - '(¬‿¬)', '⌐╦╦═─', '(•ω•)', '(¬_¬)', '。◕‿◕。', '(ʘ‿ʘ)', '٩◔̯◔۶', - '(>人<)', '(~_^)', '(^̮^)', '(・.◤)', '(◕‿◕✿)', '。◕‿‿◕。', '(─‿‿─)', - '(;一_一)', "(ʘᗩʘ')", '(✿´‿`)', 'ლ(ಠ益ಠლ)', '~(˘▾˘~)', '(~˘▾˘)~', - '(。◕‿◕。)', '(っ˘ڡ˘ς)', 'ლ(´ڡ`ლ)', 'ƪ(˘⌣˘)ʃ', '(´・ω・`)', - '(ღ˘⌣˘ღ)', '(▰˘◡˘▰)', '〆(・∀・@)', '༼ʘ̚ل͜ʘ̚༽', 'ᕙ(⇀‸↼‶)ᕗ', - 'ᕦ(ò_óˇ)ᕤ', '(。◕‿‿◕。)', 'ヽ༼ຈل͜ຈ༽ノ', '(ง°ل͜°)ง', '╚(ಠ_ಠ)=┐', - '(´・ω・)っ由', 'Ƹ̵̡Ӝ̵̨̄Ʒ', '¯\\_(ツ)_/¯', '▄︻̷̿┻̿═━一', "(ง'̀-'́)ง", - '¯\\(°_o)/¯', '。゜(`Д´)゜。', '(づ。◕‿‿◕。)づ', '(;´༎ຶД༎ຶ`)', - '(ノಠ益ಠ)ノ彡┻━┻', 'ლ,ᔑ•ﺪ͟͠•ᔐ.ლ', '(ノ◕ヮ◕)ノ*:・゚✧', '┬┴┬┴┤(・_├┬┴┬┴', - '[̲̅$̲̅(̲̅5̲̅)̲̅$̲̅]']) + textfaces_space = {'⚆ _ ⚆', '˙ ͜ʟ˙', '◔ ⌣ ◔', '( ゚ヮ゚)', '(• ε •)', + '(づ ̄ ³ ̄)づ', '♪~ ᕕ(ᐛ)ᕗ', '\\ (•◡•) /', '( ಠ ͜ʖರೃ)', + '( ⚆ _ ⚆ )', '(▀̿Ĺ̯▀̿ ̿)', '༼ つ ◕_◕ ༽つ', '༼ つ ಥ_ಥ ༽つ', + '( ͡° ͜ʖ ͡°)', '( ͡°╭͜ʖ╮͡° )', '(╯°□°)╯︵ ┻━┻', + '( ͡ᵔ ͜ʖ ͡ᵔ )', '┬──┬ ノ( ゜-゜ノ)', '┬─┬ノ( º _ ºノ)', + '(ง ͠° ͟ل͜ ͡°)ง', '(͡ ͡° ͜ つ ͡͡°)', "﴾͡๏̯͡๏﴿ O'RLY?", + '(╯°□°)╯︵( .o.)', '(° ͡ ͜ ͡ʖ ͡ °)', '┬─┬ ︵ /(.□. )', + '(/) (°,,°) (/)', '| (• ◡•)| (❍ᴥ❍ʋ)', + '༼ つ ͡° ͜ʖ ͡° ༽つ', '(╯°□°)╯︵ ʞooqǝɔɐɟ', '┻━┻ ︵ヽ(`Д´)ノ︵ ┻━┻', + '┬┴┬┴┤ ͜ʖ ͡°) ├┬┴┬┴', '(ó ì_í)=óò=(ì_í ò)', + '(•_•) ( •_•)>⌐■-■ (⌐■_■)', '(ノ◕ヮ◕)ノ*:・゚✧ ✧゚・: *ヽ(◕ヮ◕ヽ)', + '[̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]', '/╲/\\╭( ͡° ͡° ͜ʖ ͡° ͡°)╮/\\╱\\', + '( ͡°( ͡° ͜ʖ( ͡° ͜ʖ ͡°)ʖ ͡°) ͡°)', '(._.) ( l: ) ( .-. ) ( :l ) (._.)', + "̿ ̿ ̿'̿'\\̵͇̿̿\\з=(•_•)=ε/̵͇̿̿/'̿'̿ ̿", '༼ ºل͟º ༼ ºل͟º ༼ ºل͟º ༽ ºل͟º ༽ ºل͟º ༽', + "̿'̿'\\̵͇̿̿\\з=( ͠° ͟ʖ ͡°)=ε/̵͇̿̿/'̿̿ ̿ ̿ ̿ ̿ ̿", + "̿̿ ̿̿ ̿̿ ̿'̿'\\̵͇̿̿\\з= ( ▀ ͜͞ʖ▀) =ε/̵͇̿̿/’̿’̿ ̿ ̿̿ ̿̿ ̿̿", + # From Signal: + "ヽ(°◇° )ノ", "■-■¬ <(•_•)"} + textfaces_emoji = {'♥‿♥', '☼.☼', '≧☉_☉≦', '(°ロ°)☝', '(☞゚∀゚)☞', '☜(˚▽˚)☞', '☜(⌒▽⌒)☞', + '(☞ຈل͜ຈ)☞', 'ヾ(⌐■_■)ノ♪', '(☞゚ヮ゚)☞', '☜(゚ヮ゚☜)'} + textfaces_wo_emoji = {'=U', 'ಠ_ಠ', '◉_◉', 'ಥ_ಥ', ":')", 'ಠ⌣ಠ', 'ಠ~ಠ', 'ಠ_ಥ', 'ಠ‿↼', + 'ʘ‿ʘ', 'ಠoಠ', 'ರ_ರ', '◔̯◔', '¬_¬', 'ب_ب', '°Д°', '^̮^', '^̮^', '^̮^', + '>_>', '^̮^', '^̮^', 'ಠ╭╮ಠ', '(>ლ)', 'ʕ•ᴥ•ʔ', '(ಥ﹏ಥ)', '(ᵔᴥᵔ)', + '(¬‿¬)', '⌐╦╦═─', '(•ω•)', '(¬_¬)', '。◕‿◕。', '(ʘ‿ʘ)', '٩◔̯◔۶', + '(>人<)', '(~_^)', '(^̮^)', '(・.◤)', '(◕‿◕✿)', '。◕‿‿◕。', '(─‿‿─)', + '(;一_一)', "(ʘᗩʘ')", '(✿´‿`)', 'ლ(ಠ益ಠლ)', '~(˘▾˘~)', '(~˘▾˘)~', + '(。◕‿◕。)', '(っ˘ڡ˘ς)', 'ლ(´ڡ`ლ)', 'ƪ(˘⌣˘)ʃ', '(´・ω・`)', + '(ღ˘⌣˘ღ)', '(▰˘◡˘▰)', '〆(・∀・@)', '༼ʘ̚ل͜ʘ̚༽', 'ᕙ(⇀‸↼‶)ᕗ', + 'ᕦ(ò_óˇ)ᕤ', '(。◕‿‿◕。)', 'ヽ༼ຈل͜ຈ༽ノ', '(ง°ل͜°)ง', '╚(ಠ_ಠ)=┐', + '(´・ω・)っ由', 'Ƹ̵̡Ӝ̵̨̄Ʒ', '¯\\_(ツ)_/¯', '▄︻̷̿┻̿═━一', "(ง'̀-'́)ง", + '¯\\(°_o)/¯', '。゜(`Д´)゜。', '(づ。◕‿‿◕。)づ', '(;´༎ຶД༎ຶ`)', + '(ノಠ益ಠ)ノ彡┻━┻', 'ლ,ᔑ•ﺪ͟͠•ᔐ.ლ', '(ノ◕ヮ◕)ノ*:・゚✧', '┬┴┬┴┤(・_├┬┴┬┴', + '[̲̅$̲̅(̲̅5̲̅)̲̅$̲̅]'} self.textfaces_space = re.compile(r"|".join([re.escape(_) for _ in sorted(textfaces_space, key=len, reverse=True)])) self.textfaces_emoji = re.compile(r"|".join([re.escape(_) for _ in sorted(textfaces_emoji, key=len, reverse=True)])) - textfaces_signal = set(["\\(ˆ˚ˆ)/", "(╥﹏╥)", "(╯°□°)╯︵", "┻━┻", "┬─┬", "ノ(°–°ノ)", "(^._.^)ノ", "ฅ^•ﻌ•^ฅ", "(•_•)", "(■_■¬)", "ƪ(ړײ)ƪ"]) + textfaces_signal = {"\\(ˆ˚ˆ)/", "(╥﹏╥)", "(╯°□°)╯︵", "┻━┻", "┬─┬", "ノ(°–°ノ)", "(^._.^)ノ", "ฅ^•ﻌ•^ฅ", "(•_•)", "(■_■¬)", "ƪ(ړײ)ƪ"} emoticon_list = sorted(emoticon_set | textfaces_wo_emoji | textfaces_signal, key=len, reverse=True) self.emoticon = re.compile(r"""(?:(?:[:;]|(?