Skip to content

Commit

Permalink
More straightforward definition of set literals
Browse files: browse the repository at this point in the history
  • Loading branch information
Thomas Proisl committed Jul 10, 2023
1 parent 2ead5c4 commit bc24de9
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 51 deletions.
4 changes: 2 additions & 2 deletions src/somajo/sentence_splitter.py
Expand Up @@ -21,11 +21,11 @@ def __init__(self, is_tuple=False, language="de_CMC"):
self.closing_punct = re.compile(r"^(?:['\"\p{Pf}\p{Pe}])$")
# International quotes: «» “” ‹› ‘’
# German quotes: »« „“ ›‹ ‚‘
self.problematic_quotes = set(['"'])
self.problematic_quotes = {'"'}
if language == "de" or language == "de_CMC":
# German opening quotes [»›] have category Pf
# German closing quotes [“‘«‹] have category Pi
self.problematic_quotes = set(['"', "»", "«", "›", "‹", "“", "‘"])
self.problematic_quotes = {'"', "»", "«", "›", "‹", "“", "‘"}
self.eos_abbreviations = utils.read_abbreviation_file("eos_abbreviations.txt")
# We match these via regular expressions because users could
# call the split or split_xml methods with pretokenized input
Expand Down
4 changes: 2 additions & 2 deletions src/somajo/somajo.py
Expand Up @@ -31,9 +31,9 @@ class SoMaJo:
"""

supported_languages = set(["de_CMC", "en_PTB"])
supported_languages = {"de_CMC", "en_PTB"}
_default_language = "de_CMC"
paragraph_separators = set(["empty_lines", "single_newlines"])
paragraph_separators = {"empty_lines", "single_newlines"}
_default_parsep = "empty_lines"

def __init__(self, language, *, split_camel_case=False, split_sentences=True, xml_sentences=None):
Expand Down
4 changes: 2 additions & 2 deletions src/somajo/token.py
Expand Up @@ -29,7 +29,7 @@ class Token:
"""

token_classes = set([
token_classes = {
"URL",
"XML_entity",
"XML_tag",
Expand All @@ -48,7 +48,7 @@ class Token:
"semester",
"symbol",
"time",
])
}

def __init__(self, text, *, markup=False, markup_class=None, markup_eos=None, locked=False, token_class=None, space_after=True, original_spelling=None, first_in_sentence=False, last_in_sentence=False):
self.text = text
Expand Down
89 changes: 44 additions & 45 deletions src/somajo/tokenizer.py
Expand Up @@ -14,7 +14,7 @@

class Tokenizer():

_supported_languages = set(["de", "de_CMC", "en", "en_PTB"])
_supported_languages = {"de", "de_CMC", "en", "en_PTB"}
_default_language = "de_CMC"

def __init__(self, split_camel_case=False, token_classes=False, extra_info=False, language="de_CMC"):
Expand Down Expand Up @@ -131,53 +131,52 @@ def __init__(self, split_camel_case=False, token_classes=False, extra_info=False
);""", re.VERBOSE | re.IGNORECASE)

# EMOTICONS
emoticon_set = set(["(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:",
"(-:", ")=", ")o:", ")x", ":'C", ":/",
":<", ":C", ":[", "=(", "=)", "=D", "=P",
">:", "\\:", "]:", "x(", "^^", "o.O",
"\\O/", "\\m/", ":;))", "_))", "*_*",
"._.", ">_<", "*<:-)", ":!:", ":;-))",
"x'D", ":^)", "(>_<)", ":->", "\\o/",
"B-)", ":-$", "O:-)", "=-O", ":O", ":-!",
":-x", ":-|", ":-\\", ":-[", ">:-(",
"^.^"])
emoticon_set = {"(-.-)", "(T_T)", "(♥_♥)", ")':", ")-:",
"(-:", ")=", ")o:", ")x", ":'C", ":/", ":<",
":C", ":[", "=(", "=)", "=D", "=P", ">:",
"\\:", "]:", "x(", "^^", "o.O", "\\O/",
"\\m/", ":;))", "_))", "*_*", "._.", ">_<",
"*<:-)", ":!:", ":;-))", "x'D", ":^)",
"(>_<)", ":->", "\\o/", "B-)", ":-$", "O:-)",
"=-O", ":O", ":-!", ":-x", ":-|", ":-\\",
":-[", ">:-(", "^.^"}
# From https://textfac.es/
textfaces_space = set(['⚆ _ ⚆', '˙ ͜ʟ˙', '◔ ⌣ ◔', '( ゚ヮ゚)', '(• ε •)',
'(づ ̄ ³ ̄)づ', '♪~ ᕕ(ᐛ)ᕗ', '\\ (•◡•) /', '( ಠ ͜ʖರೃ)',
'( ⚆ _ ⚆ )', '(▀̿Ĺ̯▀̿ ̿)', '༼ つ ◕_◕ ༽つ', '༼ つ ಥ_ಥ ༽つ',
'( ͡° ͜ʖ ͡°)', '( ͡°╭͜ʖ╮͡° )', '(╯°□°)╯︵ ┻━┻',
'( ͡ᵔ ͜ʖ ͡ᵔ )', '┬──┬ ノ( ゜-゜ノ)', '┬─┬ノ( º _ ºノ)',
'(ง ͠° ͟ل͜ ͡°)ง', '(͡ ͡° ͜ つ ͡͡°)', "﴾͡๏̯͡๏﴿ O'RLY?",
'(╯°□°)╯︵( .o.)', '(° ͡ ͜ ͡ʖ ͡ °)', '┬─┬ ︵ /(.□. )',
'(/) (°,,°) (/)', '| (• ◡•)| (❍ᴥ❍ʋ)',
'༼ つ ͡° ͜ʖ ͡° ༽つ', '(╯°□°)╯︵ ʞooqǝɔɐɟ', '┻━┻ ︵ヽ(`Д´)ノ︵ ┻━┻',
'┬┴┬┴┤ ͜ʖ ͡°) ├┬┴┬┴', '(ó ì_í)=óò=(ì_í ò)',
'(•_•) ( •_•)>⌐■-■ (⌐■_■)', '(ノ◕ヮ◕)ノ*:・゚✧ ✧゚・: *ヽ(◕ヮ◕ヽ)',
'[̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]', '/╲/\\╭( ͡° ͡° ͜ʖ ͡° ͡°)╮/\\\\',
'( ͡°( ͡° ͜ʖ( ͡° ͜ʖ ͡°)ʖ ͡°) ͡°)', '(._.) ( l: ) ( .-. ) ( :l ) (._.)',
"̿ ̿ ̿'̿'\\̵͇̿̿\\з=(•_•)=ε/̵͇̿̿/'̿'̿ ̿", '༼ ºل͟º ༼ ºل͟º ༼ ºل͟º ༽ ºل͟º ༽ ºل͟º ༽',
"̿'̿'\\̵͇̿̿\\з=( ͠° ͟ʖ ͡°)=ε/̵͇̿̿/'̿̿ ̿ ̿ ̿ ̿ ̿",
"̿̿ ̿̿ ̿̿ ̿'̿'\\̵͇̿̿\\з= ( ▀ ͜͞ʖ▀) =ε/̵͇̿̿/’̿’̿ ̿ ̿̿ ̿̿ ̿̿",
# From Signal:
"ヽ(°◇° )ノ", "■-■¬ <(•_•)"])
textfaces_emoji = set(['♥‿♥', '☼.☼', '≧☉_☉≦', '(°ロ°)☝', '(☞゚∀゚)☞', '☜(˚▽˚)☞', '☜(⌒▽⌒)☞',
'(☞ຈل͜ຈ)☞', 'ヾ(⌐■_■)ノ♪', '(☞゚ヮ゚)☞', '☜(゚ヮ゚☜)'])
textfaces_wo_emoji = set(['=U', 'ಠ_ಠ', '◉_◉', 'ಥ_ಥ', ":')", 'ಠ⌣ಠ', 'ಠ~ಠ', 'ಠ_ಥ', 'ಠ‿↼',
'ʘ‿ʘ', 'ಠoಠ', 'ರ_ರ', '◔̯◔', '¬_¬', 'ب_ب', '°Д°', '^̮^', '^̮^', '^̮^',
'>_>', '^̮^', '^̮^', 'ಠ╭╮ಠ', '(>ლ)', 'ʕ•ᴥ•ʔ', '(ಥ﹏ಥ)', '(ᵔᴥᵔ)',
'(¬‿¬)', '⌐╦╦═─', '(•ω•)', '(¬_¬)', '。◕‿◕。', '(ʘ‿ʘ)', '٩◔̯◔۶',
'(>人<)', '(~_^)', '(^̮^)', '(・.◤)', '(◕‿◕✿)', '。◕‿‿◕。', '(─‿‿─)',
'(;一_一)', "(ʘᗩʘ')", '(✿´‿`)', 'ლ(ಠ益ಠლ)', '~(˘▾˘~)', '(~˘▾˘)~',
'(。◕‿◕。)', '(っ˘ڡ˘ς)', 'ლ(´ڡ`ლ)', 'ƪ(˘⌣˘)ʃ', '(´・ω・`)',
'(ღ˘⌣˘ღ)', '(▰˘◡˘▰)', '〆(・∀・@)', '༼ʘ̚ل͜ʘ̚༽', 'ᕙ(⇀‸↼‶)ᕗ',
'ᕦ(ò_óˇ)ᕤ', '(。◕‿‿◕。)', 'ヽ༼ຈل͜ຈ༽ノ', '(ง°ل͜°)ง', '╚(ಠ_ಠ)=┐',
'(´・ω・)っ由', 'Ƹ̵̡Ӝ̵̨̄Ʒ', '¯\\_(ツ)_/¯', '▄︻̷̿┻̿═━一', "(ง'̀-'́)ง",
'¯\\(°_o)/¯', '。゜(`Д´)゜。', '(づ。◕‿‿◕。)づ', '(;´༎ຶД༎ຶ`)',
'(ノಠ益ಠ)ノ彡┻━┻', 'ლ,ᔑ•ﺪ͟͠•ᔐ.ლ', '(ノ◕ヮ◕)ノ*:・゚✧', '┬┴┬┴┤(・_├┬┴┬┴',
'[̲̅$̲̅(̲̅5̲̅)̲̅$̲̅]'])
textfaces_space = {'⚆ _ ⚆', '˙ ͜ʟ˙', '◔ ⌣ ◔', '( ゚ヮ゚)', '(• ε •)',
'(づ ̄ ³ ̄)づ', '♪~ ᕕ(ᐛ)ᕗ', '\\ (•◡•) /', '( ಠ ͜ʖರೃ)',
'( ⚆ _ ⚆ )', '(▀̿Ĺ̯▀̿ ̿)', '༼ つ ◕_◕ ༽つ', '༼ つ ಥ_ಥ ༽つ',
'( ͡° ͜ʖ ͡°)', '( ͡°╭͜ʖ╮͡° )', '(╯°□°)╯︵ ┻━┻',
'( ͡ᵔ ͜ʖ ͡ᵔ )', '┬──┬ ノ( ゜-゜ノ)', '┬─┬ノ( º _ ºノ)',
'(ง ͠° ͟ل͜ ͡°)ง', '(͡ ͡° ͜ つ ͡͡°)', "﴾͡๏̯͡๏﴿ O'RLY?",
'(╯°□°)╯︵( .o.)', '(° ͡ ͜ ͡ʖ ͡ °)', '┬─┬ ︵ /(.□. )',
'(/) (°,,°) (/)', '| (• ◡•)| (❍ᴥ❍ʋ)',
'༼ つ ͡° ͜ʖ ͡° ༽つ', '(╯°□°)╯︵ ʞooqǝɔɐɟ', '┻━┻ ︵ヽ(`Д´)ノ︵ ┻━┻',
'┬┴┬┴┤ ͜ʖ ͡°) ├┬┴┬┴', '(ó ì_í)=óò=(ì_í ò)',
'(•_•) ( •_•)>⌐■-■ (⌐■_■)', '(ノ◕ヮ◕)ノ*:・゚✧ ✧゚・: *ヽ(◕ヮ◕ヽ)',
'[̲̅$̲̅(̲̅ ͡° ͜ʖ ͡°̲̅)̲̅$̲̅]', '/╲/\\╭( ͡° ͡° ͜ʖ ͡° ͡°)╮/\\\\',
'( ͡°( ͡° ͜ʖ( ͡° ͜ʖ ͡°)ʖ ͡°) ͡°)', '(._.) ( l: ) ( .-. ) ( :l ) (._.)',
"̿ ̿ ̿'̿'\\̵͇̿̿\\з=(•_•)=ε/̵͇̿̿/'̿'̿ ̿", '༼ ºل͟º ༼ ºل͟º ༼ ºل͟º ༽ ºل͟º ༽ ºل͟º ༽',
"̿'̿'\\̵͇̿̿\\з=( ͠° ͟ʖ ͡°)=ε/̵͇̿̿/'̿̿ ̿ ̿ ̿ ̿ ̿",
"̿̿ ̿̿ ̿̿ ̿'̿'\\̵͇̿̿\\з= ( ▀ ͜͞ʖ▀) =ε/̵͇̿̿/’̿’̿ ̿ ̿̿ ̿̿ ̿̿",
# From Signal:
"ヽ(°◇° )ノ", "■-■¬ <(•_•)"}
textfaces_emoji = {'♥‿♥', '☼.☼', '≧☉_☉≦', '(°ロ°)☝', '(☞゚∀゚)☞', '☜(˚▽˚)☞', '☜(⌒▽⌒)☞',
'(☞ຈل͜ຈ)☞', 'ヾ(⌐■_■)ノ♪', '(☞゚ヮ゚)☞', '☜(゚ヮ゚☜)'}
textfaces_wo_emoji = {'=U', 'ಠ_ಠ', '◉_◉', 'ಥ_ಥ', ":')", 'ಠ⌣ಠ', 'ಠ~ಠ', 'ಠ_ಥ', 'ಠ‿↼',
'ʘ‿ʘ', 'ಠoಠ', 'ರ_ರ', '◔̯◔', '¬_¬', 'ب_ب', '°Д°', '^̮^', '^̮^', '^̮^',
'>_>', '^̮^', '^̮^', 'ಠ╭╮ಠ', '(>ლ)', 'ʕ•ᴥ•ʔ', '(ಥ﹏ಥ)', '(ᵔᴥᵔ)',
'(¬‿¬)', '⌐╦╦═─', '(•ω•)', '(¬_¬)', '。◕‿◕。', '(ʘ‿ʘ)', '٩◔̯◔۶',
'(>人<)', '(~_^)', '(^̮^)', '(・.◤)', '(◕‿◕✿)', '。◕‿‿◕。', '(─‿‿─)',
'(;一_一)', "(ʘᗩʘ')", '(✿´‿`)', 'ლ(ಠ益ಠლ)', '~(˘▾˘~)', '(~˘▾˘)~',
'(。◕‿◕。)', '(っ˘ڡ˘ς)', 'ლ(´ڡ`ლ)', 'ƪ(˘⌣˘)ʃ', '(´・ω・`)',
'(ღ˘⌣˘ღ)', '(▰˘◡˘▰)', '〆(・∀・@)', '༼ʘ̚ل͜ʘ̚༽', 'ᕙ(⇀‸↼‶)ᕗ',
'ᕦ(ò_óˇ)ᕤ', '(。◕‿‿◕。)', 'ヽ༼ຈل͜ຈ༽ノ', '(ง°ل͜°)ง', '╚(ಠ_ಠ)=┐',
'(´・ω・)っ由', 'Ƹ̵̡Ӝ̵̨̄Ʒ', '¯\\_(ツ)_/¯', '▄︻̷̿┻̿═━一', "(ง'̀-'́)ง",
'¯\\(°_o)/¯', '。゜(`Д´)゜。', '(づ。◕‿‿◕。)づ', '(;´༎ຶД༎ຶ`)',
'(ノಠ益ಠ)ノ彡┻━┻', 'ლ,ᔑ•ﺪ͟͠•ᔐ.ლ', '(ノ◕ヮ◕)ノ*:・゚✧', '┬┴┬┴┤(・_├┬┴┬┴',
'[̲̅$̲̅(̲̅5̲̅)̲̅$̲̅]'}
self.textfaces_space = re.compile(r"|".join([re.escape(_) for _ in sorted(textfaces_space, key=len, reverse=True)]))
self.textfaces_emoji = re.compile(r"|".join([re.escape(_) for _ in sorted(textfaces_emoji, key=len, reverse=True)]))
textfaces_signal = set(["\\(ˆ˚ˆ)/", "(╥﹏╥)", "(╯°□°)╯︵", "┻━┻", "┬─┬", "ノ(°–°ノ)", "(^._.^)ノ", "ฅ^•ﻌ•^ฅ", "(•_•)", "(■_■¬)", "ƪ(ړײ)ƪ"])
textfaces_signal = {"\\(ˆ˚ˆ)/", "(╥﹏╥)", "(╯°□°)╯︵", "┻━┻", "┬─┬", "ノ(°–°ノ)", "(^._.^)ノ", "ฅ^•ﻌ•^ฅ", "(•_•)", "(■_■¬)", "ƪ(ړײ)ƪ"}
emoticon_list = sorted(emoticon_set | textfaces_wo_emoji | textfaces_signal, key=len, reverse=True)
self.emoticon = re.compile(r"""(?:(?:[:;]|(?<!\d)8) # a variety of eyes, alt.: [:;8]
[-'oO]? # optional nose or tear
Expand Down

0 comments on commit bc24de9

Please sign in to comment.