diff --git a/README.md b/README.md index e94344e..41f7821 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,6 @@ pip install laonlp[extra1,extra2,...] List of possible extras - `full` (install everything) -- `anyascii` (for support of the `anyascii` engine of Lao transliteration functionalities) - `word_vector` (for support of word vector functionalities) diff --git a/laonlp/transliterate/__init__.py b/laonlp/transliterate/__init__.py index a12d4d6..42dc075 100644 --- a/laonlp/transliterate/__init__.py +++ b/laonlp/transliterate/__init__.py @@ -19,88 +19,172 @@ "thai2lao_script", "lao2thai_transliteration", "thai2lao_transliteration", - "transliterate" + "transliterate" ] -try: - from anyascii import anyascii -except ModuleNotFoundError: - raise ModuleNotFoundError('The anyascii engine of Lao transliteration functionalities require AnyAscii which is not currently installed. Please try installing the package via "pip install anyascii".') # Naive Lao script to Thai script transliteration. # Data from https://github.com/google/language-resources/blob/master/lo/Laoo-Thai.txt lao2thai_transliteration = { - "\u0e81":"\u0e01", # LAO LETTER KO KAY → THAI CHARACTER KO KAI - "\u0e82":"\u0e02", # LAO LETTER KHO KHAY → THAI CHARACTER KHO KHAI - "\u0e84":"\u0e04", # LAO LETTER KHO KHUAY → THAI CHARACTER KHO KHWAI - "\u0e87":"\u0e07", # LAO LETTER NGO NGU → THAI CHARACTER NGO NGU - "\u0e88":"\u0e08", # LAO LETTER CO COK → THAI CHARACTER CHO CHAN - "\u0e8a":"\u0e0a", # LAO LETTER SO SANG → THAI CHARACTER CHO CHANG - "\u0e8d":"\u0e0d", # LAO LETTER NYO NYUNG → THAI CHARACTER YO YING - "\u0e94":"\u0e14", # LAO LETTER DO DEK → THAI CHARACTER DO DEK - "\u0e95":"\u0e15", # LAO LETTER TO TA → THAI CHARACTER TO TAO - "\u0e96":"\u0e16", # LAO LETTER THO THONG → THAI CHARACTER THO THUNG - "\u0e97":"\u0e17", # LAO LETTER THO THUNG → THAI CHARACTER THO THAHAN - "\u0e99":"\u0e19", # LAO LETTER NO NOK → THAI CHARACTER NO NU - "\u0e9a":"\u0e1a", # LAO LETTER BO BE → THAI CHARACTER BO BAIMAI - "\u0e9b":"\u0e1b", # LAO LETTER PO PA → THAI CHARACTER PO PLA - "\u0e9c":"\u0e1c", # LAO LETTER PHO PHENG → THAI CHARACTER PHO PHUNG - "\u0e9d":"\u0e1d", # LAO LETTER FO FA → THAI CHARACTER FO FA - "\u0e9e":"\u0e1e", # LAO LETTER PHO PHU → THAI CHARACTER PHO PHAN - "\u0e9f":"\u0e1f", # LAO LETTER FO FAY → THAI CHARACTER FO FAN - "\u0ea1":"\u0e21", # LAO LETTER MO MA → THAI CHARACTER MO MA - "\u0ea2":"\u0e22", # LAO LETTER YO YA → THAI CHARACTER YO YAK - "\u0ea3":"\u0e23", # LAO LETTER RO ROT → THAI CHARACTER RO RUA - "\u0ea5":"\u0e25", # LAO LETTER LO LING → THAI CHARACTER LO LING - "\u0ea7":"\u0e27", # LAO LETTER WO WI → THAI CHARACTER WO WAEN - "\u0eaa":"\u0e2a", # LAO LETTER SO SYA → THAI CHARACTER SO SUA - "\u0eab":"\u0e2b", # LAO LETTER HO HAY → THAI CHARACTER HO HIP - "\u0ead":"\u0e2d", # LAO LETTER O O → THAI CHARACTER O ANG - "\u0eae":"\u0e2e", # LAO LETTER HO HYA → THAI CHARACTER HO NOKHUK - "\u0eaf":"\u0e2f", # LAO ELLIPSIS → THAI CHARACTER PAIYAN NOI - "\u0eb0":"\u0e30", # LAO VOWEL SIGN A → THAI CHARACTER SARA A - "\u0eb1":"\u0e31", # LAO VOWEL SIGN MAI KAN → THAI CHARACTER MAI HAN-AKAT - "\u0eb2":"\u0e32", # LAO VOWEL SIGN AA → THAI CHARACTER SARA AA - "\u0eb3":"\u0e33", # LAO VOWEL SIGN AM → THAI CHARACTER SARA AM - "\u0eb4":"\u0e34", # LAO VOWEL SIGN I → THAI CHARACTER SARA I - "\u0eb5":"\u0e35", # LAO VOWEL SIGN II → THAI CHARACTER SARA II - "\u0eb6":"\u0e36", # LAO VOWEL SIGN Y → THAI CHARACTER SARA UE - "\u0eb7":"\u0e37", # LAO VOWEL SIGN YY → THAI CHARACTER SARA UEE - "\u0eb8":"\u0e38", # LAO VOWEL SIGN U → THAI CHARACTER SARA U - "\u0eb9":"\u0e39", # LAO VOWEL SIGN UU → THAI CHARACTER SARA UU - "\u0ebb":"", # LAO VOWEL SIGN MAI KONG", cf. Lao ເຈົ້າ vs. Thai เจ้า - "\u0ebc":"\u0e25", # LAO SEMIVOWEL SIGN LO → THAI CHARACTER LO LING - "\u0ebd":"\u0e0d", # LAO SEMIVOWEL SIGN NYO FYANG → THAI CHARACTER YO YING - "\u0ec0":"\u0e40", # LAO VOWEL SIGN E → THAI CHARACTER SARA E - "\u0ec1":"\u0e41", # LAO VOWEL SIGN EI → THAI CHARACTER SARA AE - "\u0ec2":"\u0e42", # LAO VOWEL SIGN O → THAI CHARACTER SARA O - "\u0ec3":"\u0e43", # LAO VOWEL SIGN AY MAI MUAN → THAI CHARACTER SARA AI MAI MUAN - "\u0ec4":"\u0e44", # LAO VOWEL SIGN AI MAI MAY → THAI CHARACTER SARA AI MAI MALAI - "\u0ec6":"\u0e46", # LAO KO LA → THAI CHARACTER MAI YAMOK - "\u0ec8":"\u0e48", # LAO TONE MAI EK → THAI CHARACTER MAI EK - "\u0ec9":"\u0e49", # LAO TONE MAI THO → THAI CHARACTER MAI THO - "\u0eca":"\u0e4a", # LAO TONE MAI TI → THAI CHARACTER MAI TRI - "\u0ecb":"\u0e4b", # LAO TONE MAI CATAWA → THAI CHARACTER MAI CHATTAWA - "\u0ecc":"\u0e4c", # LAO CANCELLATION MARK → THAI CHARACTER THANTHAKHAT - "\u0ecd":"\u0e4d", # LAO NIGGAHITA → THAI CHARACTER NIKHAHIT - "\u0ed0":"\u0e50", # LAO DIGIT ZERO → THAI DIGIT ZERO - "\u0ed1":"\u0e51", # LAO DIGIT ONE → THAI DIGIT ONE - "\u0ed2":"\u0e52", # LAO DIGIT TWO → THAI DIGIT TWO - "\u0ed3":"\u0e53", # LAO DIGIT THREE → THAI DIGIT THREE - "\u0ed4":"\u0e54", # LAO DIGIT FOUR → THAI DIGIT FOUR - "\u0ed5":"\u0e55", # LAO DIGIT FIVE → THAI DIGIT FIVE - "\u0ed6":"\u0e56", # LAO DIGIT SIX → THAI DIGIT SIX - "\u0ed7":"\u0e57", # LAO DIGIT SEVEN → THAI DIGIT SEVEN - "\u0ed8":"\u0e58", # LAO DIGIT EIGHT → THAI DIGIT EIGHT - "\u0ed9":"\u0e59", # LAO DIGIT NINE → THAI DIGIT NINE - "\u0edc":"\u0e2b\u0e19", # LAO HO NO → HO HIP + NO NU - "\u0edd":"\u0e2b\u0e21", # LAO HO MO → HO HIP + MO MA + "\u0e81": "\u0e01", # LAO LETTER KO KAY → THAI CHARACTER KO KAI + "\u0e82": "\u0e02", # LAO LETTER KHO KHAY → THAI CHARACTER KHO KHAI + "\u0e84": "\u0e04", # LAO LETTER KHO KHUAY → THAI CHARACTER KHO KHWAI + "\u0e87": "\u0e07", # LAO LETTER NGO NGU → THAI CHARACTER NGO NGU + "\u0e88": "\u0e08", # LAO LETTER CO COK → THAI CHARACTER CHO CHAN + "\u0e8a": "\u0e0a", # LAO LETTER SO SANG → THAI CHARACTER CHO CHANG + "\u0e8d": "\u0e0d", # LAO LETTER NYO NYUNG → THAI CHARACTER YO YING + "\u0e94": "\u0e14", # LAO LETTER DO DEK → THAI CHARACTER DO DEK + "\u0e95": "\u0e15", # LAO LETTER TO TA → THAI CHARACTER TO TAO + "\u0e96": "\u0e16", # LAO LETTER THO THONG → THAI CHARACTER THO THUNG + "\u0e97": "\u0e17", # LAO LETTER THO THUNG → THAI CHARACTER THO THAHAN + "\u0e99": "\u0e19", # LAO LETTER NO NOK → THAI CHARACTER NO NU + "\u0e9a": "\u0e1a", # LAO LETTER BO BE → THAI CHARACTER BO BAIMAI + "\u0e9b": "\u0e1b", # LAO LETTER PO PA → THAI CHARACTER PO PLA + "\u0e9c": "\u0e1c", # LAO LETTER PHO PHENG → THAI CHARACTER PHO PHUNG + "\u0e9d": "\u0e1d", # LAO LETTER FO FA → THAI CHARACTER FO FA + "\u0e9e": "\u0e1e", # LAO LETTER PHO PHU → THAI CHARACTER PHO PHAN + "\u0e9f": "\u0e1f", # LAO LETTER FO FAY → THAI CHARACTER FO FAN + "\u0ea1": "\u0e21", # LAO LETTER MO MA → THAI CHARACTER MO MA + "\u0ea2": "\u0e22", # LAO LETTER YO YA → THAI CHARACTER YO YAK + "\u0ea3": "\u0e23", # LAO LETTER RO ROT → THAI CHARACTER RO RUA + "\u0ea5": "\u0e25", # LAO LETTER LO LING → THAI CHARACTER LO LING + "\u0ea7": "\u0e27", # LAO LETTER WO WI → THAI CHARACTER WO WAEN + "\u0eaa": "\u0e2a", # LAO LETTER SO SYA → THAI CHARACTER SO SUA + "\u0eab": "\u0e2b", # LAO LETTER HO HAY → THAI CHARACTER HO HIP + "\u0ead": "\u0e2d", # LAO LETTER O O → THAI CHARACTER O ANG + "\u0eae": "\u0e2e", # LAO LETTER HO HYA → THAI CHARACTER HO NOKHUK + "\u0eaf": "\u0e2f", # LAO ELLIPSIS → THAI CHARACTER PAIYAN NOI + "\u0eb0": "\u0e30", # LAO VOWEL SIGN A → THAI CHARACTER SARA A + "\u0eb1": "\u0e31", # LAO VOWEL SIGN MAI KAN → THAI CHARACTER MAI HAN-AKAT + "\u0eb2": "\u0e32", # LAO VOWEL SIGN AA → THAI CHARACTER SARA AA + "\u0eb3": "\u0e33", # LAO VOWEL SIGN AM → THAI CHARACTER SARA AM + "\u0eb4": "\u0e34", # LAO VOWEL SIGN I → THAI CHARACTER SARA I + "\u0eb5": "\u0e35", # LAO VOWEL SIGN II → THAI CHARACTER SARA II + "\u0eb6": "\u0e36", # LAO VOWEL SIGN Y → THAI CHARACTER SARA UE + "\u0eb7": "\u0e37", # LAO VOWEL SIGN YY → THAI CHARACTER SARA UEE + "\u0eb8": "\u0e38", # LAO VOWEL SIGN U → THAI CHARACTER SARA U + "\u0eb9": "\u0e39", # LAO VOWEL SIGN UU → THAI CHARACTER SARA UU + "\u0ebb": "", # LAO VOWEL SIGN MAI KONG", cf. Lao ເຈົ້າ vs. Thai เจ้า + "\u0ebc": "\u0e25", # LAO SEMIVOWEL SIGN LO → THAI CHARACTER LO LING + "\u0ebd": "\u0e0d", # LAO SEMIVOWEL SIGN NYO FYANG → THAI CHARACTER YO YING + "\u0ec0": "\u0e40", # LAO VOWEL SIGN E → THAI CHARACTER SARA E + "\u0ec1": "\u0e41", # LAO VOWEL SIGN EI → THAI CHARACTER SARA AE + "\u0ec2": "\u0e42", # LAO VOWEL SIGN O → THAI CHARACTER SARA O + "\u0ec3": "\u0e43", # LAO VOWEL SIGN AY MAI MUAN → THAI CHARACTER SARA AI MAI MUAN + "\u0ec4": "\u0e44", # LAO VOWEL SIGN AI MAI MAY → THAI CHARACTER SARA AI MAI MALAI + "\u0ec6": "\u0e46", # LAO KO LA → THAI CHARACTER MAI YAMOK + "\u0ec8": "\u0e48", # LAO TONE MAI EK → THAI CHARACTER MAI EK + "\u0ec9": "\u0e49", # LAO TONE MAI THO → THAI CHARACTER MAI THO + "\u0eca": "\u0e4a", # LAO TONE MAI TI → THAI CHARACTER MAI TRI + "\u0ecb": "\u0e4b", # LAO TONE MAI CATAWA → THAI CHARACTER MAI CHATTAWA + "\u0ecc": "\u0e4c", # LAO CANCELLATION MARK → THAI CHARACTER THANTHAKHAT + "\u0ecd": "\u0e4d", # LAO NIGGAHITA → THAI CHARACTER NIKHAHIT + "\u0ed0": "\u0e50", # LAO DIGIT ZERO → THAI DIGIT ZERO + "\u0ed1": "\u0e51", # LAO DIGIT ONE → THAI DIGIT ONE + "\u0ed2": "\u0e52", # LAO DIGIT TWO → THAI DIGIT TWO + "\u0ed3": "\u0e53", # LAO DIGIT THREE → THAI DIGIT THREE + "\u0ed4": "\u0e54", # LAO DIGIT FOUR → THAI DIGIT FOUR + "\u0ed5": "\u0e55", # LAO DIGIT FIVE → THAI DIGIT FIVE + "\u0ed6": "\u0e56", # LAO DIGIT SIX → THAI DIGIT SIX + "\u0ed7": "\u0e57", # LAO DIGIT SEVEN → THAI DIGIT SEVEN + "\u0ed8": "\u0e58", # LAO DIGIT EIGHT → THAI DIGIT EIGHT + "\u0ed9": "\u0e59", # LAO DIGIT NINE → THAI DIGIT NINE + "\u0edc": "\u0e2b\u0e19", # LAO HO NO → HO HIP + NO NU + "\u0edd": "\u0e2b\u0e21", # LAO HO MO → HO HIP + MO MA } lao_char = list(lao2thai_transliteration.keys()) thai2lao_transliteration = dict((v,k) for k,v in lao2thai_transliteration.items()) thai_char = list(thai2lao_transliteration.keys()) +# Lao transliteration to ASCII from AnyAscii (Line 3147-3230): +# https://github.com/anyascii/anyascii/blob/master/table.tsv +lao2ascii = str.maketrans({ + "ກ": "k", + "ຂ": "kh", + "ຄ": "kh", + "ຆ": "gh", + "ງ": "ng", + "ຈ": "ch", + "ຉ": "ch", + "ຊ": "x", + "ຌ": "jh", + "ຍ": "gn", + "ຎ": "n", + "ຏ": "t", + "ຐ": "th", + "ຑ": "d", + "ຒ": "dh", + "ຓ": "n", + "ດ": "d", + "ຕ": "t", + "ຖ": "th", + "ທ": "th", + "ຘ": "dh", + "ນ": "n", + "ບ": "b", + "ປ": "p", + "ຜ": "ph", + "ຝ": "f", + "ພ": "ph", + "ຟ": "f", + "ຠ": "bh", + "ມ": "m", + "ຢ": "y", + "ຣ": "r", + "ລ": "l", + "ວ": "v", + "ຨ": "s", + "ຩ": "s", + "ສ": "s", + "ຫ": "h", + "ຬ": "l", + "ອ": "ອ", + "ຮ": "h", + "ຯ": "...", + "ະ": "a", + "ັ": "a", + "າ": "a", + "ຳ": "am", + "ິ": "i", + "ີ": "i", + "ຶ": "u", + "ື": "u", + "ຸ": "ou", + "ູ": "ou", + "຺": "຺", + "ົ": "o", + "ຼ": "l", + "ຽ": "y", + "ເ": "e", + "ແ": "e", + "ໂ": "o", + "ໃ": "ai", + "ໄ": "ai", + "ໆ": "-", + "່": "່", + "້": "້", + "໊": "໊", + "໋": "໋", + "໌": "໌", + "ໍ": "o", + "໎": "໎", + "໐": "0", + "໑": "1", + "໒": "2", + "໓": "3", + "໔": "4", + "໕": "5", + "໖": "6", + "໗": "7", + "໘": "8", + "໙": "9", + "ໜ": "n", + "ໝ": "m", + "ໞ": "g", + "ໟ": "gn", + "ༀ": "Om", +}) -def lao2thai_script(text: str)->str: +def lao2thai_script(text: str) -> str: """ Lao to Thai script @@ -133,8 +217,9 @@ def thai2lao_script(text: str) -> str: new_text += c return new_text -def transliterate(lao_word: str, engine:str="anyascii")->str: - """ + +def transliterate(lao_word: str, engine: str = "anyascii") -> str: + """ Lao transliterate :param str sent: Lao text @@ -142,4 +227,4 @@ def transliterate(lao_word: str, engine:str="anyascii")->str: :return: returns a Lao transliteration. :rtype: str """ - return anyascii(lao_word) + return lao_word.translate(lao2ascii) diff --git a/setup.py b/setup.py index 5b4ed5b..474762f 100644 --- a/setup.py +++ b/setup.py @@ -16,17 +16,15 @@ """ from setuptools import find_packages, setup -with open("README.md","r",encoding="utf-8-sig") as f: +with open("README.md", "r", encoding="utf-8-sig") as f: readme = f.read() -with open("requirements.txt","r",encoding="utf-8-sig") as f: +with open("requirements.txt", "r", encoding="utf-8-sig") as f: requirements = [i.strip() for i in f.readlines()] extras = { - "anyascii": ["anyascii>=0.3.2"], "word_vector": ["gensim", "huggingface-hub"], "full": [ - "anyascii>=0.3.2", "gensim", "huggingface-hub" ]