Merge pull request #4 from vTuanpham/feat/Providers
Feat/providers
vTuanpham committed Jan 5, 2024
2 parents 8f21cac + 8e712bf commit f63503a
Showing 9 changed files with 225 additions and 17 deletions.
12 changes: 9 additions & 3 deletions .github/workflows/test_translate.yml
@@ -6,9 +6,15 @@ on:
      - main
      - dev
      - feat/*

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python-version: [3.8]

    runs-on: ${{ matrix.os }}

    steps:
      - name: Check Out Code
@@ -17,7 +23,7 @@ jobs:
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.8 # Specify your desired Python version
          python-version: ${{ matrix.python-version }}

      - name: Install Dependencies
        run: |
@@ -27,4 +33,4 @@
      - name: Run Unit Tests
        run: |
          python -m unittest discover -s tests -p "*_test.py"
        working-directory: ./
        working-directory: ./
3 changes: 3 additions & 0 deletions providers/__init__.py
@@ -0,0 +1,3 @@
from .base_provider import Provider
from .google_provider import GoogleProvider
from .multiple_providers import MultipleProviders
49 changes: 49 additions & 0 deletions providers/base_provider.py
@@ -0,0 +1,49 @@
from typing import Union, List, Any
from abc import ABC, abstractmethod
from types import SimpleNamespace


class Provider(ABC):
    """
    Base class that every provider must inherit from; implement your own provider by subclassing this class.
    """
    @abstractmethod
    def __init__(self):
        self.translator = None

    @abstractmethod
    def _do_translate(self, input_data: Union[str, List[str]], src: str, dest: str, **kwargs) -> Union[str, List[str], Any]:
        raise NotImplementedError("The function _do_translate has not been implemented.")

    def translate(self, input_data: Union[str, List[str]], src: str, dest: str) -> Union[SimpleNamespace, List[SimpleNamespace]]:
        """
        Translate input_data from one language to another.
        :param input_data: The input data (a string or a list of strings)
        :param src: The source language of input_data
        :param dest: The target language to translate input_data into
        :return: A SimpleNamespace object or a list of SimpleNamespace objects with a 'text' attribute
        """

        # Type check for input_data
        if not isinstance(input_data, (str, list)):
            raise TypeError(f"input_data must be of type str or List[str], not {type(input_data).__name__}")

        if isinstance(input_data, list) and not all(isinstance(item, str) for item in input_data):
            raise TypeError("All elements of input_data list must be of type str")

        # Ensure the translator is set
        assert self.translator, "Please assign the translator object instance to self.translator"

        # Perform the translation
        translated_instance = self._do_translate(input_data, src=src, dest=dest)

        # Wrap a non-list result in SimpleNamespace if it doesn't have a 'text' attribute
        if not isinstance(translated_instance, list):
            if not hasattr(translated_instance, 'text'):
                return SimpleNamespace(text=translated_instance)
        else:
            # Wrap each item in the list in SimpleNamespace if the item doesn't have a 'text' attribute
            return [SimpleNamespace(text=item) if not hasattr(item, 'text') else item for item in translated_instance]

        return translated_instance

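For reference, a minimal sketch of what a custom provider built on this base class could look like. The UpperCaseProvider name and its fake "translation" are hypothetical and only illustrate the _do_translate contract and the automatic SimpleNamespace wrapping performed by Provider.translate; they are not part of this commit.

from typing import Union, List, Any
from providers import Provider


class UpperCaseProvider(Provider):
    """Hypothetical provider that 'translates' by upper-casing the input (illustration only)."""

    def __init__(self):
        # Provider.translate asserts that self.translator is set, so assign any truthy callable here.
        self.translator = str.upper

    def _do_translate(self, input_data: Union[str, List[str]], src: str, dest: str, **kwargs) -> Union[str, List[str], Any]:
        if isinstance(input_data, list):
            return [self.translator(text) for text in input_data]
        return self.translator(input_data)


if __name__ == '__main__':
    provider = UpperCaseProvider()
    # Plain strings come back wrapped in SimpleNamespace, so results expose a 'text' attribute.
    print(provider.translate("hello", src="en", dest="en").text)                    # -> HELLO
    print([t.text for t in provider.translate(["a", "b"], src="en", dest="en")])    # -> ['A', 'B']
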
36 changes: 36 additions & 0 deletions providers/google_provider.py
@@ -0,0 +1,36 @@
import sys
from typing import Union, List, Any
sys.path.insert(0, r'/')
from googletrans import Translator
from .base_provider import Provider


# https://github.com/ssut/py-googletrans
# This is the most reliable provider, as it calls the translation API directly instead of relying on web crawling
class GoogleProvider(Provider):
    def __init__(self):
        self.translator = Translator()

    def _do_translate(self, input_data: Union[str, List[str]], src: str, dest: str, **kwargs) -> Union[str, List[str], Any]:
        """
        translate(text, dest='en', src='auto', **kwargs)
        Translate text from the source language to the destination language.
        Parameters:
            text (UTF-8 str; unicode; string sequence (list, tuple, iterator, generator)) – The source text(s) to be translated. Batch translation is supported via sequence input.
            dest (str; unicode) – The language to translate the source text into. The value should be one of the language codes listed in googletrans.LANGUAGES or one of the language names listed in googletrans.LANGCODES.
            src (str; unicode) – The language of the source text. The value should be one of the language codes listed in googletrans.LANGUAGES or one of the language names listed in googletrans.LANGCODES. If a language is not specified, the system will attempt to identify the source language automatically.
        Return type: Translated (a list when a list is passed, otherwise a single object)
        """

        return self.translator.translate(input_data, src=src, dest=dest)


if __name__ == '__main__':
    test = GoogleProvider()
    print(test.translate("Hello", src="en", dest="vi").text)
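A usage sketch, assuming googletrans==3.1.0a0 is installed and the service is reachable from your network; the batch call relies on the sequence support noted in the docstring above, and the example strings are illustrative rather than taken from the repository.

from providers import GoogleProvider

provider = GoogleProvider()

# Single string: googletrans already returns an object with a 'text' attribute.
print(provider.translate("Hello", src="en", dest="vi").text)

# Batch input: a list of strings comes back as a list of translated objects.
results = provider.translate(["Hello", "Goodbye"], src="en", dest="vi")
print([r.text for r in results])
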
75 changes: 75 additions & 0 deletions providers/multiple_providers.py
@@ -0,0 +1,75 @@
import sys
sys.path.insert(0, r'/')
from typing import Union, List
import translators as ts
from .base_provider import Provider


# https://github.com/UlionTse/translators
# This library is not as reliable a provider as googletrans; use it if you want to try out other translation services
class MultipleProviders(Provider):
    def __init__(self, cache: bool = False):
        self.translator = ts
        self.config = {
            "translator": "baidu",
            "timeout": 10.0,
            "if_ignore_empty_query": True
        }
        if cache:
            _ = self.translator.preaccelerate_and_speedtest()  # Optional. Caching sessions in advance can help improve access speed.

    def _do_translate(self, input_data: Union[str, List[str]], src: str, dest: str) -> Union[str, List[str]]:
        """
        translate_text(query_text: str, translator: str = 'bing', from_language: str = 'auto', to_language: str = 'en', **kwargs) -> Union[str, dict]
        :param query_text: str, must.
        :param translator: str, default 'bing'.
        :param from_language: str, default 'auto'.
        :param to_language: str, default 'en'.
        :param if_use_preacceleration: bool, default False.
        :param **kwargs:
            :param is_detail_result: bool, default False.
            :param professional_field: str, default None. Supported by alibaba(), baidu(), caiyun(), cloudTranslation(), elia(), sysTran(), youdao(), volcEngine() only.
            :param timeout: float, default None.
            :param proxies: dict, default None.
            :param sleep_seconds: float, default 0.
            :param update_session_after_freq: int, default 1000.
            :param update_session_after_seconds: float, default 1500.
            :param if_use_cn_host: bool, default False. Supported by google(), bing() only.
            :param reset_host_url: str, default None. Supported by google(), yandex() only.
            :param if_check_reset_host_url: bool, default True. Supported by google(), yandex() only.
            :param if_ignore_empty_query: bool, default False.
            :param limit_of_length: int, default 20000.
            :param if_ignore_limit_of_length: bool, default False.
            :param if_show_time_stat: bool, default False.
            :param show_time_stat_precision: int, default 2.
            :param if_print_warning: bool, default True.
            :param lingvanex_mode: str, default 'B2C', choose from ("B2C", "B2B").
            :param myMemory_mode: str, default "web", choose from ("web", "api").
        :return: str or dict
        """
        # This provider does not support batch translation, so translate each string in the list individually
        if isinstance(input_data, list):
            translated_data = []
            for text in input_data:
                translated_text = self.translator.translate_text(text, from_language=src, to_language=dest, **self.config)
                translated_data.append(translated_text)
        else:
            translated_data = self.translator.translate_text(input_data, from_language=src, to_language=dest, **self.config)

        return translated_data


if __name__ == '__main__':
    test = MultipleProviders()
    print(test.translate("Hello", src="en", dest="vi").text)

"""
Supported languages:
['ach', 'afr', 'aka', 'alb', 'amh', 'ara', 'arg', 'arm', 'arq', 'asm', 'ast', 'auto', 'aym', 'aze', 'bak', 'bal', 'baq', 'bel', 'bem', 'ben', 'ber', 'bho', 'bis', 'bli', 'bos', 'bre', 'bul', 'bur', 'cat', 'ceb', 'chr', 'cht', 'chv',
 'cor', 'cos', 'cre', 'cri', 'cs', 'dan', 'de', 'div', 'el', 'en', 'eno', 'epo', 'est', 'fao', 'fil', 'fin', 'fra', 'fri', 'frm', 'frn', 'fry', 'ful', 'geo', 'gla', 'gle', 'glg', 'glv', 'gra', 'grn', 'guj', 'hak', 'hau', 'haw',
 'heb', 'hi', 'hil', 'hkm', 'hmn', 'hrv', 'ht', 'hu', 'hup', 'ibo', 'ice', 'id', 'ido', 'iku', 'ina', 'ing', 'it', 'jav', 'jp', 'kab', 'kah', 'kal', 'kan', 'kas', 'kau', 'kin', 'kir', 'kli', 'kok', 'kon', 'kor', 'kur', 'lag',
 'lao', 'lat', 'lav', 'lim', 'lin', 'lit', 'log', 'loj', 'los', 'ltz', 'lug', 'mac', 'mah', 'mai', 'mal', 'mao', 'mar', 'mau', 'may', 'mg', 'mlt', 'mot', 'nbl', 'nea', 'nep', 'nl', 'nno', 'nob', 'nor', 'nqo', 'nya', 'oci', 'oji',
 'ori', 'orm', 'oss', 'pam', 'pan', 'pap', 'ped', 'per', 'pl', 'pot', 'pt', 'pus', 'que', 'ro', 'roh', 'rom', 'ru', 'ruy', 'san', 'sco', 'sec', 'sha', 'sil', 'sin', 'sk', 'slo', 'sm', 'sme', 'sna', 'snd', 'sol', 'som', 'sot',
 'spa', 'src', 'srd', 'srp', 'sun', 'swa', 'swe', 'syr', 'tam', 'tat', 'tel', 'tet', 'tgk', 'tgl', 'th', 'tir', 'tr', 'tso', 'tua', 'tuk', 'twi', 'ukr', 'ups', 'urd', 'ven', 'vie', 'wel', 'wln', 'wol', 'wyw', 'xho', 'yid',
 'yor', 'yue', 'zaz', 'zh', 'zul']
"""
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
googletrans==3.1.0a0
translators
datasets
tqdm
20 changes: 17 additions & 3 deletions tests/eli5_qaconfig_test.py
@@ -35,14 +35,18 @@ def step4(self):

    def step5(self):
        try:
            self.translated_dataset = load_dataset("json", data_files=self.output_path, keep_in_memory=False)
            self.parsed_dataset = load_dataset("json", data_files=self.output_path, keep_in_memory=False)
            self.translated_dataset = load_dataset("json", data_files=self.output_path_translated, keep_in_memory=False)
        except Exception as e:
            raise SyntaxError("Invalid syntax for save function, the data output must be in the form of "
                              f"line-delimited json,\n Error message: {e}")

    def step6(self):
        self.assertEqual(len(self.translated_dataset['train']), len(self.parser.converted_data),
                         "The parsed translated dataset does not match the length of the parsed dataset")
        self.assertEqual(len(self.parsed_dataset['train']), len(self.parser.converted_data),
                         msg="The parsed dataset does not match the length of the converted data")
        self.assertAlmostEqualInt(len(self.translated_dataset['train']), len(self.parser.converted_data),
                                  msg="The translated dataset has too many failed translations and does not meet the length criterion of the parsed dataset",
                                  tolerance=50)

    def step7(self):
        if os.path.exists(self.output_path):
@@ -63,6 +67,16 @@ def test_steps(self):
            except Exception as e:
                self.fail(f"{step} failed ({type(e)}: {e})")

    def assertAlmostEqualInt(self, int1, int2, tolerance=1, msg=None):
        """
        Asserts that two integers are almost equal within a specified tolerance range.
        """
        if abs(int1 - int2) > tolerance:
            standard_msg = f"{int1} and {int2} are not almost equal within a tolerance of {tolerance}."
            if msg:
                standard_msg = f"{msg}: {standard_msg}"
            raise self.failureException(standard_msg)


if __name__ == '__main__':
    unittest.main()
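A self-contained sketch of how the new tolerance-based assertion behaves; the dataset lengths below are hypothetical and only illustrate why a tolerance of 50 is allowed for the translated split.

import unittest


class ToleranceDemoTest(unittest.TestCase):
    """Illustration only: mirrors the assertAlmostEqualInt helper added in the tests above."""

    def assertAlmostEqualInt(self, int1, int2, tolerance=1, msg=None):
        if abs(int1 - int2) > tolerance:
            standard_msg = f"{int1} and {int2} are not almost equal within a tolerance of {tolerance}."
            if msg:
                standard_msg = f"{msg}: {standard_msg}"
            raise self.failureException(standard_msg)

    def test_within_tolerance(self):
        # Hypothetical numbers: 1000 parsed examples, 970 survived translation.
        self.assertAlmostEqualInt(970, 1000, tolerance=50)

    def test_outside_tolerance(self):
        # A gap of 80 exceeds the tolerance of 50, so the check must fail.
        with self.assertRaises(self.failureException):
            self.assertAlmostEqualInt(920, 1000, tolerance=50)


if __name__ == '__main__':
    unittest.main()
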
20 changes: 17 additions & 3 deletions tests/eli5_test.py
@@ -36,14 +36,18 @@ def step4(self):

    def step5(self):
        try:
            self.translated_dataset = load_dataset("json", data_files=self.output_path, keep_in_memory=False)
            self.parsed_dataset = load_dataset("json", data_files=self.output_path, keep_in_memory=False)
            self.translated_dataset = load_dataset("json", data_files=self.output_path_translated, keep_in_memory=False)
        except Exception as e:
            raise SyntaxError("Invalid syntax for save function, the data output must be in the form of "
                              f"line-delimited json,\n Error message: {e}")

    def step6(self):
        self.assertEqual(len(self.translated_dataset['train']), len(self.parser.converted_data),
                         "The parsed translated dataset does not match the length of the parsed dataset")
        self.assertEqual(len(self.parsed_dataset['train']), len(self.parser.converted_data),
                         msg="The parsed dataset does not match the length of the converted data")
        self.assertAlmostEqualInt(len(self.translated_dataset['train']), len(self.parser.converted_data),
                                  msg="The translated dataset has too many failed translations and does not meet the length criterion of the parsed dataset",
                                  tolerance=50)

    def step7(self):
        if os.path.exists(self.output_path):
@@ -64,6 +68,16 @@ def test_steps(self):
            except Exception as e:
                self.fail(f"{step} failed ({type(e)}: {e})")

    def assertAlmostEqualInt(self, int1, int2, tolerance=1, msg=None):
        """
        Asserts that two integers are almost equal within a specified tolerance range.
        """
        if abs(int1 - int2) > tolerance:
            standard_msg = f"{int1} and {int2} are not almost equal within a tolerance of {tolerance}."
            if msg:
                standard_msg = f"{msg}: {standard_msg}"
            raise self.failureException(standard_msg)


if __name__ == '__main__':
    unittest.main()
26 changes: 18 additions & 8 deletions translator/data_parser.py
@@ -3,26 +3,28 @@
import json
import os
import random
import string
import sys
sys.path.insert(0, r'./')
from copy import deepcopy

import string
import threading
import warnings
import traceback
try:
    from google.colab import files
    IN_COLAB = True
except ImportError:
    IN_COLAB = False
from httpcore._exceptions import ConnectTimeout
from translators.server import TranslatorError
from typing import List, Dict, Union
from abc import abstractmethod
from tqdm.auto import tqdm

from concurrent.futures import ThreadPoolExecutor

from googletrans import Translator
from providers import Provider, GoogleProvider, MultipleProviders

from configs import BaseConfig, QAConfig, DialogsConfig
from .utils import force_super_call, ForceBaseCallMeta, timeit, have_internet
@@ -47,6 +49,7 @@ def __init__(self, file_path: str,
large_chunks_threshold: int = 20000, # Maximum number of examples distributed evenly across threads; any examples exceeding this threshold will be processed in a queue
max_list_length_per_thread: int = 3, # Maximum number of strings contained in a list handled by a single thread.
# If larger, the list is split into sub-lists and processed in parallel
translator: Provider = GoogleProvider,
source_lang: str = "en",
target_lang: str = "vi",
fail_translation_code: str="P1OP1_F" # Failure code for unexpected translation failures; entries marked with it can be removed later
@@ -84,10 +87,10 @@ def __init__(self, file_path: str,

self.converted_data_translated = None

self.translator = Translator
self.translator = translator

@property
def get_translator(self) -> Translator:
def get_translator(self) -> Provider:
return deepcopy(self.translator)()

@staticmethod
@@ -146,7 +149,7 @@ def post_translate_validate(self) -> None:
print(f"\nTotal data left after filtering fail translation: {len(post_validated_translate_data)}\n")
self.converted_data_translated = post_validated_translate_data

def __translate_per_key(self, example: Dict, translator: Translator = None, progress_idx: int = 0) -> Dict:
def __translate_per_key(self, example: Dict, translator: Provider = None, progress_idx: int = 0) -> Dict:
'''
This function loops through each key of one example and sends the value to __translate_texts if the value of the key is
under a certain threshold. If exceeded, it is sent to __sublist_multithread_translate
@@ -274,7 +277,7 @@ def flatten_list(nested_list):

def __translate_texts(self,
src_texts: Union[List[str], str],
translator: Translator = None,
translator: Provider = None,
sub_list_idx: int=None, # sub_list_idx passes index information through so results can be merged later by __sublist_multithread_translate
) -> Union[List[str], str, Dict[List[str], int]]:
'''
@@ -287,7 +290,14 @@ def extract_texts(obj):

try:
target_texts = translator_instance.translate(src_texts, src=self.source_lang, dest=self.target_lang)
except TypeError:
except (TypeError, TranslatorError):
# except Exception as exc:
# TODO: Move error handling for exceptions into each individual Provider

# Log the full stack trace of the exception
# traceback_str = ''.join(traceback.format_exception(None, exc, exc.__traceback__))
# tqdm.write(f"An exception occurred:\n{traceback_str}")

# TypeError likely due to gender-specific translation, which has no fix yet. Please refer to
# ssut/py-googletrans#260 for more info
if sub_list_idx is None:
@@ -319,7 +329,7 @@ def translate_converted(self,
def translate_converted(self,
en_data: List[str] = None,
desc: str = None,
translator: Translator = None,
translator: Provider = None,
large_chunk: List[str] = None) -> Union[None, List[str]]:
'''
This function supports multithreaded translation for large datasets
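A minimal sketch of the injection mechanism this diff introduces: DataParser now stores a Provider class rather than an instance, and get_translator deep-copies and instantiates it so each worker thread gets its own translator object. The make_translator helper below is illustrative, not part of the repository, and assumes the underlying translation services are reachable.

from copy import deepcopy

from providers import Provider, GoogleProvider, MultipleProviders


def make_translator(provider_cls: type = GoogleProvider) -> Provider:
    """Hypothetical helper mirroring what DataParser.get_translator does per thread."""
    return deepcopy(provider_cls)()


if __name__ == '__main__':
    google_translator = make_translator()                      # default provider
    fallback_translator = make_translator(MultipleProviders)   # alternative backend
    print(google_translator.translate("Hello", src="en", dest="vi").text)
    print(fallback_translator.translate("Hello", src="en", dest="vi").text)
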
