From 9aa7f12b7f2d41098e3c7ea1d933e8507118465e Mon Sep 17 00:00:00 2001 From: zackees Date: Tue, 9 May 2023 21:58:37 -0700 Subject: [PATCH] fix exception that happens because of PT language --- README.md | 167 ++++++++---------------------------- src/video_subtitles/cli.py | 11 +-- src/video_subtitles/gui.py | 10 ++- src/video_subtitles/run.py | 2 +- src/video_subtitles/util.py | 44 +++++++++- 5 files changed, 87 insertions(+), 147 deletions(-) diff --git a/README.md b/README.md index 3de4204..cd3339a 100644 --- a/README.md +++ b/README.md @@ -66,144 +66,51 @@ It should now be installed. # Language Reference -#### Short hand language codes +### Language Inputs -``` -af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da, -de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi, -hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb, -ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn, -no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq, -sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi, -yi,yo,zh -``` +We use openai whisper for language input. 
See the Whisper documentation for the full list of supported languages. -#### Full Language codes will also work -``` -Afrikaans,Albanian,Amharic,Arabic,Armenian,Assamese,Azerbaijani, -Bashkir,Basque,Belarusian,Bengali,Bosnian,Breton,Bulgarian,Burmese, -Castilian,Catalan,Chinese,Croatian,Czech,Danish,Dutch,English, -Estonian,Faroese,Finnish,Flemish,French,Galician,Georgian,German, -Greek,Gujarati,Haitian,Haitian Creole,Hausa,Hawaiian,Hebrew,Hindi, -Hungarian,Icelandic,Indonesian,Italian,Japanese,Javanese,Kannada, -Kazakh,Khmer,Korean,Lao,Latin,Latvian,Letzeburgesch,Lingala, -Lithuanian,Luxembourgish,Macedonian,Malagasy,Malay,Malayalam, -Maltese,Maori,Marathi,Moldavian,Moldovan,Mongolian,Myanmar,Nepali, -Norwegian,Nynorsk,Occitan,Panjabi,Pashto,Persian,Polish,Portuguese, -Punjabi,Pushto,Romanian,Russian,Sanskrit,Serbian,Shona,Sindhi, -Sinhala,Sinhalese,Slovak,Slovenian,Somali,Spanish,Sundanese,Swahili, -Swedish,Tagalog,Tajik,Tamil,Tatar,Telugu,Thai,Tibetan,Turkish, -Turkmen,Ukrainian,Urdu,Uzbek,Valencian,Vietnamese,Welsh,Yiddish, -Yoruba -``` +### Language outputs -# Language code -> cononical name +We use the DeepL API for translation.
The language list is as follows: ``` -{ - "af": "Afrikaans", - "am": "Amharic", - "ar": "Arabic", - "as": "Assamese", - "az": "Azerbaijani", - "ba": "Bashkir", - "be": "Belarusian", - "bg": "Bulgarian", - "bn": "Bengali", - "bo": "Tibetan", - "br": "Breton", - "bs": "Bosnian", - "ca": "Catalan", - "cs": "Czech", - "cy": "Welsh", - "da": "Danish", - "de": "German", - "el": "Greek", - "en": "English", - "es": "Spanish", - "et": "Estonian", - "eu": "Basque", - "fa": "Persian", - "fi": "Finnish", - "fo": "Faroese", - "fr": "French", - "gl": "Galician", - "gu": "Gujarati", - "ha": "Hausa", - "haw": "Hawaiian", - "he": "Hebrew", - "hi": "Hindi", - "hr": "Croatian", - "ht": "Haitian Creole", - "hu": "Hungarian", - "hy": "Armenian", - "id": "Indonesian", - "is": "Icelandic", - "it": "Italian", - "ja": "Japanese", - "jw": "Javanese", - "ka": "Georgian", - "kk": "Kazakh", - "km": "Khmer", - "kn": "Kannada", - "ko": "Korean", - "la": "Latin", - "lb": "Luxembourgish", - "ln": "Lingala", - "lo": "Lao", - "lt": "Lithuanian", - "lv": "Latvian", - "mg": "Malagasy", - "mi": "Maori", - "mk": "Macedonian", - "ml": "Malayalam", - "mn": "Mongolian", - "mr": "Marathi", - "ms": "Malay", - "mt": "Maltese", - "my": "Burmese", - "ne": "Nepali", - "nl": "Dutch", - "nn": "Norwegian Nynorsk", - "no": "Norwegian", - "oc": "Occitan", - "pa": "Punjabi", - "pl": "Polish", - "ps": "Pashto", - "pt": "Portuguese", - "ro": "Romanian", - "ru": "Russian", - "sa": "Sanskrit", - "sd": "Sindhi", - "si": "Sinhalese", - "sk": "Slovak", - "sl": "Slovene", - "sn": "Shona", - "so": "Somali", - "sq": "Albanian", - "sr": "Serbian", - "su": "Sundanese", - "sv": "Swedish", - "sw": "Swahili", - "ta": "Tamil", - "te": "Telugu", - "tg": "Tajik", - "th": "Thai", - "tk": "Turkmen", - "tl": "Tagalog", - "tr": "Turkish", - "tt": "Tatar", - "uk": "Ukrainian", - "ur": "Urdu", - "uz": "Uzbek", - "vi": "Vietnamese", - "yi": "Yiddish", - "yo": "Yoruba", - "zh": "Chinese", -} + BG - Bulgarian + CS - Czech + DA - Danish + DE - 
German + EL - Greek + EN - English (unspecified variant for backward compatibility; please select EN-GB or EN-US instead) + EN-GB - English (British) + EN-US - English (American) + ES - Spanish + ET - Estonian + FI - Finnish + FR - French + HU - Hungarian + ID - Indonesian + IT - Italian + JA - Japanese + KO - Korean + LT - Lithuanian + LV - Latvian + NB - Norwegian (Bokmål) + NL - Dutch + PL - Polish + PT-BR - Portuguese (Brazilian) + PT-PT - Portuguese (all Portuguese varieties excluding Brazilian Portuguese) + RO - Romanian + RU - Russian + SK - Slovak + SL - Slovenian + SV - Swedish + TR - Turkish + UK - Ukrainian + ZH - Chinese (simplified) ``` +Please see [https://www.deepl.com/docs-api/translate-text/](https://www.deepl.com/docs-api/translate-text/) for more information # Windows diff --git a/src/video_subtitles/cli.py b/src/video_subtitles/cli.py index 4af232e..7359fd1 100644 --- a/src/video_subtitles/cli.py +++ b/src/video_subtitles/cli.py @@ -12,10 +12,10 @@ from video_subtitles.say import say from video_subtitles.settings import Settings from video_subtitles.util import ( - LANGUAGE_CODES, MODELS, GraphicsInfo, ensure_transcribe_anything_installed, + parse_languages, query_cuda_video_cards, ) @@ -24,15 +24,6 @@ settings = Settings() -def parse_languages(languages_str: str) -> list[str]: - """Parse a comma-separated list of languages and return a list of language codes.""" - languages = languages_str.split(",") - for language in languages: - if language not in LANGUAGE_CODES: - raise argparse.ArgumentTypeError(f"Invalid language code: {language}") - return languages - - def ensure_dependencies() -> list[GraphicsInfo]: """Ensure that dependencies are installed.""" cuda_cards = query_cuda_video_cards() diff --git a/src/video_subtitles/gui.py b/src/video_subtitles/gui.py index 4172e9c..44ae95a 100644 --- a/src/video_subtitles/gui.py +++ b/src/video_subtitles/gui.py @@ -18,6 +18,7 @@ QLabel, QLineEdit, QMainWindow, + QMessageBox, QVBoxLayout, QWidget, ) @@ 
-26,7 +27,7 @@ from video_subtitles.say import say from video_subtitles.settings import Settings from video_subtitles.thread_processor import ThreadProcessor -from video_subtitles.util import MODELS +from video_subtitles.util import MODELS, parse_languages settings = Settings() @@ -152,7 +153,12 @@ def dropEvent(self, event): files = [u.toLocalFile() for u in event.mimeData().urls()] deepl_api_key = self.deepl_input.text().strip() # get api key from input field model = self.model_select.currentText().strip() - languages = self.output_text.text().strip().split(",") + try: + languages = parse_languages(self.output_text.text()) + except ValueError as ve: + QMessageBox.critical(self, "Error", str(ve)) + return + languages = [lang.strip() for lang in languages] convert_to_webvtt = self.webvtt_select.currentText().strip() == "WEBVTT" for f in files: diff --git a/src/video_subtitles/run.py b/src/video_subtitles/run.py index 0fe5b94..52b91e7 100644 --- a/src/video_subtitles/run.py +++ b/src/video_subtitles/run.py @@ -67,7 +67,7 @@ def run( # pylint: disable=too-many-locals,too-many-branches,too-many-statement api_key=deepl_api_key, in_srt=src_srt_file, out_srt=out_file, - from_lang="en", + from_lang="en-us", # whisper ai is always en-us to_lang=language, ) assert os.path.exists( diff --git a/src/video_subtitles/util.py b/src/video_subtitles/util.py index 9630558..63761f0 100644 --- a/src/video_subtitles/util.py +++ b/src/video_subtitles/util.py @@ -1,5 +1,7 @@ """Utilities for video_subtitles.""" +# pylint: disable=line-too-long + import os import subprocess import tempfile @@ -71,15 +73,49 @@ def ensure_transcribe_anything_installed() -> None: ) +def parse_languages(languages_str: str) -> list[str]: + """Parse a comma-separated list of languages and return a list of language codes.""" + languages = languages_str.split(",") + languages = [language.strip().lower() for language in languages] + for language in languages: + if language not in LANGUAGE_CODES: + raise 
ValueError(f"Unknown language: {language}") + return languages + + LANGUAGE_CODES = { - "en": "English", + "bg": "Bulgarian", + "cs": "Czech", + "da": "Danish", + "de": "German", + "el": "Greek", + "en": "English (unspecified variant for backward compatibility; please select EN-GB or EN-US instead)", + "en-gb": "English (British)", + "en-us": "English (American)", "es": "Spanish", + "et": "Estonian", + "fi": "Finnish", "fr": "French", - "de": "German", + "hu": "Hungarian", + "id": "Indonesian", "it": "Italian", - "pt": "Portuguese", + "ja": "Japanese", + "ko": "Korean", + "lt": "Lithuanian", + "lv": "Latvian", + "nb": "Norwegian (Bokmål)", + "nl": "Dutch", + "pl": "Polish", + "pt-br": "Portuguese (Brazilian)", + "pt-pt": "Portuguese (all Portuguese varieties excluding Brazilian Portuguese)", + "ro": "Romanian", "ru": "Russian", - "zh": "Chinese", + "sk": "Slovak", + "sl": "Slovenian", + "sv": "Swedish", + "tr": "Turkish", + "uk": "Ukrainian", + "zh": "Chinese (simplified)", } MODELS = {