In [1]:
import stanza
from transformers import MarianTokenizer, MarianMTModel

LANGUAGE_TO_CODE = {
    "afrikaans": "af",
    "albanian": "sq",
    "amharic": "am",
    "arabic": "ar",
    "armenian": "hy",
    "assamese": "as",
    "aymara": "ay",
    "azerbaijani": "az",
    "bambara": "bm",
    "basque": "eu",
    "belarusian": "be",
    "bengali": "bn",
    "bhojpuri": "bho",
    "bosnian": "bs",
    "bulgarian": "bg",
    "catalan": "ca",
    "cebuano": "ceb",
    "chichewa": "ny",
    "chinese (simplified)": "zh-CN",
    "chinese (traditional)": "zh-TW",
    "corsican": "co",
    "croatian": "hr",
    "czech": "cs",
    "danish": "da",
    "dhivehi": "dv",
    "dogri": "doi",
    "dutch": "nl",
    "english": "en",
    "esperanto": "eo",
    "estonian": "et",
    "ewe": "ee",
    "filipino": "tl",
    "finnish": "fi",
    "french": "fr",
    "frisian": "fy",
    "galician": "gl",
    "georgian": "ka",
    "german": "de",
    "greek": "el",
    "guarani": "gn",
    "gujarati": "gu",
    "haitian creole": "ht",
    "hausa": "ha",
    "hawaiian": "haw",
    "hebrew": "iw",
    "hindi": "hi",
    "hmong": "hmn",
    "hungarian": "hu",
    "icelandic": "is",
    "igbo": "ig",
    "ilocano": "ilo",
    "indonesian": "id",
    "irish": "ga",
    "italian": "it",
    "japanese": "ja",
    "javanese": "jw",
    "kannada": "kn",
    "kazakh": "kk",
    "khmer": "km",
    "kinyarwanda": "rw",
    "konkani": "gom",
    "korean": "ko",
    "krio": "kri",
    "kurdish (kurmanji)": "ku",
    "kurdish (sorani)": "ckb",
    "kyrgyz": "ky",
    "lao": "lo",
    "latin": "la",
    "latvian": "lv",
    "lingala": "ln",
    "lithuanian": "lt",
    "luganda": "lg",
    "luxembourgish": "lb",
    "macedonian": "mk",
    "maithili": "mai",
    "malagasy": "mg",
    "malay": "ms",
    "malayalam": "ml",
    "maltese": "mt",
    "maori": "mi",
    "marathi": "mr",
    "meiteilon (manipuri)": "mni-Mtei",
    "mizo": "lus",
    "mongolian": "mn",
    "myanmar": "my",
    "nepali": "ne",
    "norwegian": "no",
    "odia (oriya)": "or",
    "oromo": "om",
    "pashto": "ps",
    "persian": "fa",
    "polish": "pl",
    "portuguese": "pt",
    "punjabi": "pa",
    "quechua": "qu",
    "romanian": "ro",
    "russian": "ru",
    "samoan": "sm",
    "sanskrit": "sa",
    "scots gaelic": "gd",
    "sepedi": "nso",
    "serbian": "sr",
    "sesotho": "st",
    "shona": "sn",
    "sindhi": "sd",
    "sinhala": "si",
    "slovak": "sk",
    "slovenian": "sl",
    "somali": "so",
    "spanish": "es",
    "sundanese": "su",
    "swahili": "sw",
    "swedish": "sv",
    "tajik": "tg",
    "tamil": "ta",
    "tatar": "tt",
    "telugu": "te",
    "thai": "th",
    "tigrinya": "ti",
    "tsonga": "ts",
    "turkish": "tr",
    "turkmen": "tk",
    "twi": "ak",
    "ukrainian": "uk",
    "urdu": "ur",
    "uyghur": "ug",
    "uzbek": "uz",
    "vietnamese": "vi",
    "welsh": "cy",
    "xhosa": "xh",
    "yiddish": "yi",
    "yoruba": "yo",
    "zulu": "zu",
}


In [2]:
source_language = "spanish"
target_language = "english"

In [3]:
stanza.download(source_language)
nlp = stanza.Pipeline(source_language)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-02-03 16:30:04 INFO: "spanish" is an alias for "es"
2024-02-03 16:30:04 INFO: Downloading default packages for language: es (Spanish) ...
2024-02-03 16:30:05 INFO: File exists: /home/xbankov/stanza_resources/es/default.zip
2024-02-03 16:30:10 INFO: Finished downloading models and saved to /home/xbankov/stanza_resources.
2024-02-03 16:30:10 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-02-03 16:30:10 INFO: "spanish" is an alias for "es"
2024-02-03 16:30:11 INFO: Loading these models for language: es (Spanish):
| Processor    | Package         |
----------------------------------
| tokenize     | ancora          |
| mwt          | ancora          |
| pos          | ancora_charlm   |
| lemma        | ancora_nocharlm |
| constituency | combined_charlm |
| depparse     | ancora_charlm   |
| sentiment    | tass2020        |
| ner          | conll02         |

2024-02-03 16:30:11 INFO: Using device: cpu
2024-02-03 16:30:11 INFO: Loading: tokenize
2024-02-03 16:30:12 INFO: Loading: mwt
2024-02-03 16:30:12 INFO: Loading: pos
2024-02-03 16:30:12 INFO: Loading: lemma
2024-02-03 16:30:12 INFO: Loading: constituency
2024-02-03 16:30:13 INFO: Loading: depparse
2024-02-03 16:30:13 INFO: Loading: sentiment
2024-02-03 16:30:13 INFO: Loading: ner
2024-02-03 16:30:14 INFO: Done loading processors!


In [4]:
source_code = LANGUAGE_TO_CODE[source_language]
target_code = LANGUAGE_TO_CODE[target_language]

model_name = f"Helsinki-NLP/opus-mt-{source_code}-{target_code}"

model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)



In [None]:
text = "https://unfccc.int/sites/default/files/resource/ARGENTINA_cop24cmp14cma1-3_SP.pdf"

In [None]:
# Break the text into sentences
sentences = nlp(text).sentences

# Translate each sentence individually
translated_sentences = []
for sentence in tqdm():
    batch = tokenizer.prepare_seq2seq_batch(src_texts=[sentence.text])
    gen = model.generate(**batch)
    words = tokenizer.batch_decode(gen, skip_special_tokens=True)

    translated_sentences.append(words)

# Join the translated sentences back together
translated_text = " ".join(translated_sentences)