In [None]:
!wget https://bit.ly/3GYiaed
!unzip "/content/3GYiaed" -d "/content/output_folder/"


In [None]:
!wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model


In [None]:
# Move the working directory one level up.
# NOTE(review): the Python cells in this notebook refer to their files by absolute
# /content/... paths, so they do not rely on this change — confirm whether it is still needed.
%cd ..

In [None]:
!pip3 install ctranslate2 fairseq sentencepiece langdetect

In [None]:
import ctranslate2
import sentencepiece as spm
import logging
from langdetect import detect


In [None]:
def setup_logging():
    """Configure the root logger to emit timestamped messages at INFO level.

    ``logging.basicConfig`` silently does nothing when the root logger already
    has a handler attached (for example when the hosting environment has
    pre-configured logging). ``force=True`` removes existing root handlers
    first, so the format and level below always take effect.
    """
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        level=logging.INFO,
        force=True,  # available since Python 3.8
    )

def load_sentencepiece_model(sp_model_path):
    """Create a SentencePiece processor and load the model file into it.

    Args:
        sp_model_path: Path of the SentencePiece model file to load.

    Returns:
        The ``spm.SentencePieceProcessor`` with the model loaded.

    Raises:
        Exception: Whatever ``load`` raised; it is logged at ERROR level and
            re-raised unchanged.
    """
    processor = spm.SentencePieceProcessor()
    try:
        processor.load(sp_model_path)
    except Exception as e:
        logging.error(f"Failed to load SentencePiece model: {e}")
        raise
    else:
        return processor

def process_file_in_batches(file_path, batch_size):
    """Yield the lines of a UTF-8 text file in lists of at most ``batch_size``.

    Every line has leading and trailing whitespace removed. Blank lines are kept
    (as empty strings), so positions in the batches match line positions in
    the file.

    Args:
        file_path: Path of the text file to read.
        batch_size: Maximum number of lines per yielded list; must be >= 1.

    Yields:
        Lists of stripped lines. The final list may be shorter than
        ``batch_size``; nothing is yielded for an empty file.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A non-positive size would never match the length check below and the
    # whole file would silently be returned as a single batch.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")

    batch = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            batch.append(line.strip())
            if len(batch) >= batch_size:
                yield batch
                batch = []
    # Flush the remaining lines that did not fill a complete batch.
    if batch:
        yield batch

def detect_languages(batch, fallback="en"):
    """Detect the language of every text in ``batch`` with ``langdetect``.

    ``langdetect.detect`` raises ``LangDetectException`` when a text offers no
    usable features (an empty string, for instance). Such texts are mapped to
    ``fallback`` instead, so one blank line no longer aborts the whole run.

    Args:
        batch: Iterable of strings.
        fallback: Language code returned for texts whose language cannot be
            detected.

    Returns:
        A list of language codes, one per input text and in the same order.
    """
    # Imported here so the function does not need an extra top-level import.
    from langdetect.lang_detect_exception import LangDetectException

    languages = []
    for text in batch:
        try:
            languages.append(detect(text))
        except LangDetectException:
            languages.append(fallback)
    return languages

def translate_batch(batch, languages, translator, sp, tgt_prefix):
    """Translate a batch of texts with a single call to the translator.

    Args:
        batch: Source texts.
        languages: Language code of each text, aligned with ``batch``. A
            regional suffix (as in ``"zh-cn"``) is dropped before the source
            token ``__<code>__`` is built.
        translator: Object exposing ``translate_batch(source, ...)`` that
            returns one result per source sentence, where ``result[0]['tokens']``
            is the best hypothesis.
        sp: SentencePiece-style processor providing ``encode`` and ``decode``.
        tgt_prefix: Target-language token, e.g. ``"__en__"``.

    Returns:
        One string per (text, language) pair, in input order. Texts that are
        empty or whitespace-only yield ``""`` without being sent to the model.
        For the others, the first ``len(tgt_prefix)`` characters of the decoded
        hypothesis are cut off, on the assumption that the decoded text starts
        with the target prefix token.

    Raises:
        Exception: Any error raised while encoding, translating or decoding;
            it is logged at ERROR level and re-raised unchanged.
    """
    try:
        pairs = list(zip(batch, languages))
        translations = [""] * len(pairs)

        # Blank texts have nothing to translate; keeping "" in their slot
        # preserves the alignment between input and output.
        pending = [
            (index, text, lang)
            for index, (text, lang) in enumerate(pairs)
            if text.strip()
        ]
        if not pending:
            return translations

        pieces_per_text = sp.encode([text for _, text, _ in pending], out_type=str)

        source = []
        for (_, _, lang), pieces in zip(pending, pieces_per_text):
            # Reduce regional codes such as "zh-cn" to the bare code ("zh")
            # before building the source-language token.
            lang_code = lang.split("-")[0]
            source.append([f"__{lang_code}__"] + pieces)
        target_prefix = [[tgt_prefix] for _ in source]

        # One call for the whole batch: the translator can then group the
        # sentences itself according to batch_type / max_batch_size instead of
        # being handed a single sentence at a time.
        results = translator.translate_batch(
            source,
            batch_type="tokens",
            max_batch_size=2024,
            beam_size=5,
            target_prefix=target_prefix,
        )

        for (index, _, _), result in zip(pending, results):
            best_tokens = result[0]['tokens']
            translations[index] = sp.decode(best_tokens)[len(tgt_prefix):]
        return translations
    except Exception as e:
        logging.error(f"Error during batch translation: {e}")
        raise

def translate_file_to_english(
    source_file_path,
    target_file_path,
    batch_size=1000,
    ct_model_path="/content/output_folder/m2m100_12b",
    sp_model_path="/content/spm.128k.model",
    device="cpu",
):
    """Translate a text file line by line into English and write the result.

    The source file is read in batches, the language of every line is detected,
    each batch is translated with the target token ``__en__`` and the stripped
    translations are written one per line.

    Args:
        source_file_path: UTF-8 text file to translate.
        target_file_path: Output file; created or overwritten.
        batch_size: Number of lines read and translated per batch.
        ct_model_path: Directory of the CTranslate2 model. Defaults to the
            location that was previously hard-coded.
        sp_model_path: SentencePiece model file. Defaults to the location that
            was previously hard-coded.
        device: Device passed to ``ctranslate2.Translator`` (default ``"cpu"``).
    """
    setup_logging()

    tgt_prefix = "__en__"

    logging.info("Loading SentencePiece model...")
    sp = load_sentencepiece_model(sp_model_path)

    logging.info("Initializing translator...")
    translator = ctranslate2.Translator(ct_model_path, device=device)

    logging.info("Processing and translating file in batches...")
    # Plain "w" is enough: the file is only written, never read back.
    with open(target_file_path, "w", encoding="utf-8") as target_file:
        for batch in process_file_in_batches(source_file_path, batch_size):
            languages = detect_languages(batch)
            translations = translate_batch(batch, languages, translator, sp, tgt_prefix)
            target_file.writelines(line.strip() + "\n" for line in translations)

    logging.info(f"Done! Target file saved at: {target_file_path}")

# Run the pipeline on the sample file with the default batch size.
# NOTE(review): no visible cell creates /content/small_czech.txt — presumably it is
# uploaded manually before this cell runs; confirm.
translate_file_to_english(
    source_file_path="/content/small_czech.txt",
    target_file_path="/content/translated_czech.txt",
)

