# Translate
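Translate every Arabic sentence file in `clustering/sentences/` into English with the Mistral API and save one CSV per input file in the `translations/` subfolder.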

In [61]:
import os 
from mistralai import Mistral
from dotenv import load_dotenv
import pandas as pd
import requests
import time

# Pull the settings stored in the local .env file into the process environment.
load_dotenv()

# API key handed to the Mistral client (None when the variable is not set).
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")

In [62]:
# Translate every sentence file in `directory_path` and write one CSV per
# input file into its "translations" subfolder. Files that already have an
# output of the same name are skipped, so the cell can be re-run to resume.
# NOTE: `get_sentence_translation` lives in a separate notebook cell and must
# have been executed before this one.
directory_path = "clustering/sentences/"
translations_dir = os.path.join(directory_path, "translations")

# List the input folder first so a missing `directory_path` still fails
# immediately with FileNotFoundError instead of being silently created.
candidate_files = os.listdir(directory_path)

# On a first run the output folder does not exist yet; create it rather than
# crashing when it is listed or written to.
os.makedirs(translations_dir, exist_ok=True)

# Snapshot the finished outputs once: the listing is loop-invariant, and a
# set gives O(1) membership checks.
already_translated = set(os.listdir(translations_dir))

for file in candidate_files:
    file_path = os.path.join(directory_path, file)

    # Skip "-1.txt" (presumably the noise/unclustered bucket -- confirm),
    # anything that is not a regular file (e.g. the "translations" folder
    # itself) and inputs that were translated by a previous run.
    if file == "-1.txt" or not os.path.isfile(file_path) or file in already_translated:
        continue

    print(file_path)

    # The prompt below treats the content as Arabic, so read it with an
    # explicit encoding instead of the platform default (assumes the files
    # are UTF-8). The handle is closed before any network call is made.
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()

    # Sentences are separated by blank lines. Strip before filtering so that
    # whitespace-only chunks are dropped instead of being sent to the API.
    sentences = [s.strip() for s in data.split("\n\n")]
    sentences = [s for s in sentences if s]
    print(len(sentences))

    translations = []
    for s in sentences:
        msgs = [{"role": "user", "content": f"Translate the following Arabic text into English. Only respond with the translation, not additional comments. ONLY THE TRANSLATION. Also, stick as closely to the original Arabic text as possible: {s}"}]
        res = get_sentence_translation(msgs)
        translations.append(res.choices[0].message.content)
        # Pause between requests (presumably to stay under the API rate
        # limit -- confirm).
        time.sleep(1)

    # Building the frame from a dict also works for a file with no sentences,
    # where assigning column names to an empty frame would raise.
    df = pd.DataFrame({"original_sentence": sentences, "translation": translations})
    # The output keeps the input's file name (including ".txt") because the
    # skip check above matches on that exact name.
    df.to_csv(os.path.join(translations_dir, file))

clustering/sentences/8.txt
14
clustering/sentences/30.txt
5
clustering/sentences/68.txt
6
clustering/sentences/10.txt
11
clustering/sentences/39.txt
9
clustering/sentences/55.txt
19
clustering/sentences/65.txt
10
clustering/sentences/31.txt
33
clustering/sentences/53.txt
7
clustering/sentences/23.txt
20
clustering/sentences/73.txt
23
clustering/sentences/12.txt
10
clustering/sentences/40.txt
13
clustering/sentences/64.txt
9
clustering/sentences/38.txt
5
clustering/sentences/4.txt
9
clustering/sentences/72.txt
11
clustering/sentences/14.txt
9
clustering/sentences/36.txt
7
clustering/sentences/51.txt
24
clustering/sentences/48.txt
31
clustering/sentences/76.txt
9
clustering/sentences/62.txt
17
clustering/sentences/3.txt
31
clustering/sentences/22.txt
24
clustering/sentences/41.txt
12
clustering/sentences/74.txt
52
clustering/sentences/56.txt
5
clustering/sentences/49.txt
20
clustering/sentences/35.txt
8
clustering/sentences/61.txt
13
clustering/sentences/37.txt
33
clustering/sentences/52

In [20]:
from typing import List

# Get translations of sentences
def get_sentence_translation(messages_list: List[dict], retries=3, backoff_factor=2):
    """Send one chat request to the Mistral model, retrying on request errors.

    Args:
        messages_list: Chat messages to send; the caller in this notebook
            passes dicts with "role" and "content" keys.
        retries: Maximum number of attempts.
        backoff_factor: Base of the exponential delay; after failed attempt
            ``k`` (0-based) the function waits ``backoff_factor ** k``
            seconds before trying again.

    Returns:
        The response object returned by ``client.chat.complete``.

    Raises:
        RuntimeError: If no attempt succeeded. The last caught error, if any,
            is attached as ``__cause__``.
    """
    model = "mistral-saba-latest"

    client = Mistral(api_key=MISTRAL_API_KEY)

    last_error = None
    for attempt in range(retries):
        try:
            return client.chat.complete(
                model=model,
                messages=messages_list,
            )
        # NOTE(review): only `requests` exceptions trigger a retry. If the
        # Mistral SDK does not use `requests` for transport (it appears to be
        # httpx-based -- confirm), its errors propagate on the first failure
        # and this retry loop never engages.
        except requests.exceptions.RequestException as e:
            last_error = e
            print(f"Attempt {attempt + 1} failed: {e}")
            # Back off only when another attempt follows; sleeping after the
            # final failure would just delay the error.
            if attempt < retries - 1:
                time.sleep(backoff_factor ** attempt)
    # RuntimeError is still caught by callers that handle `Exception`.
    raise RuntimeError("Failed to get translation after multiple attempts") from last_error