In [35]:
import openai
import typing
from langdetect import detect, LangDetectException
from pathlib import Path
from openai.types.chat import ChatCompletion
import itertools
import json
import os
import random
from tqdm import tqdm
from itertools import islice

import asyncio
import aiofiles 
from openai import AsyncOpenAI

In [22]:
def validate_openai_translation(
    completion: ChatCompletion,
    target_lang: str = 'en',
    min_completion_tokens: int = 100
) -> bool:
    try:
        if not completion or not completion.choices:
            return False

        first_choice = completion.choices[0]

        if first_choice.finish_reason != 'stop':
            return False

        if not completion.usage or completion.usage.completion_tokens < min_completion_tokens:
            return False

        content = first_choice.message.content
        if not content:
            return False

        try:
            answer_lang = detect(content)
            if target_lang.lower() != answer_lang.lower():
                return False
        except LangDetectException:
            return False

    except AttributeError as e:
        return False
    
    except Exception as e:
        print(e)
        return False

    return True


In [23]:
system_prompt = {
  "en": "Translate the text provided by the user after the `###` delimiter into English. Output only the translated text.",
  "de": "Übersetzen Sie den Text, den der Benutzer nach dem `###`-Trennzeichen bereitstellt, ins Deutsche. Geben Sie nur den übersetzten Text aus.",
  "fr": "Traduisez le texte fourni par l'utilisateur après le délimiteur `###` en français. Ne retournez que le texte traduit.",
  "zh-CN": "将用户在 `###` 分隔符后提供的文本翻译成简体中文。只输出翻译后的文本。",
  "es": "Traduzca el texto proporcionado por el usuario después del delimitador `###` al español. Devuelva únicamente el texto traducido.",
  "it": "Traduci il testo fornito dall'utente dopo il delimitatore `###` in italiano. Restituisci solo il testo tradotto.",
  "pl": "Przetłumacz tekst dostarczony przez użytkownika po ograniczniku `###` na język polski. Zwróć tylko przetłumaczony tekst.",
  "ro": "Traduceți textul furnizat de utilizator după delimitatorul `###` în limba română. Afișați doar textul tradus.",
  "ja": "ユーザーが`###`デリミタの後に提供したテキストを日本語に翻訳してください。翻訳されたテキストのみを出力してください。",
  "hu": "Fordítsa le a felhasználó által a `###` elválasztójel után megadott szöveget magyarra. Csak a lefordított szöveget adja vissza.",
  "cs": "Přeložte text poskytnutý uživatelem po oddělovači `###` do češtiny. Vypište pouze přeložený text.",
  "tr": "Kullanıcı tarafından `###` sınırlayıcısından sonra sağlanan metni Türkçe'ye çevirin. Yalnızca çevrilmiş metni çıktı olarak verin.",
  "nl": "Vertaal de tekst die de gebruiker na het `###`-scheidingsteken verstrekt naar het Nederlands. Geef alleen de vertaalde tekst terug.",
  "th": "โปรดแปลข้อความที่ผู้ใช้ป้อนหลังตัวคั่น `###` เป็นภาษาไทย และแสดงผลเฉพาะข้อความที่แปลแล้วเท่านั้น",
  "id": "Terjemahkan teks yang diberikan oleh pengguna setelah pembatas `###` ke dalam bahasa Indonesia. Hanya keluarkan teks yang sudah diterjemahkan.",
  "vi": "Dịch văn bản do người dùng cung cấp sau dấu phân cách `###` sang tiếng Việt. Chỉ xuất ra văn bản đã dịch.",
  "ko": "`###` 구분 기호 뒤에 사용자가 제공한 텍스트를 한국어로 번역하십시오. 번역된 텍스트만 출력하십시오."
}
openai.api_key = 'aib_machinetranslation_a6a051'
openai.base_url = "https://pre-openai-keys.alibaba-inc.com"
model_name= "gpt-4o-mini"
possible_langs = list(system_prompt.keys())

In [6]:
def openai_translate_one(lang, content):
    completion = openai.chat.completions.create(

        model=model_name,

        messages=[
        {"role": "system", "content": system_prompt[lang]},
        {
        "role": "user",
        "content": "###" +  content +  "###"
        },
        ],
        #max_tokens=100,
        temperature=0.7,
        top_p=0.9,
        frequency_penalty=0.0,
        presence_penalty=0.0,
        n=1
    )
    if validate_openai_translation(completion, lang, 100):
        return completion.choices[0].message.content.replace("#", ""), completion.usage.total_tokens
    else:
        print("garbage response")
        return None, 0


In [7]:
example = "Coming as it did at a period of exceptional dullness. It attracted perhaps rather more attention than it deserved, but it offered to the public that mixture of the whimsical and the tragic which is most stimulating to the popular imagination. Interest drooped, however, when, after weeks of fruitless investigation, it was found that no final explanation of the facts was forthcoming and the tragedy seemed from that time. It would be as well, perhaps, that i should refresh their memories as to the singular facts upon which this commentary is founded. These facts were briefly as follows: at five o'clock on the evening of the eighteenth of march, In the year already mentioned, a train left euston station for manchester. It was a rainy, squally day which grew wilder as it progressed, so it was by no means the weather in which anyone would travel who was not driven to do so by necessity. The train, however, is a favourite one among manchester business men who are returning from town, for it does the journey in four hours and twenty minutes, with only three stoppages upon the way. In spite of the inclement evening, it was therefore fairly well filled. Upon the occasion of which i speak, the guard of the train was a tried servant of the company, a man who had worked for twenty two years without a blemish or complaint. His name was john palmer. The station clock was upon the stroke of five and the guard was about to give the customary signal to the engine driver when he observed two belated passengers hurrying down the platform. The one was An exceptionally tall man dressed in a long black overcoat with astrakhan collar and cuffs. I have already said that the evening was an inclement one and the tall traveller had the high, warm collar. Turned up to protect his throat against the bitter march wind. He appeared, as far as the guard could judge by so hurried an inspection, to be a man between fifty and sixty years of age. Which outpaced the gentleman beside her. She wore a long fawn coloured dust cloak, a black close fitting toque and a dark veil which concealed the greater part of her face. The two might very well have passed as father and daughter. They walked swiftly down the line of carriages, glancing in at the windows, until the guard, john palmer, overtook them. Now then, sir, look sharp, the train is going, said he. First class the man answered. The guard turned the handle of the nearest door in the carriage which he had opened, there sat a small man with a cigar in his mouth. His appearance seems to have impressed itself upon the guard's memory, for he was prepared afterwards. To describe or to identify him. He was a man of thirty four or thirty five years of age, dressed in some grey material, sharp nosed, alert, with a ruddy weather, beaten face and a small. He glanced up as the door was opened. The tall man paused with his foot upon the step. This is a smoking compartment. The lady dislikes smoke. All right, here you are, sir, said john palmer. He slammed the door of the smoking carriage, opened that of the next one which was empty, and thrust the two travellers in. At the same moment he sounded his whistle. And the wheels of the train began to move. The man with the cigar was at the window of his carriage and said something to the guard as he rolled past him, but the words were lost in the bustle of the departure. Palmer stepped into the guard's van. As it came up to him and thought no more of the incident. Twelve minutes after its departure, the train reached willesden junction, where it stopped for a very short interval. An examination of the tickets has made it certain. That no one either joined or left it at this time and no passenger was seen to alight upon the platform. At five fourteen the journey to manchester was resumed and rugby was reached at six fifty. The express being five minutes late at rugby, the attention of the station officials was drawn to the fact that the door of one of the first class carriages was open. An examination of that compartment and of its neighbour. Disclosed a remarkable state of affairs: the smoking carriage in which the short red faced man. Save for a half smoked cigar, there was no trace whatever of its recent occupant. The door of this carriage was fastened in the next compartment, to which attention had been originally drawn. There was no sign either of the gentleman with the astrakhan collar or of the young lady who accompanied him. All three passengers had disappeared. On the other hand, there was found upon the floor of this carriage. The one in which the tall traveller and the lady had been a young man, fashionably dressed and of elegant appearance. He lay with his knees. An elbow upon either seat, a bullet had penetrated his heart, and his death must have been instantaneous. No one had seen such a man enter the train, and no railway ticket was found in his pocket. As what had occurred to the three people who had started an hour and a half before from willesden in those two compartments. I have said that there was no personal property which might help to identify him. But it is true that there was one peculiarity about this unknown young man which was much commented upon at the time: in his pockets were found no fewer than six valuable gold watches. Three in the various pockets of his waist coat, one in his ticket pocket. And that this was his plunder was discounted by the fact that all six were of american make and of a type which is rare in england. Three of them bore the mark of the rochester watchmaking company. And the small one, which was highly jewelled and ornamented, was from tiffany of new york. The other contents of his pocket consisted of an ivory knife with a corkscrew by rodgers of sheffield, a small circular mirror. One inch in diameter, a readmission slip to the lyceum theatre, a silver box full of vesta matches and a brown leather cigar case containing two cheroots. Also two pounds fourteen shillings in money. It was clear then that whatever motives may have led to his death, robbery was not among them. As already mentioned, there were no markings upon the man's linen. Which appeared to be new, and no tailor's name upon his coat. In appearance he was young, short, smooth, cheeked and delicately featured one of his front teeth. The two compartments in question was uncoupled and side tracked. Then, on the arrival of inspector vane of scotland yard and of mister henderson, a detective in the service of the railway company, an exhaustive inquiry. Was made, into all the circumstances, that crime had been committed was certain. The bullet, which appeared to have come from a small pistol or revolver, had been fired from some little distance. As there was no scorching of the clothes, no weapon was found in the compartment which finally disposed of the theory of suicide, nor was there any sign of the brown leather bag which the guard had seen in the hand of the tall gentleman. Could get out of the train and one other get in during the unbroken run between willesden and rugby. John palmer, the guard was able at the inquest to give some evidence which threw a little light upon the matter. There was a spot between tring and cheddington, according to his statement, where, on account of some repairs to the line, The train had, for a few minutes, slowed down to a pace not exceeding eight or ten miles an hour. At that place, it might be possible for a man, or even for an exceptionally active woman, to have left the train without serious injury. It was true that a gang of platelayers was there and that they had seen nothing, but it was their custom to stand in the middle between the metals, and the open carriage door was upon the far side. So that it was conceivable that someone might have alighted unseen as the darkness would by that time be drawing in a steep embankment- would instantly screen anyone who sprang out from the observation of the navvies. A careful examination of the line between willesden and rugby resulted in one discovery which might or might not have a bearing upon the tragedy near tring, at the very place where the train slowed down. There was found at the bottom of the embankment a small pocket testament, very shabby and worn. It was printed by the bible society of london and bore an inscription from john to alice. Upon the fly leaf. Underneath was written james. Eighteen, fifty nine, and beneath that again, edward november first, eighteen sixty nine, all the entries being in the same handwriting. This was the only clue, if it could be called a clue. Which was solid enough to form the basis for a profitable investigation. It would be a mistake, however. On the contrary, the press both in england and in america teemed with suggestions and suppositions, most of which were obviously absurd. The fact that the watches were of american make. And some peculiarities in connection with the gold stopping of his front tooth appeared to indicate that the deceased was a citizen of the united states, though his linen clothes and boots were undoubtedly of british manufacture. It was surmised by some that he was concealed under the seat and that, being discovered he was for some reason, possibly because he had overheard their guilty secrets put to death by his fellow passengers. When coupled with generalities as to the ferocity and cunning of anarchical and other secret societies, this theory sounded as plausible as any. The fact that he should be without a ticket would be consistent."
openai_translate_one('zh-CN', example)

('这件事发生在一个异常乏味的时期，因此可能吸引了比它应得的更多的关注，但它向公众提供了一种奇异与悲剧的混合，这对大众想象力最具刺激性。然而，经过几周的无果调查，当发现没有最终的事实解释时，兴趣便开始减退，悲剧似乎从那时起就开始了。也许我应该刷新一下他们的记忆，回顾一下这篇评论所依据的奇特事实。这些事实简要如下：在我提到的那年三月十八日下午五点，一列火车从尤斯顿车站开往曼彻斯特。那是一个下着雨、风暴交加的日子，随着时间的推移天气变得更加恶劣，因此在这种天气下，除了出于必要，没人会选择出行。然而，这列火车在曼彻斯特的商人中非常受欢迎，因为它只需四小时二十分钟就能完成旅程，中途仅停靠三次。尽管傍晚天气恶劣，但车厢内仍然相对满员。在我提到的那次出行中，列车的列车员是一位为公司工作了二十二年的老员工，他没有任何污点或投诉。他的名字是约翰·帕尔默。车站的钟敲响五点，列车员正准备向机车司机发出常规信号时，他注意到两名迟到的乘客急匆匆地朝站台走来。其中一位是一位身材特别高大的男士，穿着一件带有阿斯特拉罕毛领和袖口的黑色长外套。我已经提到过，傍晚的天气很糟糕，而这位高个旅客的高领子翻起来，以保护他的喉咙免受刺骨的三月寒风的侵袭。根据列车员匆忙的观察，他似乎是一位五十至六十岁之间的男士。与他同行的女士则显得相对矮小，她穿着一件长长的淡褐色防尘斗篷，戴着一顶黑色贴身小帽，脸上蒙着一层黑色面纱，遮住了大部分面容。这两人很可能被误认为是父女。他们迅速沿着车厢走，朝窗户里望去，直到列车员约翰·帕尔默追上了他们。 “快点，先生，火车要开了，”他说。 “一等座，”那位男士回答。列车员转动了最近一扇门的把手，打开了车厢，里面坐着一位嘴里叼着雪茄的小个子男子。他的外貌似乎在列车员的记忆中留下了深刻印象，因为他后来能够描述或识别出他。这位男士大约三十四或三十五岁，穿着一些灰色的材料，尖鼻子，机警，脸色红润，显得风吹日晒。他在门打开时抬头看了一眼。高个子男士停在了台阶上。“这是吸烟车厢。女士不喜欢烟雾。” “好的，您在这里，”约翰·帕尔默说。他关上了吸烟车厢的门，打开了下一个空车厢的门，把两位旅客推了进去。与此同时，他吹响了口哨，火车的轮子开始移动。那位叼着雪茄的男子在他的车厢窗边对列车员说了些什么，但由于出发时的喧闹，话语被淹没了。帕尔默走进了列车员的货车，随即不再关注这一事件。在离开后十二分钟，火车

In [24]:
DATA_PATH = Path("../datasets/LongSpeech/all_audios.jsonl")
OUTPUT_PATH = Path("../datasets/LongSpeechQA/translation.jsonl")
BATCH_SIZE = 100  
CONCURRENCY_LIMIT = 500
semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)


In [25]:
def stream_from_file(path, max_lines=-1):
    if not (path.exists() and path.is_file()):
        print(f"⚠️ 警告: 文件不存在或路径不是一个文件 - {path}")

    try:
        with open(path, 'r', encoding='utf-8') as f:
            lines_to_process = f  
            if max_lines != -1:
                lines_to_process = itertools.islice(f, max_lines)
            
            yield from lines_to_process
            
    except Exception as e:
        print(f"读取文件 '{path}' 时发生错误: {e}")


In [26]:
def batch_iterator(iterator, batch_size):
    while True:
        batch = list(islice(iterator, batch_size))
        if not batch:
            return
        yield batch

In [37]:
async def async_openai_translate_one(client, target_lang, content, original_record_info):

    async with semaphore:
        try:
            messages = [
                {"role": "system", "content": system_prompt.get(target_lang, f"Translate to {target_lang}")},
                {"role": "user", "content": "###" + content + "###"},
            ]


            completion = await client.chat.completions.create(
                model=model_name,      
                messages=messages,
                temperature=0.7,
                top_p=0.9
            )
            

            translated = completion.choices[0].message.content.replace("#", "")
            tokens_used = completion.usage.total_tokens
            
            return {
                "status": "success",
                "original_info": original_record_info,
                "target_lang": target_lang,
                "translated_text": translated,
                "tokens_used": tokens_used
            }
        except Exception as e:

            return {
                "status": "error",
                "original_info": original_record_info,
                "target_lang": target_lang,
                "error": e
            }

In [33]:
async def batch_processing():
    total_tokens_used = 0
    total_lines_processed = 0
    
    total_lines = 8411

    async_client = AsyncOpenAI(
        api_key='aib_machinetranslation_a6a051',
        base_url= "https://pre-openai-keys.alibaba-inc.com"
    )

    data_iter = stream_from_file(DATA_PATH, -1)
    filtered_iter = (line for line in data_iter if ("librispeech" in line or "tedlium" in line))

    async with aiofiles.open(OUTPUT_PATH, "w", encoding="utf-8") as outfile:
        with tqdm(total=total_lines, desc="Translating Batches") as progress_bar:
            for batch in batch_iterator(filtered_iter, BATCH_SIZE):
                tasks = []
                for line in batch:
                    try:
                        ljs = json.loads(line.strip())
                        source_lang = ljs.get('language')
                        content = ljs.get('transcribe')
                        wav_id = ljs.get('id')

                        if not all([source_lang, content, wav_id]): continue
                        
                        original_record_info = {
                            "source_lang": source_lang,
                            "wav_path": str(Path(DATA_PATH.parent / 'wavs' / (wav_id + '.wav')).resolve())
                        }
                        
                        target_langs = random.sample([lang for lang in possible_langs if lang != source_lang], 5)
                        
                        for target_lang in target_langs:
                            
                            task = async_openai_translate_one(async_client, target_lang, content, original_record_info)
                            tasks.append(task)
                    except (json.JSONDecodeError, AttributeError):
                        continue

                if not tasks: continue
                
                results = await asyncio.gather(*tasks)

                for result in results:
                    if result["status"] == "success":
                        total_tokens_used += result["tokens_used"]
                        
                        record = {
                            "source_lang": result["original_info"]["source_lang"],
                            "target_lang": result["target_lang"],
                            "content": result["translated_text"],
                            "wav_path": result["original_info"]["wav_path"]
                        }
                        await outfile.write(json.dumps(record, ensure_ascii=False) + '\n')
                    else:
                        print(f"A task failed for {result['original_info']} -> {result['target_lang']}: {result['error']}")

                progress_bar.update(len(batch))
                progress_bar.set_postfix({"total_tokens": f"{total_tokens_used:,}", "concurrency": CONCURRENCY_LIMIT})


In [38]:
await batch_processing()    

Translating Batches:  50%|████▉     | 4200/8411 [1:35:55<2:14:12,  1.91s/it, total_tokens=105,161,685, concurrency=500]

A task failed for {'source_lang': 'en', 'wav_path': '/mnt/workspace/renyi/datasets/LongSpeech/wavs/004133.wav'} -> zh-CN: Connection error.


Translating Batches:  80%|███████▉  | 6700/8411 [2:28:28<33:48,  1.19s/it, total_tokens=162,041,532, concurrency=500]  

A task failed for {'source_lang': 'en', 'wav_path': '/mnt/workspace/renyi/datasets/LongSpeech/wavs/006642.wav'} -> hu: Expecting value: line 1 column 1 (char 0)


Translating Batches:  81%|████████  | 6800/8411 [2:30:43<33:08,  1.23s/it, total_tokens=164,232,555, concurrency=500]

A task failed for {'source_lang': 'en', 'wav_path': '/mnt/workspace/renyi/datasets/LongSpeech/wavs/006755.wav'} -> vi: Expecting value: line 1 column 1 (char 0)


Translating Batches:  88%|████████▊ | 7400/8411 [2:42:29<22:12,  1.32s/it, total_tokens=176,286,358, concurrency=500]

A task failed for {'source_lang': 'en', 'wav_path': '/mnt/workspace/renyi/datasets/LongSpeech/wavs/007303.wav'} -> vi: Expecting value: line 1 column 1 (char 0)


Translating Batches: 100%|██████████| 8411/8411 [3:06:33<00:00,  1.33s/it, total_tokens=197,096,990, concurrency=500]


In [40]:
amount = 0
with open(OUTPUT_PATH, "r", encoding="utf-8") as outfile:
    for line in outfile:
        amount +=1

amount



42051

In [16]:
"""
total_tokens_used = 0 

try:
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
except FileNotFoundError:
    print(f"Error: Input file not found at {DATA_PATH}")
    total_lines = 0

test_lines = 10
with open(OUTPUT_PATH, "w", encoding="utf-8") as outfile:
    data_iter = stream_from_file(DATA_PATH, -1)
    filtered_iter = (line for line in data_iter if ("librispeech" in line or "tedlium" in line))
    progress_bar = tqdm(filtered_iter, total=8411, desc="Translating Records")

    for line in progress_bar:
        line = line.strip()
        if not line:
            continue
        try:
            ljs = json.loads(line)
        except json.JSONDecodeError:
            continue
            
        source_lang = ljs.get('language')
        content = ljs.get('transcribe')
        wav_id = ljs.get('id')

        if not all([source_lang, content, wav_id]):
            continue

        wav_path = Path(DATA_PATH.parent / 'wavs' / (wav_id + '.wav')).resolve()
        
        sampled_lang = [x for x in possible_langs if x != source_lang]
    
        target_langs = random.sample(sampled_lang, 5)

        for target_lang in target_langs:

            translated, tokens_used = openai_translate_one(target_lang, content)
            
            total_tokens_used += tokens_used
            
            record = {
                "source_lang": source_lang,
                "target_lang": target_lang,
                "content": translated,
                "wav_path": str(wav_path)
            }
            outfile.write(json.dumps(record, ensure_ascii=False) + '\n')
            progress_bar.set_postfix({"total_tokens": f"{total_tokens_used:,}"})
"""

Translating Records:   0%|          | 8/8411 [26:07<457:23:28, 195.95s/it, total_tokens=202,826]


KeyboardInterrupt: 