-
Notifications
You must be signed in to change notification settings - Fork 1
/
translate.py
185 lines (149 loc) · 9.77 KB
/
translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
import os
import time
import math
import asyncio
import logging
import traceback
import requests
import pysrt
import spacy
import openai
from typing import Optional
from pydantic import BaseModel
from fastapi import FastAPI, HTTPException
from aiolimiter import AsyncLimiter
from rich.logging import RichHandler
from rich.console import Console
from tenacity import retry, stop_after_attempt
# Application object that the HTTP routes below are registered on
app = FastAPI()

# Send every log record through Rich; rich_tracebacks=True lets the handler
# render exception tracebacks itself
logging.basicConfig(
    format="%(message)s",
    level=logging.DEBUG,
    handlers=[RichHandler(rich_tracebacks=True)],
)
log = logging.getLogger("rich")

# Rich console object (kept for progress monitoring)
console = Console()
# Pydantic Models
class TranslationRequest(BaseModel):
    """Request body accepted by the subtitle translation endpoint."""

    # Complete text of the SRT file to translate
    srt_content: str
    # Language the subtitle text is written in (inserted into the prompt)
    source_language: str
    # Language to translate into (inserted into the prompt, sent to DeepL)
    target_language: str
class TranslationResponse(BaseModel):
    """Response body returned by the subtitle translation endpoint."""

    # Translated SRT text; empty string when the request failed
    translated_srt_content: str
    # "success" or "failure"
    status: str
    # Text of the exception on failure; None otherwise
    error_message: Optional[str] = None
# Service credentials are read from the environment; 'xxx' is only a
# placeholder used when the variable is unset
openai.api_key = os.environ.get('OPENAI_API_KEY', 'xxx')
deepl_api_key = os.environ.get('DEEPL_API_KEY', 'xxx')

# Rate limiter: MAX_CALLS (default 20) acquisitions per 1-second period
max_calls = int(os.environ.get('MAX_CALLS', 20))
rate_limiter = AsyncLimiter(max_rate=max_calls, time_period=1)

# spaCy English pipeline (NLP model for sentence tokenization)
nlp = spacy.load("en_core_web_sm")
async def translate_with_openai(chunk, source_language, target_language):
    """Translate one chunk of indexed subtitle lines with the OpenAI chat API.

    Args:
        chunk: Lines already prefixed with ``"1) "``, ``"2) "``, ``"3) "``;
            they are joined with newlines and sent as the final user message.
        source_language: Interpolated into the system prompt.
        target_language: Interpolated into the system prompt.

    Returns:
        A list of at least three strings: the reply split on the first two
        newlines (so any further lines stay in the third element), padded
        with empty strings when the reply has fewer than three lines.

    Raises:
        Whatever the OpenAI client raises; the caller uses that to switch to
        its fallback service.
    """
    # NOTE(review): the system prompt and the few-shot examples hard-code
    # Turkish even though target_language is a parameter -- confirm whether
    # other target languages are meant to be supported.
    messages = [
        {"role": "system", "content": f"You are an AI model specializing in subtitle translation. Your task is to translate three lines of text from a source language to a target language, while retaining the original tone and context. Make sentences short and clear for the target audience, who are young people favoring concise sentences. You can slightly deviate from the original text without altering its meaning too much. Do not modify any timestamps, personal information, or disrupt the original format.\r\n\r\nHere's the input format:\r\n\r\n```\r\n1) first string to translate\r\n2) second string to translate\r\n3) third string to translate\r\n```\r\n\r\nAnd the expected output:\r\n\r\n```\r\n1) translated first string\r\n2) translated second string\r\n3) translated third string\r\n```\r\n\r\nWhile translating, keep in mind:\r\n\r\n- Source language for the translation: {source_language}\r\n- Target language for the translation: {target_language}\r\n\r\nFrom now on, the 'user' will provide the text directly in three lines. You should return your translation preserving its format. If the user\u2019s input isn't three lines, return at least 3 lines that can be even a new line without content but the output must be having three lines. Use plain Turkish that everyone can understand in translation. Please try to understand the context between three lines and respect to flow of subtitle by deeply understanding context before translating."},
        {"role": "user", "content": "1) I think you guys know me and obviously the prime minister, but if you guys could introduce yourselves and say a bit about you and, you've done both done amazing things, so please don't be shy.\r\n2) Greg Brockman from OpenAI: Sure thing.\r\n3) So I'm Greg Brockman."},
        # The original literal contained "\/", a JSON-style escape that Python
        # keeps as a literal backslash (and warns about); a plain "/" is meant.
        {"role": "assistant", "content": "1) San\u0131r\u0131m beni ve tabii ki ba\u015Fbakan\u0131 tan\u0131yorsunuz. Ama siz kendinizi tan\u0131tabilir ve hakk\u0131n\u0131zda biraz bilgi verebilir misiniz? \u0130kiniz de harika \u015Feyler ba\u015Fard\u0131n\u0131z, l\u00FCtfen \u00E7ekinmeden konu\u015Fun.\r\n2) Greg Brockman / OpenAI'dan: Tabii ki!\r\n3) Evet, ben Greg Brockman. "},
        {"role": "user", "content": "1) Greg Brockman from OpenAI: did it.\r\n2) We literally had an intern in 20, 2016.\r\n3) So our very first summer who we had this conversation about."},
        {"role": "assistant", "content": "1) OpenAI'dan Greg Brockman: biz yapm\u0131\u015Ft\u0131k!\r\n2) 2016'da tam 20 ya\u015F\u0131nda bi tane harbi stajyerimiz vard\u0131\r\n3) Bu muhabbeti yapt\u0131\u011F\u0131m\u0131zda OpenAI'daki ilk yaz aylar\u0131m\u0131zd\u0131"},
        {"role": "user", "content": "1) The Max's book takes you to the existential question of whether, you\r\n2) project basically machine intelligence or human intelligence into the cosmos,\r\n3) human intelligence turned into machine intelligence into the cosmos and so on."},
        {"role": "assistant", "content": "1) Max'\u0131n kitab\u0131, bize varolu\u015Fumuzla ilgili \u00E7ok ilgin\u00E7 bir soru soruyor.\r\n2) Biz evrene robot zekas\u0131 m\u0131, yoksa insan zekas\u0131 m\u0131 b\u0131rakaca\u011F\u0131z? \r\n3) Yoksa insan zekas\u0131 bundan sonra tamamen robot zakas\u0131 m\u0131 demek olacak?"},
        {"role": "user", "content": "\n".join(chunk)},
    ]

    # Use the awaitable variant: the synchronous `create` would block the
    # event loop for the whole HTTP round trip, so chunk tasks started
    # together would effectively run one after another.
    response = await openai.ChatCompletion.acreate(
        model="gpt-3.5-turbo-16k",
        messages=messages,
    )

    content = response['choices'][0]['message']['content']
    response_lines = content.split("\n", 2)
    # Ensure there are at least three lines
    response_lines.extend([""] * (3 - len(response_lines)))
    return response_lines
def translate_with_deepl(chunk, target_language, timeout=30):
    """Translate a chunk of subtitle lines with the DeepL free API.

    The lines are joined with newlines, sent as a single ``text`` value, and
    the translated text is split on newlines again.

    Args:
        chunk: List of subtitle lines (without index prefixes).
        target_language: Sent as DeepL's ``target_lang`` form field.
        timeout: Seconds to wait for DeepL before giving up, so a stalled
            connection cannot hang the caller indefinitely.

    Returns:
        The translated text split on ``"\\n"``; the number of lines is
        whatever DeepL returned and may differ from ``len(chunk)``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    data = {'text': "\n".join(chunk), 'target_lang': target_language}
    headers = {'Authorization': 'DeepL-Auth-Key ' + deepl_api_key}
    response = requests.post(
        'https://api-free.deepl.com/v2/translate',
        headers=headers,
        data=data,
        timeout=timeout,
    )
    # Fail with the real HTTP error (bad key, quota exceeded, ...) instead of
    # an opaque KeyError/JSON error when the body is not a translation.
    response.raise_for_status()
    return response.json()['translations'][0]['text'].split("\n")
def _strip_index_prefix(sentence):
    """Remove a leading ``"N) "`` index from a translated line, if present."""
    head, separator, tail = sentence.partition(') ')
    # Only treat the text before ") " as an index when it is purely numeric;
    # otherwise a sentence such as "Hello (world) again" would lose its start.
    if separator and head.strip().isdigit():
        return tail
    stripped = sentence.strip()
    # A bare "3)" is an index with no content behind it.
    if stripped.endswith(')') and stripped[:-1].isdigit():
        return ""
    return sentence


def _fit_to_three(lines, filler):
    """Return exactly three lines: extras are dropped, gaps get *filler*."""
    fitted = list(lines[:3])
    fitted.extend([filler] * (3 - len(fitted)))
    return fitted


async def translate_chunk(chunk, source_language, target_language):
    """Translate up to three subtitle lines, falling back to DeepL on error.

    The lines are numbered ("1) ", "2) ", ...) before being sent to OpenAI
    and the numbering is stripped from the reply. If anything in the OpenAI
    path raises, the un-numbered lines are translated with DeepL instead.

    Args:
        chunk: List of subtitle lines to translate.
        source_language: Passed to the OpenAI translator.
        target_language: Passed to both translators.

    Returns:
        A list of exactly three strings. Missing lines are "" on the OpenAI
        path and "Translation not available" on the DeepL path; surplus
        lines are dropped so later chunks stay aligned.

    Raises:
        Any exception raised by the DeepL fallback.
    """
    async with rate_limiter:
        try:
            # Prepend each sentence with its index
            indexed_chunk = [f"{i+1}) {sentence}" for i, sentence in enumerate(chunk)]
            log.debug(f"Translating chunk: {indexed_chunk}")  # Log the chunk
            translated_chunk = await translate_with_openai(indexed_chunk, source_language, target_language)
            # Ensure there are exactly three lines
            translated_chunk = _fit_to_three(translated_chunk, "")
            # Strip the index from each translated sentence
            translated_chunk = [_strip_index_prefix(sentence) for sentence in translated_chunk]
            log.debug(f"Translated chunk: {translated_chunk}")  # Log the translated chunk
            return translated_chunk
        except Exception as e:
            log.error(f"Error with OpenAI API: {str(e)}. Switching to fallback service: DeepL")
            # translate_with_deepl is synchronous; run it in the default
            # executor so the event loop keeps serving the other chunks.
            # Do not prepend the index for DeepL.
            loop = asyncio.get_running_loop()
            translated_chunk = await loop.run_in_executor(
                None, translate_with_deepl, chunk, target_language
            )
            # Ensure there are exactly three lines
            translated_chunk = _fit_to_three(translated_chunk, "Translation not available")
            log.debug(f"Translated chunk with DeepL: {translated_chunk}")  # Log the chunk translated with DeepL
            return translated_chunk
def _fit_to_length(lines, expected, filler="..."):
    """Return exactly *expected* lines: extras are dropped, gaps get *filler*."""
    fitted = list(lines[:expected])
    fitted.extend([filler] * (expected - len(fitted)))
    return fitted


@app.post("/subtitle-translate", response_model=TranslationResponse)
async def translate_subtitle(request: TranslationRequest):
    """Translate every text line of an SRT document and return the new SRT.

    The subtitle text is split into individual lines, the lines are grouped
    into chunks of three, and all chunks are translated concurrently. The
    translated lines are then written back into the subtitles, keeping each
    subtitle's original number of lines.

    Args:
        request: SRT content plus source and target language.

    Returns:
        A TranslationResponse with status "success" and the translated SRT,
        or status "failure" with an empty SRT and the exception text in
        ``error_message``. Exceptions are reported in the body, not raised.
    """
    chunk_size = 3
    try:
        log.debug("Received translation request")  # Log the request
        start_time = time.time()
        subs = pysrt.from_string(request.srt_content, encoding='iso-8859-1')

        # Keep the lines grouped per subtitle: a subtitle may span several
        # lines, so the flat line index is not the subtitle index.
        lines_per_sub = [sub.text.split('\n') for sub in subs]
        sentences = [line for lines in lines_per_sub for line in lines]
        log.debug(f"Split sentences into {len(sentences)} chunks")

        # One task per chunk of three lines, all run concurrently
        chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]
        tasks = [
            asyncio.create_task(
                translate_chunk(chunk, request.source_language, request.target_language)
            )
            for chunk in chunks
        ]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        translated_sentences = []
        for i, (chunk, result) in enumerate(zip(chunks, results)):
            if isinstance(result, BaseException):
                log.error(f"Failed to translate chunk #{i}: {result}")
                result = ["..."] * len(chunk)  # Replace the failed translation with "..."
            elif len(result) < len(chunk) or len(result) > chunk_size:
                log.error("Mismatch in number of sentences in the translated text.")
            # Force each chunk to its own input length so one odd reply
            # cannot shift the lines of every following subtitle.
            translated_sentences.extend(_fit_to_length(result, len(chunk)))

        # Hand each subtitle back as many translated lines as it had before
        position = 0
        for sub, original_lines in zip(subs, lines_per_sub):
            count = len(original_lines)
            sub.text = '\n'.join(translated_sentences[position:position + count])
            position += count

        translated_srt_content = '\n'.join([str(sub) for sub in subs])
        log.debug(f"Translated SRT content: {translated_srt_content}")  # Log the translated SRT content
        log.info(f"Translation completed in {time.time() - start_time} seconds.")

        # Save state when the subtitle count is a multiple of 10.
        # UTF-8 is set explicitly: with the platform default encoding a
        # non-Latin translation could raise here and turn a successful
        # translation into a "failure" response.
        if len(subs) % 10 == 0:
            with open('state.txt', 'w', encoding='utf-8') as f:
                f.write(f"Current position: {len(subs)}\n")
                f.write("Translated text so far:\n" + translated_srt_content)

        return TranslationResponse(
            translated_srt_content=translated_srt_content,
            status="success"
        )
    except Exception as e:
        log.error(f"An error occurred: {str(e)}")
        log.debug(f"Exception details: {traceback.format_exc()}")
        return TranslationResponse(
            translated_srt_content="",
            status="failure",
            error_message=str(e)
        )