In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI
import json
import re

# --- Configuration -----------------------------------------------------------
BATCHSIZE = 10       # number of subtitle cues grouped into one batch
LANG = "Hungarian"   # target language; used in the prompt and the output name
INPUT = R"d:\The.Day.of.the.Jackal.S01E01.1080p.10bit.WEBRip.6CH.x265.HEVC-PSA.srt"

# Build the output path by inserting "-<LANG>" in front of the ".srt"
# extension. When INPUT does not end in ".srt" (compared case-insensitively)
# the suffix is appended to the full name instead, so OUTPUT can never be the
# same path as INPUT and the source file is never overwritten.
_stem, _ext = os.path.splitext(INPUT)
if _ext.lower() == ".srt":
    OUTPUT = f"{_stem}-{LANG}.srt"
else:
    OUTPUT = f"{INPUT}-{LANG}.srt"
GPT_MODEL = "gpt-4o"

# --- API client --------------------------------------------------------------
# load_dotenv() is called before the key is read so that a value defined in a
# local .env file is visible through os.getenv.
load_dotenv()
KEY = os.getenv("API_KEY")  # None when API_KEY is not set
client = OpenAI(api_key=KEY)

# One SRT cue: a numeric counter line, a "start --> end" time-code line, then
# one or more non-empty text lines. Raw strings keep "\d" a regex escape rather
# than an (invalid) Python string escape. Each text line may end with a line
# break or with the end of the file (\Z), so the final cue is still captured in
# full when the file has no trailing newline.
cue_pattern = re.compile(
    r"(\d+)[\r\n]"
    r"(\d+:\d+:\d+,\d+ --> \d+:\d+:\d+,\d+)[\r\n]"
    r"((?:.+(?:[\r\n]|\Z))+)"
)

# Read the whole file and turn every matched cue into a dict. All three values
# are kept as strings exactly as they appear in the file; subtitleText keeps
# its line breaks.
with open(INPUT, 'r', encoding='utf-8') as f:
    subs = [
        {
            'sequenceNumber': number,
            'timeCode': time_code,
            'subtitleText': text,
        }
        for number, time_code, text in cue_pattern.findall(f.read())
    ]

# Split the cue list into consecutive batches of at most BATCHSIZE cues.
chunks = [subs[i:i + BATCHSIZE] for i in range(0, len(subs), BATCHSIZE)]

# The instruction and the response schema are the same for every request, so
# they are built once here instead of on every pass through the batch loop.
SYSTEM_PROMPT = f"Translate the provided subtitles into {LANG} and output as a structured JSON array. Each subtitle is separated by ###, with a unique sequence number. For each subtitle, maintain context, tone, cultural nuances, and naturally mirror the original content and format including line-breaks for proper screen-fit. Prioritize sequence integrity and consistency."

# JSON schema the reply must follow: {"results": [{"sequenceNumber": str,
# "subtitleText": str}, ...]} with no additional properties.
RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": "response",
        "schema": {
            "type": "object",
            "properties": {
                "results": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "sequenceNumber": {
                                "type": "string",
                                "description": "A numeric sequence counter for each subtitle entry."
                            },
                            "subtitleText": {
                                "type": "string",
                                "description": "The text of the subtitle."
                            }
                        },
                        "required": [
                            "sequenceNumber",
                            "subtitleText"
                        ],
                        "additionalProperties": False
                    }
                }
            },
            "additionalProperties": False,
            "required": ["results"]
        },
        "strict": True
    }
}

# Cues whose text starts with one of these prefixes are not sent for
# translation and are written out unchanged. '\u266a' is the music-note
# character; the second entry is the same character as it looks after its
# UTF-8 bytes were mis-decoded, kept so such files are still recognised.
MUSIC_PREFIXES = ('\u266a', 'â™ª')

with open(OUTPUT, 'w', encoding='utf-8') as t:

    for chunk in chunks:

        to_translate = [
            item for item in chunk
            if not item['subtitleText'].startswith(MUSIC_PREFIXES)
        ]

        # sequence number -> translated text for this batch. It stays empty
        # when the batch has nothing to translate, in which case no request is
        # made and every cue falls back to its original text below.
        translations = {}

        if to_translate:

            print(to_translate[0])  # progress output: first cue of the batch

            # Request body: "<number>\n<text>" per cue, cues separated by ###.
            to_translate_content = '\n###\n'.join(
                '\n'.join([item['sequenceNumber'], item['subtitleText']])
                for item in to_translate
            )

            completion = client.chat.completions.create(
                model=GPT_MODEL,
                messages=[
                    {
                        "role": "system",
                        "content": SYSTEM_PROMPT
                    },
                    {
                        "role": "user",
                        "content": to_translate_content
                    }
                ],
                response_format=RESPONSE_FORMAT
            )

            translated_chunk = json.loads(completion.choices[0].message.content)['results']

            # setdefault keeps the first entry if a sequence number is repeated
            # in the reply.
            for sub in translated_chunk:
                translations.setdefault(str(sub['sequenceNumber']), sub['subtitleText'])

        # Write every cue of the batch, including batches for which no request
        # was made, so that no cue is dropped from the output file. A missing
        # or empty translation falls back to the original text.
        for item in chunk:
            translated = translations.get(str(item['sequenceNumber']))
            text = translated if translated else item['subtitleText']

            t.write("%s\n" % item['sequenceNumber'])
            t.write("%s\n" % item['timeCode'])
            # Drop trailing line breaks so exactly one blank line follows.
            t.write("%s\n\n" % text.rstrip('\r\n'))
        
