In [2]:
'''
Convert a speech-to-text transcript exported in QuickTime Text (QTtext) format into SRT subtitles.
Example input and the output it should produce:

input (Txt)
{QTtext} {font:Tahoma}
{plain} {size:20}
{timeScale:30}
{width:160} {height:32}
{timestamps:absolute} {language:0}
[00:00:11.15]
Toda a gente me conhece 
[00:00:14.01]

[00:00:14.01]
e têm muito respeito por mim, toda a gente.
[00:00:16.17]

[00:00:16.17]
Eu também tenho.
[00:00:17.20]

[00:00:21.01]
O meu pai era arrais da campanha,
[00:00:23.18]

[00:00:23.18]
da sardinha,
[00:00:25.04]

[00:00:25.04]
do carapau, lulas.
[00:00:26.19]

Output (SRT):

1
00:00:11,150 --> 00:00:14,010
Toda a gente me conhece

2
00:00:14,010 --> 00:00:16,170
e têm muito respeito por mim, toda a gente.

3
00:00:16,170 --> 00:00:17,200
Eu também tenho.

4
00:00:21,010 --> 00:00:23,180
O meu pai era arrais da campanha,

5
00:00:23,180 --> 00:00:25,040
da sardinha,

6
00:00:25,040 --> 00:00:26,190
do carapau, lulas.


'''

import os
import re

# Matches a timecode such as "[00:00:11.15]" at the start of a line.
_TIMECODE_PATTERN = re.compile(r'\[(\d{2}):(\d{2}):(\d{2})\.(\d{2})\]')

# Duration given to a cue that has text but no closing timecode after it.
_DEFAULT_CUE_DURATION_MS = 2000


def _timecode_to_ms(match):
    """Convert a matched '[HH:MM:SS.FF]' timecode to integer milliseconds.

    The two-digit fraction is multiplied by 10, i.e. it is read as hundredths
    of a second (".15" -> 150 ms).
    NOTE(review): the sample header declares {timeScale:30}; if the fraction is
    really a count of 1/30 s ticks, this scaling must change - confirm against
    the tool that exports these files.
    """
    hours, minutes, seconds, fraction = (int(group) for group in match.groups())
    return ((hours * 60 + minutes) * 60 + seconds) * 1000 + fraction * 10


def _ms_to_srt_time(total_ms):
    """Format integer milliseconds as an SRT timestamp 'HH:MM:SS,mmm'."""
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"


def convert_to_srt_corrected(content):
    """Convert a timecoded transcript into SRT-formatted text.

    The input is expected to contain cues laid out as::

        [HH:MM:SS.FF]      <- start timecode
        subtitle text      <- one or more lines
        [HH:MM:SS.FF]      <- end timecode

    Lines before the first timecode (the header) are ignored.

    Args:
        content: Full text of the transcript file.

    Returns:
        The SRT document as a string. Cues are numbered from 1 and separated
        by a blank line. Returns an empty string when no cue with text is
        found.

    Notes:
        * A timecode that is not followed by any text before the next timecode
          (or the end of the input) only closes the previous cue; it does not
          produce an empty cue of its own.
        * When a cue has text but no closing timecode, its end time is the
          start time plus two seconds.
    """
    lines = content.split('\n')
    # Match every line once so the scan below never re-runs the regex.
    timecodes = [_TIMECODE_PATTERN.match(line) for line in lines]
    cues = []

    for index, start in enumerate(timecodes):
        if start is None:
            continue

        # Collect the text lines between this timecode and the next one.
        # Blank lines are dropped because a blank line terminates an SRT cue.
        text_lines = []
        next_index = index + 1
        while next_index < len(lines) and timecodes[next_index] is None:
            stripped = lines[next_index].strip()
            if stripped:
                text_lines.append(stripped)
            next_index += 1

        if not text_lines:
            # This is a closing timecode (or an empty cue): nothing to emit.
            continue

        start_ms = _timecode_to_ms(start)
        if next_index < len(lines):
            end_ms = _timecode_to_ms(timecodes[next_index])
        else:
            end_ms = start_ms + _DEFAULT_CUE_DURATION_MS

        text = '\n'.join(text_lines)
        cues.append(
            f"{len(cues) + 1}\n"
            f"{_ms_to_srt_time(start_ms)} --> {_ms_to_srt_time(end_ms)}\n"
            f"{text}\n"
        )

    return '\n'.join(cues)

# Convert the content to SRT format
#srt_content = convert_to_srt_corrected(content)

# Display the first few lines of the converted content
#srt_content[:500]

def process_folder(folder_path, output_folder):
    """Convert every '.txt' transcript in a folder to an '.srt' file.

    Each regular file in ``folder_path`` whose name ends in '.txt' (any
    letter case) is read as UTF-8, converted with
    ``convert_to_srt_corrected`` and written as UTF-8 to ``output_folder``
    under the same name with the final extension replaced by '.srt'.
    Files are processed in sorted name order.

    Args:
        folder_path: Folder containing the input '.txt' files.
        output_folder: Folder to write the '.srt' files to. It is created
            if it does not exist.

    Returns:
        None. One status line is printed per file: "Processed: <name>" on
        success, or an error line when the file is not valid UTF-8 (that
        file is skipped and processing continues).
    """
    # Create the destination up front so the first write cannot fail on a
    # missing folder. An empty string means "current directory".
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # os.listdir order is arbitrary; sorting keeps the run reproducible.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        # Skip non-transcripts and directories that merely end in '.txt'.
        if not filename.lower().endswith('.txt') or not os.path.isfile(file_path):
            continue

        try:
            with open(file_path, 'r', encoding='utf-8') as source_file:
                content = source_file.read()
        except UnicodeDecodeError:
            print(f"Error processing {filename}: Unsupported character encoding")
            continue

        srt_content = convert_to_srt_corrected(content)

        # Only the last extension is replaced: 'clip.mov.txt' -> 'clip.mov.srt'.
        output_file_name = os.path.splitext(filename)[0] + '.srt'
        output_file_path = os.path.join(output_folder, output_file_name)

        with open(output_file_path, 'w', encoding='utf-8') as target_file:
            target_file.write(srt_content)

        print(f"Processed: {filename}")

# Example usage
# process_folder('/path/to/input/folder', '/path/to/output/folder')

# Convert every transcript in the input folder and save the SRT files to the
# output folder. These are machine-specific absolute paths; edit them before
# running the cell on another machine.
input_folder = 'D:/OneDrive/Job/TDM/20231128 - MyLand/0101'
output_folder = 'D:/OneDrive/Job/TDM/20231128 - MyLand/0202'
process_folder(input_folder, output_folder)


Processed: 1. MTMG_ALCÁCER DO SAL_AMOR POR ALCÁCER_HD@25.mov.txt
