In [1]:
import pandas as pd
import re
from transformers import pipeline
from tqdm import tqdm

In [2]:
def parse_srt_file(srt_file_path):
    with open(srt_file_path, 'r') as f:
        srt_data = f.read()

    srt_regex = r'(\d+)\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n(.+?(?=\n\d|$))'

    srt_matches = re.findall(srt_regex, srt_data, re.DOTALL)
    srt_data_list = [(int(m[0]), m[1], m[2], m[3].replace('\n', ' ')) for m in srt_matches]

    return pd.DataFrame(srt_data_list, columns=['id', 'start_time', 'end_time', 'sentence'])

In [4]:
srt_file_path = 'd01_data\english_subtitles.srt'
df = parse_srt_file(srt_file_path)
print(df.size)
print(df.head())

3904
   id    start_time      end_time               sentence
0   1  00:00:25,108  00:00:27,610     (leaves rustling) 
1   2  00:00:27,610  00:00:29,279  Laura: ...years ago. 
2   3  00:00:29,279  00:00:32,615  (indistinct arguing) 
3   4  00:00:32,615  00:00:33,783       Max: That's it! 
4   5  00:00:33,783  00:00:36,077               â™ª â™ª 


In [8]:
def translate_srt_file(srt_file_path, target_language):
    df = parse_srt_file(srt_file_path)

    # Initialize the translation pipeline
    translator = pipeline("translation", model=f"Helsinki-NLP/opus-mt-en-{target_language}")

    # Apply the translation pipeline to each sentence in the DataFrame
    tqdm.pandas()
    df['translated_sentence'] = df['sentence'].progress_apply(lambda x: translator(x, max_length=512)[0]['translation_text'])

    return df

In [9]:
srt_file_path = 'd01_data\english_subtitles.srt'
target_language = 'fr' # Change this to the target language you want to translate to
df = translate_srt_file(srt_file_path, target_language)
print(df.head())

100%|████████████████████████████████████████████████████████████████████████████████| 976/976 [11:49<00:00,  1.37it/s]

   id    start_time      end_time               sentence   
0   1  00:00:25,108  00:00:27,610     (leaves rustling)   \
1   2  00:00:27,610  00:00:29,279  Laura: ...years ago.    
2   3  00:00:29,279  00:00:32,615  (indistinct arguing)    
3   4  00:00:32,615  00:00:33,783       Max: That's it!    
4   5  00:00:33,783  00:00:36,077               â™ª â™ª    

                                 translated_sentence  
0                           (les feuilles rouillent)  
1                                 Il y a des années.  
2                        (argumentation indistincte)  
3                                         C'est ça !  
4  «A» et «A» de l'annexe I du règlement (UE) no ...  





In [None]:
def translate_sentence(sentence, model, tokenizer):
    # Add start and end of sentence tokens and tokenize the input
    sentence = f'>{sentence.strip()}'
    inputs = tokenizer(sentence, return_tensors='pt')

    # Translate the input and decode the output
    outputs = model.generate(**inputs)
    translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return translated_sentence

In [None]:
def translate_srt_file(srt_file_path, target_language):
    df = parse_srt_file(srt_file_path)

    # Initialize the MarianMT model and tokenizer
    model_name = f'Helsinki-NLP/opus-mt-en-{target_language}'
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    # Apply the MarianMT model to each sentence in the DataFrame with a progress bar
    tqdm.pandas()
    df['translated_sentence'] = df['sentence'].progress_apply(lambda x: translate_sentence(x, model, tokenizer))

    return df

In [10]:
def write_srt_file(df, output_file_path):
    with open(output_file_path, 'w') as f:
        for index, row in df.iterrows():
            f.write(str(row['id']) + '\n')
            f.write(row['start_time'] + ' --> ' + row['end_time'] + '\n')
            f.write(row['translated_sentence'] + '\n\n')

In [14]:
output_file_path = 'd01_data/translated.srt'
write_srt_file(df, output_file_path)

In [7]:
df['translated_sentence']

0                             (les feuilles rouillent)
1                                   Il y a des années.
2                          (argumentation indistincte)
3                                           C'est ça !
4    «A» et «A» de l'annexe I du règlement (UE) no ...
5                                          (inaudible)
6                                     Vous comprenez !
7                                         Je le sais !
8                                        Laura : Ohh !
9                             (violon poignant jouant)
Name: translated_sentence, dtype: object