In [30]:
"""
SRT2CSV Preprocessing
Author: vsulli
Date: 26 April 2025

- Convert a .srt file to UTF-8 BOM
- Extract and load spaCy language models
"""

'\nSRT2CSV Preprocessing\nAuthor: vsulli\nDate: 26 April 2025\n\n- Convert a .srt file to UTF-8 BOM\n- Extract and load spaCy language models\n'

In [31]:
import os
import spacy
import string
import tarfile

# Notebook display setting: show the value of every top-level expression in a
# cell (not only the last one), so several results can appear under one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [32]:
def convert_srt_to_utf8_bom(srt_path, output_path=None, output_filename=None):
    """
    Converts a .srt file to UTF-8 with BOM encoding.

    The input is decoded as UTF-8. A byte order mark that is already present
    is consumed while reading, so the output always starts with exactly one
    BOM, even when the function is run on an already converted file. Bytes
    that are not valid UTF-8 are replaced with U+FFFD instead of raising.

    Args:
        srt_path (str): Path to the input .srt file.
        output_path (str, optional): Directory to save the converted file.
            Defaults to the directory of ``srt_path``.
        output_filename (str, optional): Custom name for the output file (without extension).
            Defaults to the file name of ``srt_path``, which means the input
            is overwritten when ``output_path`` is not given either.

    Returns:
        None. The location of the written file is printed.
    """

    target_dir = output_path if output_path else os.path.dirname(srt_path)
    new_filename = output_filename + ".srt" if output_filename else os.path.basename(srt_path)
    destination = os.path.join(target_dir, new_filename)

    # 'utf-8-sig' drops a leading BOM when there is one and behaves like plain
    # 'utf-8' otherwise. Reading with plain 'utf-8' would keep the BOM as a
    # U+FEFF character and the write below would then emit a second BOM.
    with open(srt_path, 'r', encoding='utf-8-sig', errors='replace') as f:
        content = f.read()
    # The whole input is in memory at this point, so writing back to the same
    # path (the default when no output arguments are given) is safe.
    with open(destination, 'w', encoding='utf-8-sig') as f:
        f.write(content)
    print(f"File saved under: {destination}")

In [34]:
# Provide the full path to the .srt file and the name of the output file.
# The empty strings are placeholders to fill in before running this cell.
# Use raw strings (r"...") for Windows paths so that sequences such as "\t"
# inside the path are not interpreted as escape characters.
convert_srt_to_utf8_bom(
    srt_path=r"",  # ex) r"C:\path\to\input.srt"
    output_path=r"",  # optional: directory; falls back to the input file's directory when empty
    output_filename=""  # optional: no extension; falls back to the input file's name when empty
)

File saved under D:\Coding\Coding Projects\TatortNLP\converted.srt


In [44]:
# Extract the .tar.gz archive of a spaCy language model
def extract_spacy_model(tar_path, extract_path=None):
    """
    Extracts a spaCy language model from a .tar.gz archive.

    Args:
        tar_path (str): Full path to the .tar.gz archive.
        extract_path (str, optional): Directory to extract the contents into.
                                      Defaults to a directory next to the archive
                                      that is named after it without the archive
                                      suffix (".tar.gz", ".tgz" or ".gz").

    Raises:
        FileNotFoundError: If ``tar_path`` does not exist.

    Returns:
        None. The extraction directory is printed.
    """
    if not os.path.exists(tar_path):
        raise FileNotFoundError(f"The tar archive does not exist: {tar_path}")

    if not extract_path:
        # Strip only a known archive suffix. Calling splitext twice would also
        # eat part of a version number, e.g. "model-3.8.0.tgz" -> "model-3.8".
        lowered = tar_path.lower()
        for suffix in (".tar.gz", ".tgz", ".gz"):
            if lowered.endswith(suffix):
                extract_path = tar_path[:-len(suffix)]
                break
        else:
            # No recognised suffix: avoid colliding with the archive file itself.
            extract_path = tar_path + "_extracted"

    with tarfile.open(tar_path, "r:gz") as tar:
        # The "data" extraction filter refuses members that would be written
        # outside extract_path (absolute paths, "..", unsafe links). It only
        # exists on Python versions that ship extraction filters, hence the check.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_path, filter="data")
        else:
            tar.extractall(path=extract_path)
    print(f"File saved under: {extract_path}")

In [45]:
# Provide the full path to the .tar.gz file and an optional output path.
# The empty strings are placeholders to fill in before running this cell.
# Keep the r"..." prefix for Windows paths so backslashes are taken literally.
extract_spacy_model(
    tar_path = r"", # ex) r"C:\path\to\es_core_news_sm-3.8.0.tar.gz"
    extract_path = r"" # optional: when left empty, extract_spacy_model derives a directory from tar_path
)

File saved under: C:\Users\paro\Documents\GitHub\SRT2CSV\es_core_news_sm-3.8.0


In [49]:
# Full path to the extracted model directory (placeholder: set this before running the cell)
filepath = r"" # ex) r"C:\path\to\es_core_news_sm-3.8.0\es_core_news_sm-3.8.0\es_core_news_sm\es_core_news_sm-3.8.0"

# Load the model from that directory
nlp = spacy.load(filepath)

# Run the pipeline on a test sentence in the model's language (placeholder: fill in a sentence)
doc = nlp("") # ex) "El País es un periódico español fundado en 1976."

# Print one line per token: its text, part-of-speech tag (pos_) and dependency label (dep_)
for token in doc:
    print(token.text, token.pos_, token.dep_)




El DET det
País PROPN nsubj
es AUX cop
un DET det
periódico NOUN ROOT
español ADJ amod
fundado ADJ amod
en ADP case
1976 NOUN obl
. PUNCT punct
