In [1]:
''' 
SRT2CSV
vsulli
26 April 2025
read in a .srt file 
convert to a df to perform nlp
export to a .csv file for upload to ANKI
'''

import nltk
import numpy as np
import pandas as pd
import pysrt
import re
import seaborn as sns
import spacy
import string

from nltk.corpus import stopwords
# German stop-word list from NLTK; a later cell extends it with custom words
german_sw = stopwords.words('german')
from textblob import TextBlob

# Echo the value of every expression statement in a cell, not only the last one,
# which allows for displaying multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
from tkinter import Tk
from tkinter.filedialog import askopenfilename

# Create the Tk root explicitly and hide it, which keeps the full GUI from
# appearing; only the file dialog is shown.
root = Tk()
root.withdraw()

try:
    # shows dialog box to ask for filename
    filename = askopenfilename()
finally:
    # Release the hidden root window so it does not linger for the kernel's lifetime
    root.destroy()

# askopenfilename returns an empty value when the dialog is cancelled;
# fail here with a clear message instead of at the file-open step.
if not filename:
    raise FileNotFoundError("No subtitle file was selected.")

''

In [None]:
# Parse the selected .srt file with pysrt.
# If a UnicodeDecodeError is raised, pass the file's actual encoding instead:
# you can open the file in Notepad and check "Save As" for its current encoding
# (ANSI, UTF-8, UTF-8 with BOM, ...).
subs = pysrt.open(filename, encoding='utf-8')

In [None]:
def remove_formatting(subfile):
    """Strip markup and dialogue dashes from every subtitle, in place.

    For each item the text is cleaned by:
      1. removing WebVTT colour-class tags such as ``<c.vtt_white>`` together
         with any whitespace and double quotes directly after them,
      2. removing every other tag (``</c>``, ``<font ...>``, ``<i>``, ...),
      3. removing the "-" that marks a speaker change at the start of a line,
         while keeping hyphens inside words ("U-Bahn"),
      4. trimming surrounding whitespace.

    Args:
        subfile: iterable of subtitle items exposing a writable ``text``
            attribute (e.g. the object returned by ``pysrt.open``).

    Returns:
        The same ``subfile`` object, with each item's ``text`` updated.
    """
    vtt_open_tag = re.compile(r'<c.vtt_\w+>\s*"*')
    # A tag must start with a letter (or "/" + letter) so that a stray "<"
    # used as a plain character in the dialogue is left alone.
    any_tag = re.compile(r'</?[A-Za-z][^<>]*>')
    # Dashes only at the start of a line (multi-line subtitles included).
    dialogue_dash = re.compile(r'^[ \t]*-+[ \t]*', re.MULTILINE)

    for sub in subfile:
        text = vtt_open_tag.sub("", sub.text)
        text = any_tag.sub("", text)
        text = dialogue_dash.sub("", text)
        # Trim last: removing tags and dashes can expose new leading whitespace.
        sub.text = text.strip()

    return subfile

In [None]:
# Clean every subtitle's text; remove_formatting edits the items in place
# and returns the same object, so subs keeps pointing at the same subtitles.
subs = remove_formatting(subs)

In [None]:
# create a dataframe with desired column name - 'Subtitle'
def create_dataframe(subfile, col_name):
    """Build a one-column DataFrame holding the text of every subtitle.

    Args:
        subfile: iterable of subtitle items exposing a ``text`` attribute.
        col_name: name of the single column, e.g. 'Subtitle'.

    Returns:
        pandas.DataFrame with one row per subtitle, in input order,
        indexed 0..n-1. An empty input yields an empty frame that still
        has the ``col_name`` column.
    """
    # Collect the texts first and build the frame once; growing a frame
    # row by row with .loc reallocates it on every insert.
    texts = [sub.text for sub in subfile]
    return pd.DataFrame({col_name: pd.Series(texts, dtype=object)})

In [None]:
# Combine consecutive subtitle rows until one ends a sentence, to gain more
# context for sentiment analysis and classification.
# A row ends a sentence when its last character is one of ) . ? !
# The combined rows are collected in a list and turned into a DataFrame once,
# which is more efficient than appending to a DataFrame.
SENTENCE_END_CHARS = (")", ".", "?", "!")

df = create_dataframe(subs, 'Subtitle')
df
new_df_list = []
current_row = ""
for row in df['Subtitle']:
    text = row.strip()
    # Skip blank rows: they add no text and have no last character to test
    if not text:
        continue
    # Join rows with a space so words from adjacent rows do not fuse together
    current_row = f"{current_row} {text}" if current_row else text
    if text.endswith(SENTENCE_END_CHARS):
        new_df_list.append(current_row)
        current_row = ""

# Keep trailing text that never reached an end character instead of dropping it
if current_row:
    new_df_list.append(current_row)
    current_row = ""

# create new df from the list of combined rows
# fewer rows than df are expected - still need to verify the rows combined correctly
new_df = pd.DataFrame(new_df_list, columns=["Subtitle"])
new_df


In [None]:
# create new dataframe removing the subtitle font tags, e.g.
# <font color=#ffffff>(Motor wird angelassen)</font> -> (Motor wird angelassen)
# Every tag is deleted and all text around and between tags is kept, so rows
# with several tagged segments or several lines are not cut down to the first one.
new_df_list = [
    re.sub(r'</?[A-Za-z][^<>]*>', '', text)
    for text in df['Subtitle']
]

# create new df from the list of tag-free rows (one row per row of df)
new_df = pd.DataFrame(new_df_list, columns=["Subtitle"])
new_df

In [None]:
# basic exploratory data analysis
# rebuild the one-column frame from subs
df = create_dataframe(subs, 'Subtitle')
# first rows
df.head()
# (number of rows, number of columns)
df.shape
# dtype of each column
df.dtypes
# summary statistics for every column, not only numeric ones
df.describe(include='all')

In [None]:
# convert to lowercase
def make_lowercase(df):
    """Add a 'Lowercase' column holding the lower-cased 'Subtitle' text.

    The frame is modified in place and the same object is returned.
    """
    lowered = df['Subtitle'].str.lower()
    df['Lowercase'] = lowered
    return df

In [None]:
# remove punctuation
def remove_punctuation(df):
    """Replace each run of punctuation in the 'Lowercase' column with one space.

    A run is one or more consecutive characters that are neither word
    characters (letters, digits, underscore) nor whitespace. The frame is
    modified in place and the same object is returned.
    """
    non_word_run = r'[^\w\s]+'
    cleaned = df['Lowercase'].str.replace(non_word_run, ' ', regex=True)
    df['Lowercase'] = cleaned
    return df

In [None]:
# remove newline \n character
def remove_newline(df):
    """Replace every newline in the 'Subtitle' column with a single space.

    The frame is modified in place and the same object is returned.
    """
    single_line = df['Subtitle'].str.replace(r'\n', ' ', regex=True)
    df['Subtitle'] = single_line
    return df

In [None]:
# Rebuild the frame, flatten newlines, then add the 'Lowercase' column.
# Both helpers modify df in place and return it, so clean_df and df
# refer to the same object.
df = create_dataframe(subs, 'Subtitle')
clean_df = make_lowercase(remove_newline(df))

print(clean_df.head())

In [None]:
# strip punctuation from the 'Lowercase' column, then spot-check the result
clean_df = remove_punctuation(clean_df)
print(clean_df.head())
# single row at position 100
clean_df.iloc[100:101]

In [None]:
# The subtitles are German, so tokenise with NLTK's German Punkt model;
# word_tokenize uses the English model unless a language is given.
df['Word Tokens'] = df['Lowercase'].apply(nltk.word_tokenize, language='german')

In [None]:
# Word tokens as a standalone Series for inspection. The subtitles are German,
# so use NLTK's German Punkt model rather than the English default.
word_tokens = df['Lowercase'].apply(nltk.word_tokenize, language='german')
word_tokens

In [None]:
# Sentence-split the 'Subtitle' text with the German Punkt model.
# 'Lowercase' has had its punctuation replaced by spaces, so it no longer
# contains any sentence boundaries for the tokenizer to find.
df['Sentence Tokens'] = df['Subtitle'].apply(nltk.sent_tokenize, language='german')

In [None]:
# Sentence tokens as a standalone Series for inspection.
# Split the 'Subtitle' text with the German Punkt model: 'Lowercase' has had
# its punctuation replaced by spaces, so it holds no sentence boundaries.
sentence_tokens = df['Subtitle'].apply(nltk.sent_tokenize, language='german')
sentence_tokens

In [None]:
# remove stopwords

# have to extend stopwords to include names of characters
custom_stop_words = ['hamburg', 'berlin', 'elise', 'christian', 'berti', 'eva', 'chris']
# Only add words that are still missing, so re-running this cell does not
# keep growing german_sw with duplicates.
german_sw.extend(word for word in custom_stop_words if word not in german_sw)

# A set gives constant-time membership tests in the per-token filter below
stop_word_set = set(german_sw)
df['no_stop_words'] = df['Word Tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stop_word_set]
)
df[100:120]

In [None]:
# spot-check: the subtitle at position 100 and the first 20 filtered token lists
df['Subtitle'].iloc[100:101]
df['no_stop_words'].iloc[0:20]

In [None]:
# create a frequency diagram without stop words

# flatten the per-row token lists into one list of all words
words = [word for tokens in df['no_stop_words'] for word in tokens]
words[0:20]

In [None]:
# plot the word frequency
sns.set_style('darkgrid')
# count how often each word occurs in the words list
freq_words = nltk.FreqDist(words)
# plot the 20 most frequent words
freq_words.plot(20)

In [None]:
# TODO

# Notepad++ convert ANSI to UTF-8 for special characters

# remove font color tags
# everything from < to > 
# <font color=#ffffff>(Motor wird angelassen)</font>

# # combine rows until period or end character reached
# not creating a new df