In [1]:
lang = 'german' # select your file's language
''' ['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 
'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 
'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 
'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 
'spanish', 'swedish', 'tajik', 'turkish']
'''
tags = [lang, 'sentence::mining', 'Tatort', 'sentence::recognition'] # input the desired tags for ANKI
fields = [lang, 'defs', tags] # decide on the number of fields and their contents for ANKI

In [5]:
# extract the tar zip
import tarfile

tar_path = r"Documents\GitHub\SRT2CSV\de_core_news_sm-3.0.0.tar.gz"
extract_path = r"Documents\GitHub\SRT2CSV\de_core_news_sm-3.0.0"

with tarfile.open(tar_path, "r:gz") as tar:
    tar.extractall(path=extract_path)


In [10]:
import spacy

# Load from the correct folder
nlp = spacy.load(r"C:\Users\paro\Documents\GitHub\SRT2CSV\de_core_news_sm-3.0.0\de_core_news_sm-3.0.0\de_core_news_sm\de_core_news_sm-3.0.0")

# Test it with a German sentence
doc = nlp("Das Auto ist schnell.")

# Iterate through tokens
for token in doc:
    print(token.text, token.pos_, token.dep_)


Das DET nk
Auto NOUN sb
ist AUX ROOT
schnell ADV pd
. PUNCT punct


In [8]:
''' 
SRT2CSV
vsulli
26 April 2025
read in a .srt file 
convert to a df to perform nlp
export to a .csv file for upload to ANKI
'''
import json
import nltk
import numpy as np
import pandas as pd
import pysrt
import re
import seaborn as sns
import spacy
import string

from nltk.corpus import stopwords
lang_sw = stopwords.words(lang)
from textblob import TextBlob

from langdetect import detect, LangDetectException

nlp = spacy.load("C:\Users\paro\Documents\GitHub\SRT2CSV\de_core_news_sm-3.0.0\de_core_news_sm-3.0.0\de_core_news_sm\de_core_news_sm-3.0.0") # loads german model

# module that allows you to get meanings, translations, synonyms, and antonyms for supported langs
'''[bengali' (bn),'chinese (zh)', english' (en), 'french' (fr), 'german' (de), 
'italian' (it),'portuguese' (pt), 'romanian' (ro), 'russian'(ru), 'spanish'(es), 
'turkish' (tr)]'''
from PyMultiDictionary import MultiDictionary, DICT_EDUCALINGO
dictionary = MultiDictionary()

# allows for displaying multiple outputs in one cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (1596159479.py, line 25)

In [5]:
import tkinter as tk
from tkinter import filedialog

# keeps full GUI from appearing
root = tk.Tk()
root.withdraw()

# shows dialog box to ask for filename
filename = filedialog.askopenfilename()

In [None]:
# specify the encoding if UnicodeDecodeError
# you can open the file in NotePad and check SaveAs for default encoding
# ANSI, UTF-8
subs = pysrt.open(filename, encoding='utf-8')

In [None]:
def remove_formatting(subfile):
    # read through every sub line
    # using regex, delete section of line between <>
    # combine to string
    # update index
    for i in range(len(subfile)):
        # replace all characters between < and > with ""
        # using regex
        # strip white space from beginning
        subfile[i].text = subfile[i].text.lstrip()
        subfile[i].text = re.sub(r'<c.vtt_\w+>\s*"*', "", subfile[i].text)
        subfile[i].text = re.sub("</c>", "", subfile[i].text)
        # replace "-" with ""
        subfile[i].text = re.sub("-", "", subfile[i].text)

    return subfile

In [None]:
subs = remove_formatting(subs)

In [None]:
# create a dataframe with desired column name
def create_dataframe(subfile, col_name):
    # create dataframe
    df = pd.DataFrame(columns=[col_name])
    for i in range(len(subfile)):
        df.loc[i] = subfile[i].text
    return df

In [None]:
# combine rows until period or end character reached
# used for gaining more context for sentiment analysis and classification
# append these rows to list and then create new df - more efficient than appending to df
# end characters at index -1 must be ) . ? ! 
df = create_dataframe(subs, 'Subtitle')
df
new_df_list = []
current_row = ""
for row in df['Subtitle']:
    if row[-1] == ")" or row[-1] == "." or row[-1] == "?" or row[-1] == "!":
        current_row += row
        new_df_list.append(current_row)
        current_row = ""
    else:
        current_row += row

# create new df from the list of combined rows
new_df = pd.DataFrame(new_df_list)
new_df.rename(columns={0:"Subtitle"}, inplace=True)
new_df


In [None]:
# create new dataframe removing the subtitle font tags
# go row by row - only include what's between > and < symbols
new_df_list = []
current_row = ""
for i in range(len(df['Subtitle'])):
    # slice between two characters > and < 
    match = re.findall(r'>(.*?)<', df['Subtitle'][i])
    if match:
        new_df_list.append(match[0])
    else:
        new_df_list.append(df['Subtitle'][i])

    
# create new df from the list of combined rows
# currently fewer rows - need to verify that it combined the rows correctly
new_df = pd.DataFrame(new_df_list)
new_df.rename(columns={0:"Subtitle"}, inplace=True)
new_df


In [None]:
# basic exploratory data analysis
def explore_data(dataframe, column):
    print(dataframe.head())
    print(dataframe.shape)
    print(dataframe.dtypes)
    print(dataframe.describe(include='all'))
    
explore_data(new_df, 'Subtitle')

In [None]:
# convert to lowercase
def make_lowercase(df):
    df['Lowercase'] = df['Subtitle'].str.lower()
    return df

In [None]:
# remove punctuation
def remove_punctuation(df):
    # ^ is a negation inside brackets (anything except)
    # starts with any word, digits, or underscore, white space character, apostrophe, + means any character in the string
    df['Lowercase'] = df['Lowercase'].str.replace(r"[^\w\s']+", ' ', regex = True)
    return df

In [None]:
# remove newline \n character
def remove_newline(df):
    df['Subtitle'] = df['Subtitle'].str.replace(r'\n', ' ', regex = True) # replaces the \n with a space
    return df

In [None]:
# remove newline from df
clean_df = remove_newline(new_df)
print(clean_df.head())

clean_df[102:103]

In [None]:
# make lowercase
clean_df = make_lowercase(clean_df)
print(clean_df.head())


In [None]:
# remove punctuation
clean_df = remove_punctuation(clean_df)
print(clean_df.head())
clean_df[102:103]

In [None]:
# word tokenizer
# basic tokenizer splits on apostrophe
# tweet tokenizer does not
clean_df['Word Tokens'] = clean_df['Lowercase'].apply(nltk.word_tokenize)

In [None]:
clean_df['Word Tokens']

In [None]:
clean_df['Sentence Tokens'] = clean_df['Lowercase'].apply(nltk.sent_tokenize)

In [None]:
clean_df['Sentence Tokens']

In [None]:
# remove stopwords

# open existing file
try:
    with open("removed_words.json", "r") as f:
        removed_words = set(json.load(f))
#JSONDecodeError - if file is empty       
except (FileNotFoundError, json.JSONDecodeError):
    removed_words = set()
    
# add words 
removed_words.update(lang_sw)

# save new set
with open("removed_words.json", "w") as f:
    json.dump(list(removed_words), f)

In [None]:
# extend the stop words with custom words

# names of characters & places 
custom_stop_words = ['hamburg', 'berlin', 'leipzig', 'elise', 'christian', 'berti', 'eva', 'chris', 'landsberger', 'trimmel']

# save to JSON file
try:
    with open("removed_words.json", "r") as f:
        removed_words = set(json.load(f))
except FileNotFoundError:
    removed_words = set()
    
removed_words.update(custom_stop_words)

with open("removed_words.json", "w") as f:
    json.dump(list(removed_words), f)

In [None]:
# remove from stop words file
desired_words = []

try:
    with open("removed_words.json", "r") as f:
        removed_words = set(json.load(f))
  
except (FileNotFoundError, json.JSONDecodeError):
    removed_words = set()
    
# add back words (remove from set)
removed_words.difference_update(desired_words)

with open("removed_words.json", "w") as f:
    json.dump(list(removed_words), f)

In [None]:
# create new column without all removed words
clean_df['No Stop Words'] = clean_df['Word Tokens'].apply(lambda x: [item for item in x if item not in removed_words])
clean_df[100:120]

In [None]:
clean_df[100:101]['Subtitle']
clean_df[0:20]['No Stop Words']

In [None]:
# create a frequency diagram without stop words

# list of all words
words = []
for index, row in clean_df.iterrows():
    for word in row['No Stop Words']:
        words.append(word)
words[0:20]

In [None]:
# plot the word frequency
sns.set_style('darkgrid')
freq_words = nltk.FreqDist(words)
freq_words.plot(20)

In [None]:
# PyMultiDictionary to get definition

# retrieves part of speech, explanations
print(dictionary.meaning('de', 'hund', dictionary=DICT_EDUCALINGO))


In [None]:
# extract only desired language from definition
def extract_desired_lang(word, lang):
    # Get the dictionary result
    res = dictionary.meaning(lang, word, dictionary=DICT_EDUCALINGO)
    
    if not res or not res[1]:
        return None
    
    pos_tags, text, _ = res
    
    # split into sentences
    sentences = text.split('. ')
    
    # desired language sentences
    my_sentences = []
    
    for sent in sentences:
        sent = sent.strip() 
        if not sent:
            continue
        
        try:
            detected_lang = detect(sent)
            
            # check if desired lang
            if detected_lang == lang:
                my_sentences.append(sent)
        except LangDetectException:
            # skip if not desired lang
            continue
    
    # combine sentences and return
    return ' '.join(my_sentences)

In [None]:
no_stop = ['katze', 'vogel']

# construct a dictionary entry for all words in no stop words column
# word [pos]: definition
# space
# word2 [pos]: definition
dict_entry = ""
for w in no_stop:
    doc = nlp(w)
    
    if doc.pos == "NOUN":
        article = doc.article_
    else:
        article = ""
        
    dict_entry += article + " " + w + " [" + dictionary.meaning('de', w, dictionary=DICT_EDUCALINGO)[0][0] + "] : " +  extract_desired_lang(w, 'de') + "\n\n"
    
print(dict_entry)

In [None]:
# export sentences to a .csv file



In [None]:
# TODO

# change lang list to include abbreviations ['german', 'de']
# rename json file by language DEU_removed_words, ESP_removed_words

# add export of all sentences to a .csv file
# fields 
    # DEU     # dict def of no-stop-words column    # tags
    
# change export to .csv to only include desired number of no stop words/unknown words (1, 2, 3) etc.

# change educalingo dictionary to web scraping an actual dictionary for better definitions and articles for nouns
    
#  Parts of speech tagging - NLP

# Notepad++ convert ANSI to UTF-8 for special characters
# seems to need to be UTF-8 BOM in order to preserve special characters

In [None]:
!python -m spacy download de_core_news_sm


In [None]:
!pip install C:/Users\paro/Documents/GitHub/SRT2CSV/de_core_news_sm-3.0.0.tar.gz

In [None]:
import spacy

# Load the German model
nlp = spacy.load("de_core_news_sm")

# Test it on a sentence
doc = nlp("Der Hund läuft im Park.")
for token in doc:
    print(token.text, token.pos_)


In [None]:
!pip check



In [None]:
!pip install click==7.1.2
!pip install torch==2.3.0
!pip install urllib3==1.26.6
!pip install clyent==1.2.1 nbformat==5.4.0 python-dateutil==2.8.2 PyYAML==6.0 requests==2.28.1
!pip install markdown-it-py==2.2.0
!pip install fsspec==2023.3.0



In [None]:
!python -m spacy download de_core_news_sm

In [None]:
conda create -n spacy_env python=3.11
conda activate spacy_env
pip install spacy
python -m spacy download de_core_news_sm


In [None]:
conda activate spacy_env  # or whatever you named your environment
pip install ipykernel
