SDH subtitles from https://www.opensubtitles.org/en/ssearch/sublanguageid-ger/idmovie-578221

In [24]:
import re
import pandas as pd
import numpy as np
import sys
import os
import pysrt #for subs analysis
from datetime import date, datetime, timedelta, time

import matplotlib
from matplotlib import style
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (16.0, 9.0)
style.use('fivethirtyeight')

The first dataset is for vocabulary size calculation

In [87]:
# function that returns text from all the lines for a given .srt file
def text_from_srt(path):
    path = path
    pattern = r'\[(.*?)\]'
    lines=[]
    # loading the subs
    subs = pysrt.open(path)
    #selecting text without [...]-like comments | music
    for line in subs:
        text = line.text_without_tags.replace('\n',' ')
        # not interested in lyrics and opening text
        if '♪' not in text and 'NETFLIX' not in text:
            # checking whether it contains sdh comments and removing them
            text = re.sub(pattern,'',text)
            lines.append(text)
        
    # concatenating all the values from list to create a text
    clean_text = ' '.join(lines)
    return clean_text

In [112]:
directory = 'SDH/'
# all .srt files are named like this: S01E01.srt 

data = pd.DataFrame(columns =["episode_text","episode","season","season00episode00"])

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(f) and '.srt' in filename:
        print(f)
        text = text_from_srt(f)
        # appending to a dataframe
        data = data.append({"episode_text":text,"episode":filename[-6:-4],
        "season":filename[1:3],"season00episode00":filename[0:6]},
        ignore_index = True)

        print(filename[0:6],"done, text length: ",len(text))

data.head(5)

SDH/S03E02.srt
S03E02 done, text length:  11089
SDH/S01E07.srt
S01E07 done, text length:  11794
SDH/S01E06.srt
S01E06 done, text length:  11988
SDH/S03E03.srt
S03E03 done, text length:  11569
SDH/S03E01.srt
S03E01 done, text length:  12846
SDH/S01E04.srt
S01E04 done, text length:  11558
SDH/S01E10.srt
S01E10 done, text length:  14168
SDH/S01E05.srt
S01E05 done, text length:  12000
SDH/S03E04.srt
S03E04 done, text length:  12915
SDH/S01E01.srt
S01E01 done, text length:  14919
SDH/S03E05.srt
S03E05 done, text length:  11043
SDH/S03E07.srt
S03E07 done, text length:  12737
SDH/S01E02.srt
S01E02 done, text length:  7375
SDH/S01E03.srt
S01E03 done, text length:  8602
SDH/S03E06.srt
S03E06 done, text length:  12883
SDH/S02E08.srt
S02E08 done, text length:  13514
SDH/S02E01.srt
S02E01 done, text length:  11861
SDH/S02E03.srt
S02E03 done, text length:  16212
SDH/S02E02.srt
S02E02 done, text length:  13237
SDH/S02E06.srt
S02E06 done, text length:  12861
SDH/S02E07.srt
S02E07 done, text length:  

Unnamed: 0,episode_text,episode,season,season00episode00
0,- - Zieh dich um. -Martha? - Das kann nic...,2,3,S03E02
1,Ich erinnere mich. Ich erinnere mich an ...,7,1,S01E07
2,Das ist nicht mehr lustig! Katharina? Ulr...,6,1,S01E06
3,"""Von da an wusste ich, dass sich nichts ände...",3,3,S03E03
4,"Wenn wir wüssten, wie die Dinge enden... ......",1,3,S03E01


In [113]:
# saving to csv
data.to_csv("texts.csv",index=False)

The second dataset is for SDH comments (SDH = Subtitles for the deaf and hard of hearing) mapping —  we're interested in how often something happens e.g. ominous music starts playing. This is why we're saving intervals here.

In [140]:
# function that returns sdh comments for a given .srt file in a dataframe format
def text_from_srt(path):
    path = path
    pattern = r'\[(.*?)\]'
    data = pd.DataFrame(columns =["sdh_comment","interval_start","interval_end"])
    
    # loading the subs
    subs = pysrt.open(path)
    for line in subs:
        start = line.start.to_time()
        end = line.end.to_time()
        text = line.text_without_tags.replace('\n',' ')
        #still not interested in lyrics and opening text
        if '♪' not in text and 'NETFLIX' not in text:
            matches = re.findall(pattern,text)
            # if there are sdh comments, let's save them with intervals
            if len(matches)>0:
                for match in re.findall(pattern,text):
                    data = data.append({"sdh_comment":match,
                    "interval_start":start,"interval_end":end},
                    ignore_index = True)
            
    return data

In [139]:
# subs = pysrt.open("SDH/S03E01.srt")
# pattern = r'\[(.*?)\]'
# data = pd.DataFrame(columns =["sdh_comment","interval_start","interval_end"])

# for sub in subs:
#     start = sub.start.to_time()
#     end = sub.end.to_time()
#     text = sub.text_without_tags.replace('\n',' ')

#     matches = re.findall(pattern,text)
#     if len(matches)>0:
#         print("match found")
#         for match in re.findall(pattern,text):
#             data = data.append({"sdh_comment":match,
#             "interval_start":start,"interval_end":end},
#             ignore_index = True)
            
#             print(len(data))
#         # replacements = re.subn(pattern,'',text)
#         # print("clean string: "+replacements[0])
#         # print("num of replacements: ",replacements[1])    
    

In [145]:
directory = 'SDH/'
# all .srt files are named like this: S01E01.srt 

sdh_data = pd.DataFrame(columns =["sdh_comment","interval_start","interval_end","episode","season","season00episode00"])

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(f) and '.srt' in filename:
        print(f)
        data = text_from_srt(f)
        # appending to a dataframe with season and episode indicators
        data["episode"] = filename[-6:-4]
        data["season"] = filename[1:3]
        data["season00episode00"] = filename[0:6]
        sdh_data = pd.concat([sdh_data,data], ignore_index = True)

        print(filename[0:6],"done")

sdh_data.head(5)

SDH/S03E02.srt
S03E02 done
SDH/S01E07.srt
S01E07 done
SDH/S01E06.srt
S01E06 done
SDH/S03E03.srt
S03E03 done
SDH/S03E01.srt
S03E01 done
SDH/S01E04.srt
S01E04 done
SDH/S01E10.srt
S01E10 done
SDH/S01E05.srt
S01E05 done
SDH/S03E04.srt
S03E04 done
SDH/S01E01.srt
S01E01 done
SDH/S03E05.srt
S03E05 done
SDH/S03E07.srt
S03E07 done
SDH/S01E02.srt
S01E02 done
SDH/S01E03.srt
S01E03 done
SDH/S03E06.srt
S03E06 done
SDH/S02E08.srt
S02E08 done
SDH/S02E01.srt
S02E01 done
SDH/S02E03.srt
S02E03 done
SDH/S02E02.srt
S02E02 done
SDH/S02E06.srt
S02E06 done
SDH/S02E07.srt
S02E07 done
SDH/S02E05.srt
S02E05 done
SDH/S02E04.srt
S02E04 done
SDH/S03E08.srt
S03E08 done
SDH/S01E08.srt
S01E08 done
SDH/S01E09.srt
S01E09 done


Unnamed: 0,sdh_comment,interval_start,interval_end,episode,season,season00episode00
0,düster-melancholische Musik,00:00:13.458000,00:00:16.500000,2,3,S03E02
1,rhythmisches Klopfen,00:00:24.625000,00:00:30.041000,2,3,S03E02
2,bedrohliche Klänge,00:00:24.625000,00:00:30.041000,2,3,S03E02
3,keucht,00:00:36,00:00:38.291000,2,3,S03E02
4,unheilvolle Klänge,00:00:43.875000,00:00:46.208000,2,3,S03E02


In [146]:
len(sdh_data)

3062

In [147]:
# saving to csv
sdh_data.to_csv("sdh_comments.csv",index=False)

Another dataset can be created to save lines for intervals like 10 min or 30 min to see the tone dynamic or something. But this is something we'll do later (maybe).

Good example can be found here: https://mubaris.com/posts/movie-analysis/

Some silly testing

In [17]:
# start = time(0, 0, 0)
# end = subs[-1].end.to_time()

datetime.time(0, 48, 18, 916000)