In [328]:
import pysrt
import subsync
import random
import re
import argparse
import editdistance
import nltk
from tqdm.notebook import tqdm
import collections

In [294]:
class DualSubItem(object):
    r"""One bilingual subtitle entry: an English line, a Chinese line and a timing.

    Attributes:
        en, cn: English and Chinese text of the entry.
        spec: Optional marker; when it equals ``"hang"`` the rendered text is
            prefixed with the ``{\an8}`` override tag.
        start, end: Timing values passed unchanged to ``pysrt.SubRipItem``.
        en_size, cn_size: Optional font sizes; when not ``None`` the matching
            text is wrapped in ``<font size="{size}px">...</font>``.
    """

    # Rendering modes accepted by ``to_srt_item``.
    _MODES = ("both", "cn", "en")

    def __init__(self, en, cn, spec, en_size=None, cn_size=None, start=None, end=None):
        self.en = en
        self.cn = cn
        self.spec = spec
        self.start = start
        self.end = end
        self.en_size = en_size
        self.cn_size = cn_size

    def to_srt_item(self, mode):
        """Render this entry as a ``pysrt.SubRipItem``.

        Args:
            mode: ``"both"`` (Chinese line, newline, English line), ``"cn"`` or
                ``"en"``.

        Returns:
            A ``pysrt.SubRipItem`` built from ``start``, ``end`` and the
            rendered text.

        Raises:
            ValueError: If ``mode`` is not one of the accepted values.
        """
        # Validate first: previously an unknown mode left `text` unbound and
        # surfaced as an UnboundLocalError further down.
        if mode not in self._MODES:
            raise ValueError(
                f"mode must be one of {self._MODES}, got {mode!r}"
            )

        en, cn = self.en, self.cn
        if self.en_size is not None:
            en = f'<font size="{self.en_size}px">{self.en}</font>'
        if self.cn_size is not None:
            cn = f'<font size="{self.cn_size}px">{self.cn}</font>'

        if mode == "both":
            text = cn + '\n' + en
        elif mode == "cn":
            text = cn
        else:  # mode == "en"
            text = en

        if self.spec == "hang":
            text = r"{\an8}" + text

        return pysrt.SubRipItem(start=self.start, end=self.end, text=text)

In [295]:
class DualSub(object):
    """An ordered collection of ``DualSubItem`` entries.

    Entries can be loaded from a plain-text file, given evenly distributed
    timings, styled with font sizes and exported as a ``pysrt.SubRipFile``.
    """

    def __init__(self, items=None):
        """Create the collection, optionally seeded with existing ``items``."""
        self.items = items or []

    def load_items(self, fname, encoding="utf-8"):
        """Append the entries stored in the text file ``fname``.

        The file consists of groups separated by one or more blank lines.
        Each group is an English line followed by a Chinese line, optionally
        preceded by a line containing only ``@@``, which marks the entry's
        ``spec`` as ``"hang"``.

        Args:
            fname: Path of the text file.
            encoding: Text encoding used to read the file. It is given
                explicitly so that reading the Chinese text does not depend
                on the platform's default encoding.

        Raises:
            ValueError: If a group does not contain exactly two text lines.
        """
        with open(fname, encoding=encoding) as f:
            content = f.read()

        # Splitting on "blank line(s)" rather than exactly "\n\n", and skipping
        # empty groups, keeps trailing newlines or extra spacing at the end of
        # the file from producing an empty group (previously an IndexError).
        for group in re.split(r'\n\s*\n', content.strip()):
            lines = group.splitlines()
            if not lines:
                continue
            if lines[0].strip() == "@@":
                spec = 'hang'
                lines = lines[1:]
            else:
                spec = None
            if len(lines) != 2:
                raise ValueError(
                    "each subtitle group needs an English and a Chinese line, "
                    f"got {len(lines)} line(s): {group!r}"
                )
            en, cn = lines
            self.items.append(DualSubItem(en, cn, spec))

    def generate_times(self, duration):
        """Spread ``duration`` over the entries in proportion to English text length.

        Entries whose ``spec`` is ``"hang"`` do not consume time: they copy the
        start and end of the entry immediately before them. They are therefore
        excluded from the total length, so the last timed entry ends exactly at
        ``duration`` instead of earlier.

        Args:
            duration: Total time to distribute, in the unit the caller wants
                for ``start``/``end``.
        """
        total_len = sum(len(item.en) for item in self.items if item.spec != 'hang')
        start = 0
        consumed_len = 0
        previous = None
        for item in self.items:
            if item.spec == 'hang':
                if previous is None:
                    # Nothing to attach to: pin a leading "hang" entry to the
                    # start instead of reading the last element of the list.
                    item.start = item.end = start
                else:
                    item.start = previous.start
                    item.end = previous.end
            else:
                consumed_len += len(item.en)
                # Derive each end from the cumulative length so truncation
                # errors do not add up; guard against all-empty text.
                end = int(duration * consumed_len / total_len) if total_len else start
                item.start = start
                item.end = end
                start = end
            previous = item

    def set_size(self, mode, size):
        """Set the English (``mode="en"``) or Chinese (``mode="cn"``) font size on every entry.

        Raises:
            ValueError: If ``mode`` is neither ``"en"`` nor ``"cn"``. The mode
                is checked up front, so this is raised even when there are no
                entries.
        """
        if mode == "en":
            attr = "en_size"
        elif mode == "cn":
            attr = "cn_size"
        else:
            raise ValueError(f"mode must be 'en' or 'cn', got {mode!r}")
        for item in self.items:
            setattr(item, attr, size)

    def to_srt(self, mode, ignore_spec=False):
        """Export the entries as a ``pysrt.SubRipFile``.

        Args:
            mode: Rendering mode forwarded to ``DualSubItem.to_srt_item``.
            ignore_spec: When true, entries with a truthy ``spec`` are left out.

        Returns:
            A ``pysrt.SubRipFile`` whose indexes have been normalised with
            ``clean_indexes()``.
        """
        out = pysrt.SubRipFile()
        for item in self.items:
            if ignore_spec and item.spec:
                continue
            out.append(item.to_srt_item(mode=mode))
        out.clean_indexes()
        return out

In [220]:
# Load the YouTube caption track that serves as the timing reference.
script = pysrt.open("jp_clean_up_your_room/youtube_script.srt")

# Drop captions that carry no text.  Delete by position, walking backwards so
# the remaining indices stay valid, instead of calling `script.remove(item)`:
# `remove` locates its target by equality rather than identity (so it can pick
# a different caption that merely compares equal) and rescans the list each time.
for idx in range(len(script) - 1, -1, -1):
    if not script[idx].text:
        del script[idx]

# Close the gaps between captions: each one now lasts until the next one starts.
for lhs, rhs in zip(script[:-1], script[1:]):
    lhs.end = rhs.start

# NOTE(review): the output name is spelled "youtub_script_norm.srt" (no "e");
# kept as-is because other tooling may already read this path - confirm.
script.save("jp_clean_up_your_room/youtub_script_norm.srt")


<__main__.DualSub at 0x12969f780>

In [306]:
# Flatten the English text of every subtitle into one global token stream.
#   sub_tokens                - all lower-cased tokens, in subtitle order
#   sub_token_idx_to_sub_item - global token index -> subtitle item owning it
# Each item additionally gets `tokens` and `token_ids` (its global indices).
sub_tokens = []
sub_token_idx_to_sub_item = {}
num_tokens = 0
for item in sub.items:
    item.tokens = nltk.word_tokenize(item.en.lower())
    first_id = num_tokens
    num_tokens = first_id + len(item.tokens)
    item.token_ids = list(range(first_id, num_tokens))
    sub_tokens.extend(item.tokens)
    for token_id in item.token_ids:
        sub_token_idx_to_sub_item[token_id] = item

In [312]:
# Lower-cased text of every caption in the reference script, in order.
script_lines = [caption.text.lower() for caption in script]

In [384]:
# Alignment search settings.
# `limit` caps how many script lines are aligned (None = all of them).
limit = None
# The scan for one script line stops once more than this many consecutive
# candidate positions fail to improve on the best distance found so far.
mono_increase_lim = 30

# matching_sub_indices[k] is the index into `sub_tokens` chosen as the split
# point in front of script line k; line 0 starts at token 0.
matching_sub_indices = [0]

for script_idx in tqdm(range(1, len(script_lines[:limit]))):
    # Script text before / from the current line.
    lhs = ' '.join(script_lines[:script_idx])
    rhs = ' '.join(script_lines[script_idx:])

    mono = 0
    curr_min = float('inf')
    # Fall back to the previous boundary when there is no candidate to score
    # (empty range); leaving this as None would break the next range() call.
    curr_min_i = matching_sub_indices[-1]

    # Candidates start at the previous boundary, so boundaries never move backwards.
    for sub_index in range(matching_sub_indices[-1], len(sub_tokens)):
        sub_lhs = ' '.join(sub_tokens[:sub_index])
        sub_rhs = ' '.join(sub_tokens[sub_index:])
        # Score a split by how well both halves match the script halves.
        sum_dist = editdistance.eval(lhs, sub_lhs) + editdistance.eval(rhs, sub_rhs)
        if sum_dist < curr_min:
            curr_min = sum_dist
            curr_min_i = sub_index
            mono = 0
        else:
            mono += 1
            if mono > mono_increase_lim:
                break

    matching_sub_indices.append(curr_min_i)

HBox(children=(FloatProgress(value=0.0, max=153.0), HTML(value='')))




In [388]:
# Reset all timings; items that receive no tokens below keep start = end = 0.
for item in sub.items:
    item.start, item.end = 0, 0

# One token boundary per script line.  `matching_sub_indices` records only where
# each line begins, so when it holds exactly one entry per line the final line
# has no closing boundary and zip() would silently skip it, leaving the trailing
# subtitles untimed.  Close it at the end of the token stream.
boundaries = list(matching_sub_indices)
if len(boundaries) == len(script_lines):
    boundaries.append(len(sub_tokens))

# ids of the items that already received a start time.  Testing
# `not sub_item.start` instead would treat a genuine start of 0 ms as "unset"
# and re-anchor that item on the next script line.
started = set()

for line_idx, lower, upper in zip(range(len(script_lines)),
                                  boundaries[:-1], boundaries[1:]):
    script_item = script[line_idx]
    total_duration = script_item.duration.ordinal

    # Number of this line's tokens owned by each subtitle item, in token order.
    counter = collections.Counter(
        sub_token_idx_to_sub_item[sub_token_idx]
        for sub_token_idx in range(lower, upper)
    )
    total_weight = sum(counter.values())

    duration_used = 0
    tokens_seen = 0
    for sub_item, token_count in counter.items():
        tokens_seen += token_count
        # Share of the line's duration, derived from the cumulative token count
        # so the shares add up to the full duration (no per-item truncation loss).
        time_fract = total_duration * tokens_seen // total_weight - duration_used
        if id(sub_item) not in started:
            started.add(id(sub_item))
            sub_item.start = script_item.start.ordinal + duration_used
            sub_item.end = sub_item.start
        # An item spanning several script lines keeps its start and is extended.
        sub_item.end += time_fract
        duration_used += time_fract

In [389]:
# Show each English line followed by its start and end (values divided by 1000),
# with a blank line between entries.
for item in sub.items:
    print(item.en)
    bounds = (item.start / 1000, item.end / 1000)
    print(*bounds, end="\n\n")

I've been telling people online in various ways, and in lectures,
1.199 5.603

that they should start fixing up the world by cleaning up their room.
5.603 8.899

And I wanted to just elaborate on that a little bit before I get back to the lecture itself.
8.9 14.263

So as it becomes this weird internet meme you know.
14.263 20.697

And it's a joke. And good, it's a joke.
20.698 24.602

I'm really happy about the fact that, so much of this has got like the leaven of humor in it and it's really important.
24.603 30.033

Because that's what stops things from degenerating into conflict is humor.
30.034 33.998

And let's say you want to sort out your room and beautify it, because the beauty is also important.
33.999 39.478

And let's say that all you have is just a little room.
39.479 41.48

Like, you're not rich. You're poor.
41.481 43.398

And you don't have any power, that's another thing.
43.399 45.6

But you've got your damn room.
45.601 46.879

And you've got this space right in front

In [390]:
# Export every item (ignore_spec=False keeps the "hang" ones) with the Chinese
# line above the English line, using the timings the items currently hold.
sub.to_srt(mode='both', ignore_spec=False).save('jp_clean_up_your_room/ref_synced.srt')

In [213]:
# Build the bilingual subtitle set from the plain-text source and set the
# per-language font sizes.
sub = DualSub()
sub.load_items("jp_clean_up_your_room/subtitle.txt")
sub.set_size(mode="cn", size=16)
sub.set_size(mode="en", size=9)

# The call used to read `sub.sync_with(mode="en", "<path>")`, which is a
# SyntaxError (positional argument after a keyword argument).  Both arguments
# are now positional, in the order they were written.
# NOTE(review): DualSub as defined in this notebook has no `sync_with` method;
# confirm where it comes from and its parameter order, or replace this call
# with the alignment cells / `generate_times` below.
sub.sync_with("en", "jp_clean_up_your_room/youtube_script.srt")
# Alternative without a reference track: sub.generate_times(324800)

# Bilingual track with every item, and an English-only track without the
# items that carry a spec.
sub.to_srt(mode='both', ignore_spec=False).save('jp_clean_up_your_room/subtitle.srt')
sub.to_srt(mode='en', ignore_spec=True).save('jp_clean_up_your_room/en.srt')