In [10]:
from pysubparser import parser
from pysubparser.cleaners import brackets, lower_case, formatting
import datetime

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import RegexpParser

import tensorflow as tf
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

In [2]:
model = tf.saved_model.load("../USE_model/")
print("module %s loaded" % model)
def embed(input):
    return model(input)

module <tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject object at 0x000002020824DB50> loaded


In [88]:
NP = r"""
    NBAR:
        {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
    NP:
        {<NBAR>}
        {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
"""
NP_chunker = RegexpParser(NP)

In [137]:
def read_sub_file(path):
    subtitles = parser.parse(path)
    subtitles = brackets.clean(
        lower_case.clean(
            formatting.clean(
                subtitles
            )
        )
    )
    return subtitles


def read_gt_file(path):
    gt_file = open(path)
    file_content = gt_file.read().split("\n")
    gt_file.close()
    return file_content


def get_gt_start_times(file_content):
    start_times = []
    for line in file_content:
        st_time = line.split("\t")
        if (len(st_time) > 1):
            start_times.append(datetime.datetime.strptime(
                st_time[1], "%H:%M:%S,%f"))
    return start_times


def prepare_text(subs):
    sentences = []
    start_times = []
    for s in subs:
        sentences.append(s.text)
        start_times.append(s.start)

    sentences = [word_tokenize(sent) for sent in sentences]
    sentences = [pos_tag(w) for w in sentences]
    sentences = [NP_chunker.parse(sent) for sent in sentences]

    return sentences, start_times


# takes the chunked data and extracts the noun phrases
def parsed_text_to_noun_phrases(sentences):
    nps = []
    for sent in sentences:
        tree = NP_chunker.parse(sent)
        for subtree in tree.subtrees():
            if subtree.label() == "NP":
                t = subtree
                t = ' '.join(word for word, tag in t.leaves())
                nps.append(t)
    return nps


# builds the sliding window of given size and also returns the
# init times of all the windows
def build_sliding_window(init_times, sequence, window_size, step_size):
    numOfChunks = ((len(sequence) - window_size) // step_size) + 1
    print("\nLength of sequence: ", len(sequence))
    print("Number of chunks: ", numOfChunks)
    req_chunks = []
    req_window_start_times = []

    for ith_chunk in range(0, numOfChunks*step_size, step_size):
        req_chunks.append(sequence[ith_chunk: ith_chunk + window_size])
        if (ith_chunk < len(init_times)):
            req_window_start_times.append(init_times[ith_chunk])
        else:
            req_window_start_times.append(init_times[len(init_times) - 1])

    return req_chunks, req_window_start_times


def normalise_chunks(sentence_chunks):
    sentences = []
    for chunk in sentence_chunks:
        sent = " ".join(chunk)
        sentences.append(sent)
    return sentences


def plot_fig(x, score, heading, fig_no, can_plot):
    if can_plot:
        fig = plt.figure(fig_no, figsize=(10, 6))
        ax = fig.add_subplot(111)
        ax.plot(x, score, label=heading)

    score_arr = np.array(score)
    x_arr = np.array(x)

    b = (np.diff(np.sign(np.diff(score_arr)))
         > 0).nonzero()[0] + 1  # Local minimas

    if can_plot:
        ax.plot(x_arr[b], score_arr[b], "o",
                label="Deepest Valleys", color='r')
        ax.legend()

    return b


In [169]:
def vectorise(sentences, sentence_len, fig_no):
    sentence_embeddings = embed(sentences)
    score = []
    for i in range(0, len(sentence_embeddings) - 2):
        x = tf.nn.l2_normalize(sentence_embeddings[i:i+1], axis=1)
        y = tf.nn.l2_normalize(sentence_embeddings[i+1:i+2], axis=1)
        cos_sim = tf.reduce_sum(tf.multiply(x, y), axis=1)
        score.append(cos_sim[0])
    # Plotting Cosine Similarity
    minimas = plot_fig(range(1, len(sentence_embeddings) - 1), score,
                       'Cosine Similarity with window size ' + str(sentence_len), fig_no, False)
    return score, minimas


def get_k_best_points(sentences, local_minimas, win_start_times, k_val):
    selected_windows = []
    counter = 0
    for m in local_minimas:
#         if (counter == k_val):
#             break
        selected_windows.append((sentences[m], win_start_times[m]))
#         counter += 1

    return selected_windows


subtitles = read_sub_file(
    "../Lecture_Video_Fragmentation_Dataset/ALV_srt/0025.srt"
)
gt_content = read_gt_file(
    "../Lecture_Video_Fragmentation_Dataset/ALV_srt_GT/0025.txt"
)
gt_start_times = get_gt_start_times(gt_content)
cleaned_text, init_times = prepare_text(subtitles)
noun_phrases = parsed_text_to_noun_phrases(cleaned_text)

print("Number of noun phrases: " + str(len(noun_phrases)))

window_sizes = [60, 120, 180, 240, 300, 360, 420, 480, 540, 600, 660, 720]
# window_sizes = [600, 660, 720, 780, 840, 900, 960, 1020, 1080, 1140, 1200]

for window_size in window_sizes:
    step_size = window_size // 12
    k = 30
    print("---------------------------------------------------------")
    print("\nWindow size: ", window_size,
          "Step size: ", step_size, "K: ", k)

    chunks, window_start_times = build_sliding_window(
        init_times, noun_phrases, window_size, step_size)
    messages = normalise_chunks(chunks)
    cos_sim_scores, loc_minimas  = vectorise(messages, window_size, 1)
    print(loc_minimas)
    boundaries = get_k_best_points(messages, loc_minimas, window_start_times, k)

    print("no of boundaries " + str(len(boundaries)))
    arr = sorted(boundaries, key=lambda x: x[1])
    pos_predictions = 1
    
    for i in range(1, len(gt_start_times)):
        for j in range(0, len(arr)):
            dt = datetime.datetime.combine(
                datetime.date(1900, 1, 1), arr[j][1])
            diff = gt_start_times[i] - dt
            if (abs(diff.total_seconds()) < 30):
                pos_predictions += 1
#                 print("\n")
#                 print("boundary: " + str(dt))
#                 print("actual start time: " + str(gt_start_times[i]))
#                 print("difference in seconds: " + str(abs(diff.total_seconds())))
#                 print("\n")

    print("Number of segments predicted: " + str(pos_predictions))
    p = pos_predictions / len(boundaries)
    r = pos_predictions / len(gt_start_times)
    f = (2 * p * r) / (p + r)
    print("\nPrecision: " + str(p), "Recall: " + str(r), "F-Score: " + str(f))
    print("---------------------------------------------------------")

Number of noun phrases: 2826
---------------------------------------------------------

Window size:  60 Step size:  5 K:  30

Length of sequence:  2826
Number of chunks:  554
[  1   4   6   8  13  16  20  23  26  28  32  35  38  44  48  52  56  58
  60  64  67  70  72  75  79  81  85  87  91  93  97  99 103 106 108 110
 115 118 122 124 126 128 131 134 136 139 142 145 149 151 154 157 159 162
 164 167 169 172 176 178 181 186 190 193 198 201 205 208 210 213 215 218
 220 225 228 230 232 234 236 241 246 248 252 256 258 260 263 269 272 275
 277 279 286 289 293 296 301 306 310 312 316 319 322 326 328 332 334 336
 340 344 349 351 353 356 359 362 364 366 369 371 373 376 378 382 386 388
 391 393 396 400 403 405 407 412 415 417 419 424 427 430 434 437 439 442
 452 455 457 464 468 470 474 477 479 483 486 488 492 495 497 500 502 504
 506 509 514 518 520 522 525 527 529 532 534 537 539 541 544 547 549]
no of boundaries 179
Number of segments predicted: 22

Precision: 0.12290502793296089 Recall: 1.1