We tested our segmentation method on Cornell movie dialogue data set. And in this notebook we validated our method on original movie scripts.

In [295]:
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import numpy as np

## 1 Parse the scripts and segment dialogues

The scripts are not well formatted: even though they are in html format, actually the lines are not well constructed in tags. We cannot parse them with tags. Fortunately, every characters and changes of scenes are bolded with tag 'b', and we have files containing characters information. So first we extract all bolded lines, and substract characters, then changes of scenes are left, and then we can know the boundaries of dialogues.

In [234]:
folder_path = '/Users/yan/Documents/document/EPFL/MA2/semesterprj/datasets/scripts/code/opensubs-turns/'
path = folder_path+'scripts'

files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(path):
    for file in f:
        if (file.endswith('.html')) & (~file.startswith('fd')) & (~file.startswith('tbbt')) & (~file.startswith('friends')):
            test_path = folder_path+'sentences/'+file.split('.html')[0]+'-speakers.txt'
            if os.path.exists(test_path):
                files.append(file.split('.html')[0])


In [329]:
def Parser(files,j):
    """
    This function is used to parse the scripts in .html.
    Inputs:
        files: List of scripts' directory.
        j: index
    """
    file_path = folder_path+'sentences/'+files[j]+'-speakers.txt' # read the file with speakers in that script

    f = open(file_path)               
    lines = f.readlines()               
    tags_speakers = []
    for line in lines: 
        tags_speakers.append(line.replace('continued','').upper().rstrip())
    tags_speakers = set(tags_speakers) # characters in the scripts
    
    file_path = folder_path+'scripts/'+files[j]+'.html'
    soup = BeautifulSoup(open(file_path, errors='ignore'))
    
    tags = []
    for a in soup.find_all('b'):
        tags.append(a.string.rstrip().lstrip())
    tags = set(tags) # all bolded tags
    tags_background = tags - tags_speakers # only changes of scenes are left

    texts = [' '.join(x.rstrip().lstrip().split('\n\n')[0].split()) for x in soup.strings if str.strip(x) != '']

    idxs = []
    idxs_bg = []
    speaker = []
    lines = []
    for i in range(len(texts)):
        if texts[i] in tags_background:
            idxs_bg.append(i)
        if texts[i] not in tags_speakers:
            continue
        else:
            speaker.append(texts[i])
            line = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]|\\♪.*?♪|\\#.*?#|\\=.*?=|\\¶.*?¶", "", texts[i+1])
            lines.append(line)
            idxs.append(i)
            
    s = pd.Series(idxs)
    boundaries = pd.cut(s,idxs_bg, labels=False, retbins=False, right=False).get_values()
    boundaries = [1]+list((boundaries[1:] != boundaries[:-1])*1)
    
    MovieID = ['m%s'%(str(j))] * len(lines)
    MovieName = [files[j]] * len(lines)

    dialogue = pd.DataFrame([speaker,lines,boundaries,MovieID,MovieName]).T
    dialogue.columns = ['Speaker','Line','Label','MovieID','MoveiName']
    dialogue.to_csv(folder_path+'parsed/'+MovieID[0]+'.txt',sep=',', index=False, header=False)


In [333]:
for j in range(len(files)):
    try:
        Parser(files,j)
    except:
        print(j,files[j])

9 Crow-Salvation,-The
12 Star-Trek-First-Contact
23 i-walked_with_a_zombie
41 Who-Framed-Roger-Rabbit%3f
42 Platoon
43 the-x-files_production
51 Pitch-Black
56 pet-sematary
62 thethinman
93 Leaving-Las-Vegas
134 Stepmom
136 oneflewover
142 Star-Trek-Generations
147 Bones
149 Minority-Report
150 natural-born-killers_early
170 Buffy-the-Vampire-Slayer
177 Star-Trek-The-Motion-Picture
179 fivefeetandrising
181 Crying-Game
193 Clueless
197 natural-born-killers_shoot
215 Tremors
243 Sixth-Sense,-The
256 John-Q
273 Orgy-of-the-Dead
284 Almost-Famous
288 Blast-from-the-Past,-The
293 Anastasia
296 Memento
298 Aladdin
303 fabulous_baker_boys_final
305 Blade-II
308 hellraiser_ii
313 mission-impossible-2_shoot
315 English-Patient,-The
322 Independence-Day
340 halloween
364 Apartment,-The
368 kundun
373 Shampoo
374 True-Romance
378 Star-Trek-II-The-Wrath-of-Khan
390 Life-As-A-House
413 Little-Mermaid,-The
427 Red-Planet
432 Withnail-and-I
434 Heavy-Metal
436 Pearl-Harbor
442 thetimemachine_1959
45

## 2 Test the similarity segmentation

In [341]:
import matplotlib.pyplot as plt
from prettytable import PrettyTable
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize 
from gensim.models import Word2Vec
model = Word2Vec.load('./word2vec/model_opst')
tokenizer = RegexpTokenizer(r'\w+')

In [344]:
def corpus_clean(s):
#     s = s.replace('\'','')
    s = tokenizer.tokenize(s.lower())
#     s = ' '.join([ps.stem(x) for x in s])
    s = ' '.join(s)
    return s

In [347]:
def cos_sim(v1,v2):
    return v1.dot(v2)/(np.linalg.norm(v1)*np.linalg.norm(v2))

def heuristic_max(s1,s2,flag=False,model=model.wv):
    if len(s1)*len(s2) == 0:
        return np.nan

    matrix = np.zeros((len(s1),len(s2)))
    for i in range(len(s1)):
        if s1[i] in model.vocab.keys():
            s1_vec = model[s1[i]]
        else:
            continue # if the source target word is not in vocabulary list then corresponding similiarity row = 0
        for j in range(len(s2)):            
            if s2[j] in model.vocab.keys():
                s2_vec = model[s2[j]]
                matrix[i][j] = cos_sim(s1_vec,s2_vec)
            else:
                continue # for not-found words the similarity is 0
    s1_sim =  np.sum(np.max(matrix,1))/len(s1)
    s2_sim = np.sum(np.max(matrix,0))/len(s2)
    return 1/2*(np.round(s1_sim,5)+np.round(s2_sim,5))

In [376]:
# metrics
def calculate_p_k(p,r,s_size):
    P = np.ones(s_size)
    P[p:] = 2

    R = np.ones(s_size)
    R[r:] = 2
    
    k = int(s_size/2)

    delta_R = np.array([int(R[i]==R[i+k]) for i in range(len(R)-k)])
    delta_P = np.array([int(P[i]==P[i+k]) for i in range(len(P)-k)])

    P_k = sum(~(delta_R==delta_P)*1)/(len(R)-k)
    return P_k

def calculate_MAE(x,s_size):
    x[x<0] = s_size
    return x

In [348]:
def th_pred(x,alpha):
    x_ = x[x!=0]
    if len(x_)==0:
        th = 0
        label = [0]
    else:
        th = np.mean(x_)-alpha*np.std(x_)
        label = np.where(x<=th)[0]
        label = label if len(label)>0 else [0] # if th is too low, then the predicted label is the whole session
    return label[0]

def deriv_pred(x):
    tmp = np.where((x[1:]-x[:-1])>0)[0]
    if len(tmp)==0:
        return 0
    else:
        return tmp[0]

In [380]:
def texttiling_embedding(labels,smooth_score):
    s_dict = {}
    start_idx = np.where(labels==1)[0]

    s_size = 10

    for i in start_idx:
        s = smooth_score[i:i+s_size]
        block_label = np.where(labels[i:i+s_size])[0]
        if len(block_label) < 2:
            continue
        depth_score = [0]
        lpeak = s[0]
        for k in range(1,len(s)):
    #         idx = max(0,k-block_size)
            lpeak = max(s[0:k+1])
            depth_score.append(s[k]-lpeak)
        s_dict[i] = {}
        s_dict[i]['depth score'] = np.round(np.array(depth_score),5)
        s_dict[i]['smooth score'] = s
        s_dict[i]['depth mean'] = np.mean(depth_score)
        s_dict[i]['depth std'] = np.std(depth_score)
        s_dict[i]['block label'] = block_label
    s_df = pd.DataFrame.from_dict(s_dict).T  

    # block_labels = s_df['block label'].apply(lambda x:x[1])
#     length = s_df['length'].get_values()

    table = PrettyTable()
    table.field_names = ['alpha','ACC','MAE','P_k','Random ACC','Random MAE','Random P_k']

    # nltk.texttiling: mean - std/2
    alphas = [0.5]
    for alpha in alphas:
        block_labels = []
        pred = s_df['depth score'].apply(lambda x: th_pred(x,alpha))
        delta = s_df['depth score'].apply(lambda x: next(iter(np.where((x[1:]-x[:-1])>0)[0]), 0))
        s_df['pred'] = np.max((pred.get_values(),delta.get_values()),axis=0) 
        s_df.pred.replace(0,s_size-1,inplace=True) # predict = 0 means not found and set label to len of session
        for i in range(len(pred)): # find the nearest label to prediction
            block_labels.append(min(s_df['block label'].iloc[i][1:], key=lambda x:abs(x-s_df['pred'].iloc[i])))
        diff = block_labels - pred # diff>0 when the prediction before ground truth

        random_pred = np.random.randint(low=1,high=s_size+1,size=len(block_labels))
        random_diff = block_labels - random_pred

        acc = sum(block_labels==pred)/len(pred)
        random_acc = sum(block_labels==random_pred)/len(random_pred)
        
        p_k = np.mean([calculate_p_k(p,r,s_size) for (p,r) in zip(block_labels,pred)])
        random_p_k = np.mean([calculate_p_k(p,r,s_size) for (p,r) in zip(block_labels,random_pred)])
        
        table.add_row([alpha,
                       round(acc,3),
                      round(abs(calculate_MAE(diff,s_size)).mean(),3), # MAE
                       round(p_k,3),
                       round(random_acc,3),
                      round(abs(calculate_MAE(random_diff,s_size)).mean(),3), # random MAE
                       round(random_p_k,3)]
                     )
#     print(table)
    return round(acc,3),round(abs(calculate_MAE(diff,s_size)).mean(),3),round(p_k,3)

In [391]:
folder_path = '/Users/yan/Documents/document/EPFL/MA2/semesterprj/datasets/scripts/code/opensubs-turns/parsed/'

files = []
# r=root, d=directories, f = files
for r, d, f in os.walk(folder_path):
    for file in f:
        if file.endswith('.txt'):
            files.append(folder_path+file)


In [392]:
len(files)

946

In [None]:
accs = []
MAEs = []
p_ks = []


In [434]:
for j in range(499,len(files)):
    try:
        df = pd.read_csv(files[j],header=None)
        df.fillna('nan',inplace=True)
        df.columns = ['Speaker','Line','Label','MovieID','MoveiName']

        sim = [heuristic_max(df.Line.loc[i],
                             df.Line.loc[i+1]) for i in range(len(df)-1)]
        df['sim'] = [0]+sim

        scores = df.sim.get_values()
        labels = df.Label.get_values()

        acc,MAE,p_k = texttiling_embedding(labels,scores)

        accs.append(acc)
        MAEs.append(MAE)
        p_ks.append(p_k)
        print(j,files[j].split('/')[-1],acc,MAE,p_k)
    except:
        continue

499 m604.txt 0.349 3.616 0.286
500 m162.txt 0.291 4.649 0.305
501 m176.txt 0.331 4.216 0.276
502 m610.txt 0.405 4.071 0.184
503 m823.txt 0.1 4.1 0.397
504 m189.txt 0.263 5.0 0.358
505 m837.txt 0.364 4.0 0.248
506 m994.txt 0.267 4.32 0.331
507 m758.txt 0.482 2.816 0.137
508 m980.txt 0.286 2.786 0.357
509 m770.txt 0.292 4.651 0.291
510 m764.txt 0.387 3.247 0.215
511 m943.txt 0.205 4.53 0.39
512 m228.txt 0.083 5.525 0.413
513 m214.txt 0.345 4.029 0.255
514 m572.txt 0.291 4.013 0.291
515 m200.txt 0.387 4.071 0.211
516 m1004.txt 0.15 6.05 0.467
517 m34.txt 0.495 3.37 0.162
518 m599.txt 0.348 3.56 0.248
519 m1010.txt 0.699 1.887 0.069
520 m20.txt 0.366 3.269 0.248
521 m1038.txt 0.256 4.966 0.266
522 m1039.txt 0.893 0.143 0.029
523 m21.txt 0.184 5.105 0.374
524 m1011.txt 0.146 5.539 0.391
525 m598.txt 0.357 4.786 0.314
526 m35.txt 0.347 3.081 0.206
527 m1005.txt 0.311 3.992 0.234
528 m567.txt 0.198 4.296 0.309
529 m201.txt 0.417 4.917 0.317
530 m573.txt 0.248 4.497 0.291
531 m229.txt 0.415 4.

768 m128.txt 0.323 3.581 0.229
769 m882.txt 0.353 3.545 0.259
770 m316.txt 0.355 3.682 0.29
771 m470.txt 0.454 3.051 0.191
772 m464.txt 0.36 5.167 0.219
773 m302.txt 0.247 4.965 0.365
774 m458.txt 0.22 4.232 0.344
775 m328.txt 0.409 3.79 0.214
776 m472.txt 0.455 2.622 0.147
777 m314.txt 0.305 4.53 0.245
778 m300.txt 0.169 5.644 0.38
779 m466.txt 0.273 4.564 0.331
780 m499.txt 0.281 5.067 0.303
781 m894.txt 0.065 5.774 0.445
782 m880.txt 0.351 3.294 0.231
783 m658.txt 0.307 4.423 0.314
784 m116.txt 0.232 4.634 0.352
785 m670.txt 0.359 2.859 0.189
786 m664.txt 0.343 4.412 0.251
787 m102.txt 0.083 2.417 0.383
788 m857.txt 0.374 3.732 0.241
789 m843.txt 0.458 3.176 0.189
790 m738.txt 0.232 4.735 0.305
791 m704.txt 0.384 3.59 0.227
792 m710.txt 0.143 5.143 0.543
793 m923.txt 0.33 3.33 0.218
794 m937.txt 0.414 3.355 0.2
795 m83.txt 0.244 5.232 0.291
796 m248.txt 0.071 6.607 0.341
797 m97.txt 0.245 4.592 0.341
798 m506.txt 0.348 3.843 0.259
799 m260.txt 0.348 4.217 0.365
800 m274.txt 0.227 4.

In [441]:
len(accs)

942

In [438]:
np.mean(accs),np.std(accs)

(0.29023460721868366, 0.12428605092344469)

In [439]:
np.mean(MAEs),np.std(MAEs)

(4.342491507430998, 1.0184881657490008)

In [440]:
np.mean(p_ks),np.std(p_ks)

(0.2949405520169851, 0.08633173483985833)