In [1]:
import sys
sys.path.append('..')
from src.models.pipeline import pipeline
import glob
import json
import os
import pandas as pd
import ast
from datetime import datetime
from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs

In [12]:
groundbase_dir = '../data/raw/groundbase'
transcripts_dir = os.path.join(groundbase_dir,'transcripts')
topic_dataset_path = os.path.join(groundbase_dir,'dataset.csv')
# Sorted so the videos are always processed in the same order; glob itself
# gives no ordering guarantee.
transcript_filespath = sorted(glob.glob(os.path.join(transcripts_dir, '*.json')))


def video_id_from_path(path):
    """Return the video id encoded in a transcript file name.

    The id is the file's base name without its extension, e.g.
    ``'../transcripts/abc.json'`` -> ``'abc'``. ``os.path`` is used instead of
    splitting on a hard-coded backslash so the result does not depend on the
    operating system's path separator.
    """
    return os.path.splitext(os.path.basename(path))[0]


# Read every transcript file into a dict keyed by video id.
transcripts_jsons = {}
for fl in transcript_filespath:
    with open(fl, encoding="utf8") as f:
        # The files are parsed as Python literals rather than with json.load
        # (presumably they are not strict JSON -- TODO confirm).
        transcript = ast.literal_eval(f.read())
    transcripts_jsons[video_id_from_path(fl)] = transcript

# Load the videos metadata table; the segmentation is performed on these videos.
df_videos = pd.read_csv(filepath_or_buffer=topic_dataset_path)

def topic_shifts_seconds(topic_shifts):
    """Convert topic-shift timestamps to whole seconds.

    Parameters
    ----------
    topic_shifts : iterable
        Timestamps written as ``HH:MM:SS``. Shorter forms (``MM:SS`` or
        ``SS``) are accepted as well. Only the first three ':'-separated
        fields are used. Values that are not strings are treated as already
        being expressed in seconds, so converting an already converted
        column is harmless.

    Returns
    -------
    list of int
        One value in seconds per input item, in the same order.

    Raises
    ------
    ValueError
        If a field of a timestamp is not an integer.
    """
    tp_shift_sec = []
    for tp in topic_shifts:
        if not isinstance(tp, str):
            # Already numeric: keep it as seconds.
            tp_shift_sec.append(int(tp))
            continue
        total = 0
        # Fold the fields left to right so that 'SS', 'MM:SS' and 'HH:MM:SS'
        # are all handled by the same code path.
        for field in tp.strip().split(':')[:3]:
            total = total * 60 + int(field)
        tp_shift_sec.append(total)
    return tp_shift_sec


# Replace each video's topic-shift timestamps with their value in seconds.
# The rows are selected with a boolean mask, which has to go through ``.loc``:
# ``.at`` only addresses a single scalar cell.
for video_id in transcripts_jsons:
    video_rows = df_videos['video id'] == video_id
    if not video_rows.any():
        # No metadata rows for this transcript: nothing to convert.
        continue
    df_videos.loc[video_rows, 'topic shifts(ends)'] = topic_shifts_seconds(
        df_videos.loc[video_rows, 'topic shifts(ends)'])

In [16]:
def build_function_to_optimize(workflow_label):
    """Build the objective function to optimize for one workflow.

    Parameters
    ----------
    workflow_label : str
        '-'-separated description of the workflow whose fields are, in order,
        ``segment-vectorize-similarity-added_filter-clustering``, e.g.
        ``sliding_window-lda-cosine-median_(2,2)-spectral_clustering``.
        ``added_filter`` is either ``None`` or ``<filter_type>_<mask_shape>``,
        the mask shape being a Python literal such as ``(2,2)``.

    Returns
    -------
    callable
        A function taking ``window_size, step_size, sim_thresh, n_clusters``
        (plus ``alpha, eta, chunksize, minimum_probability, passes`` for the
        LDA workflows) that returns the result of ``pipeline.run_for_baye``.
        The function reads the module-level ``groundbase`` and ``transcripts``
        variables each time it is called, so they must be set before it runs.

    Raises
    ------
    ValueError
        If the vectorize/similarity/clustering combination is not supported
        (previously ``None`` was returned silently in that case).
    """
    steps_labels = ['segment', 'vectorize', 'similarity', 'added_filter', 'clustering']
    _pipeline = dict(zip(steps_labels, workflow_label.split('-')))

    # Filter settings that do not change between evaluations.
    filter_params = {'filter_type': None, 'mask_shape': None,
                     'sim_thresh': None, 'is_min_thresh': True}
    # if we have image filter
    if _pipeline['added_filter'] != 'None':
        vals = _pipeline['added_filter'].split('_')
        filter_params['filter_type'] = vals[0]
        filter_params['mask_shape'] = ast.literal_eval(vals[1])

    # (vectorize, similarity) -> (takes the LDA hyper-parameters, is_min_thresh)
    supported = {
        ('tfidf', 'cosine'): (False, True),
        ('tf', 'cosine'): (False, True),
        ('lda', 'jensen_shannon'): (True, False),
        ('lda', 'cosine'): (True, True),
        ('do_nothing', 'wmdistance'): (False, False),
    }
    combination = (_pipeline['vectorize'], _pipeline['similarity'])
    if _pipeline['clustering'] != 'spectral_clustering' or combination not in supported:
        raise ValueError('Unsupported workflow: %s' % (workflow_label,))
    uses_lda, is_min_thresh = supported[combination]

    def _run(window_size, step_size, sim_thresh, n_clusters, vectorizing_params=None):
        """Cast the optimizer's float suggestions and run the pipeline once."""
        run_kwargs = {
            'window_size': int(window_size),
            'step_size': int(step_size),
            'vector_method': _pipeline['vectorize'],
            'similarity_method': _pipeline['similarity'],
            # A fresh dict per evaluation so no state leaks between calls.
            'filter_params': dict(filter_params,
                                  sim_thresh=sim_thresh,
                                  is_min_thresh=is_min_thresh),
            'clustering_params': {'algorithm': _pipeline['clustering'],
                                  'n_clusters': int(n_clusters)},
        }
        if vectorizing_params is not None:
            run_kwargs['vectorizing_params'] = vectorizing_params
        # groundbase / transcripts are module-level variables looked up at call time.
        return pipeline.run_for_baye(groundbase, transcripts, **run_kwargs)

    if not uses_lda:
        def _f(window_size, step_size, sim_thresh, n_clusters):
            return _run(window_size, step_size, sim_thresh, n_clusters)

        return _f

    def _f(window_size, step_size, sim_thresh,
           n_clusters, alpha,
           eta, chunksize, minimum_probability, passes):
        vectorizing_params = {'alpha': alpha,
                              'eta': eta,
                              'chunksize': int(chunksize),
                              # A probability: int() would truncate it to 0.
                              'minimum_probability': float(minimum_probability),
                              'passes': int(passes),
                              'n_clusters': int(n_clusters)}
        return _run(window_size, step_size, sim_thresh, n_clusters,
                    vectorizing_params=vectorizing_params)

    return _f

In [17]:
# Location of the saved optimization logs and of the parameter bounds file.
models_path = '../models/bayesian_opt'
params_path = os.path.join(models_path, "parameters_bounds.json")

# The bounds file is parsed as a Python literal; the optimization loop below
# looks the bounds up by workflow label.
with open(params_path, 'r') as bounds_file:
    bounds_text = bounds_file.read()
param_bounds = ast.literal_eval(bounds_text)

In [19]:

# Run the Bayesian optimization of each selected workflow over every video.

# Only the workflows whose label contains '-lda-' are trained here.
workflows = [label for label in param_bounds if '-lda-' in label]

for workflow in workflows:
    print('Training workflow %s' % workflow)
    function_to_optimized = build_function_to_optimize(workflow)

    # Define the optimizer
    optimizer = BayesianOptimization(
        f=function_to_optimized,
        pbounds=param_bounds[workflow],
        verbose=2,
        random_state=1,
    )

    # Reuse the saved log of this workflow when there is one; otherwise start
    # logging every optimization step to a new file.
    model_file_path = os.path.join(models_path, '%s.json' % workflow)
    if not os.path.isfile(model_file_path):
        logger = JSONLogger(path=model_file_path)
        optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)
    else:
        load_logs(optimizer, logs=[model_file_path])

    # Hand-picked starting point, queued lazily so it is evaluated on the
    # next call to maximize().
    initial_guess = {'window_size': 60, 'step_size': 20, 'sim_thresh': 0.4,
                     'alpha': 1e-05, 'eta': 0.005, 'chunksize': 100,
                     'minimum_probability': 0.0, 'passes': 6, 'n_clusters': 10}
    optimizer.probe(params=initial_guess, lazy=True)

    for vid in transcripts_jsons:
        # groundbase and transcripts are the module-level inputs that the
        # objective function reads when it is called: every topic-shift end of
        # the video except the last one, and the video's transcript.
        video_metadata = df_videos.loc[df_videos['video id'] == vid]
        groundbase = video_metadata['topic shifts(ends)'].values.tolist()[:-1]
        transcripts = transcripts_jsons[vid]

        print("Running the algorithm on %s " % vid)
        # Run the algorithm
        optimizer.maximize(init_points=0, n_iter=3)

    print(optimizer.max)

Training workflow sliding_window-lda-cosine-median_(2,2)-spectral_clustering
Running the algorithm on 2mC1uqwEmWQ 
|   iter    |  target   |   alpha   | chunksize |    eta    | minimu... | n_clus... |  passes   | sim_th... | step_size | window... |
-------------------------------------------------------------------------------------------------------------------------------------
|  1        |  0.0      |  0.001    |  99.81    |  0.005    |  0.0      |  12.71    |  5.178    |  0.4096   |  11.6     |  21.96    |
|  2        |  0.0      |  0.001    |  99.45    |  0.005    |  0.0      |  14.88    |  5.11     |  0.5495   |  59.98    |  148.8    |
|  3        |  0.0      |  0.001    |  99.61    |  0.005    |  0.0      |  16.47    |  6.558    |  0.4411   |  58.82    |  21.75    |
|  4        |  0.0      |  0.001    |  99.67    |  0.005    |  0.0      |  16.11    |  5.327    |  0.5834   |  10.29    |  149.9    |
Running the algorithm on B-Xe7_mf2CY 
|   iter    |  target   |   alpha   | chunk

Running the algorithm on Q-HugPvA7GQ 
|   iter    |  target   |   alpha   | chunksize |    eta    | minimu... | n_clus... |  passes   | sim_th... | step_size | window... |
-------------------------------------------------------------------------------------------------------------------------------------
|  11       |  0.0      |  0.001    |  99.69    |  0.005    |  0.0      |  16.38    |  5.495    |  0.5475   |  10.52    |  22.09    |
|  12       |  0.0      |  0.001    |  99.6     |  0.005    |  0.0      |  16.99    |  6.575    |  0.4166   |  10.92    |  148.0    |
|  13       |  0.0      |  0.001    |  80.21    |  0.005    |  0.0      |  16.28    |  5.902    |  0.5062   |  59.48    |  147.8    |
Running the algorithm on x5zLaWT5KPs 
|   iter    |  target   |   alpha   | chunksize |    eta    | minimu... | n_clus... |  passes   | sim_th... | step_size | window... |
------------------------------------------------------------------------------------------------------------------------

{'target': 0.21428571428571427, 'params': {'alpha': 0.001, 'chunksize': 99.03551099411804, 'eta': 0.005, 'minimum_probability': 0.0, 'n_clusters': 14.92049946415079, 'passes': 5.249145956248428, 'sim_thresh': 0.4052191921198328, 'step_size': 59.53678515503128, 'window_size': 20.674565583498428}}
Training workflow sliding_window-lda-jensen_shannon-median_(2,2)-spectral_clustering
Running the algorithm on 2mC1uqwEmWQ 
|   iter    |  target   |   alpha   | chunksize |    eta    | minimu... | n_clus... |  passes   | sim_th... | step_size | window... |
-------------------------------------------------------------------------------------------------------------------------------------
|  1        |  0.0      |  0.001    |  99.65    |  0.005    |  0.0      |  14.54    |  5.437    |  0.5064   |  35.49    |  20.87    |
|  2        |  0.0      |  0.001    |  99.99    |  0.005    |  0.0      |  17.97    |  5.02     |  0.4848   |  59.38    |  64.76    |
|  3        |  0.0      |  0.001    |  60.42



|  15       |  0.0      |  0.001    |  77.59    |  0.005    |  0.0      |  17.77    |  5.454    |  0.3339   |  43.42    |  44.68    |
|  16       |  0.0      |  0.001    |  60.02    |  0.005    |  0.0      |  17.72    |  6.681    |  0.4183   |  41.89    |  82.44    |
Running the algorithm on zWg7U0OEAoE 
|   iter    |  target   |   alpha   | chunksize |    eta    | minimu... | n_clus... |  passes   | sim_th... | step_size | window... |
-------------------------------------------------------------------------------------------------------------------------------------
|  17       |  0.0      |  0.001    |  60.32    |  0.005    |  0.0      |  17.92    |  6.187    |  0.5031   |  55.39    |  122.8    |
|  18       |  0.0      |  0.001    |  99.79    |  0.005    |  0.0      |  12.54    |  5.238    |  0.3264   |  38.25    |  134.9    |
|  19       |  0.0      |  0.001    |  61.32    |  0.005    |  0.0      |  17.46    |  5.246    |  0.4757   |  33.7     |  146.4    |
{'target': 0.21428571428

Running the algorithm on B-Xe7_mf2CY 
|   iter    |  target   |   alpha   | chunksize |    eta    | minimu... | n_clus... |  passes   | sim_th... | step_size | window... |
-------------------------------------------------------------------------------------------------------------------------------------
|  5        |  0.0      |  0.001    |  60.41    |  0.005    |  0.0      |  15.52    |  5.095    |  0.4472   |  59.84    |  147.2    |
|  6        |  0.0      |  0.001    |  60.41    |  0.005    |  0.0      |  15.52    |  5.095    |  0.4472   |  59.84    |  147.2    |
|  7        |  0.0      |  0.001    |  97.54    |  0.005    |  0.0      |  14.43    |  5.295    |  0.5298   |  58.84    |  20.26    |
Running the algorithm on MkiUBJcgdUY 
|   iter    |  target   |   alpha   | chunksize |    eta    | minimu... | n_clus... |  passes   | sim_th... | step_size | window... |
------------------------------------------------------------------------------------------------------------------------



|  10       |  0.0      |  0.001    |  99.86    |  0.005    |  0.0      |  17.92    |  6.983    |  0.4713   |  59.59    |  20.91    |
Running the algorithm on Q-HugPvA7GQ 
|   iter    |  target   |   alpha   | chunksize |    eta    | minimu... | n_clus... |  passes   | sim_th... | step_size | window... |
-------------------------------------------------------------------------------------------------------------------------------------
|  11       |  0.0      |  0.001    |  99.5     |  0.005    |  0.0      |  14.93    |  6.658    |  0.4599   |  58.39    |  149.8    |
|  12       |  0.0      |  0.001    |  99.71    |  0.005    |  0.0      |  17.77    |  6.946    |  0.48     |  11.61    |  20.66    |
|  13       |  0.0      |  0.001    |  60.42    |  0.005    |  0.0      |  16.28    |  5.902    |  0.5062   |  59.48    |  147.8    |
Running the algorithm on x5zLaWT5KPs 
|   iter    |  target   |   alpha   | chunksize |    eta    | minimu... | n_clus... |  passes   | sim_th... | step_size 