In [1]:
import sys
sys.path.append('..')
from src.models.pipeline import pipeline
import glob
import json
import os
import pandas as pd
import ast
from datetime import datetime
from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs

In [2]:
groundbase_dir = '../data/raw/groundbase'
transcripts_dir = os.path.join(groundbase_dir, 'transcripts')
topic_dataset_path = os.path.join(groundbase_dir, 'dataset.csv')
# Sorted so the videos are visited in the same order on every platform
# (glob's native order depends on the file system).
transcript_filespath = sorted(glob.glob(os.path.join(transcripts_dir, '*.json')))


def video_id_from_path(path):
    """Return the video id encoded in a transcript file name.

    The id is the file's base name up to its first dot, e.g.
    ``'.../transcripts/2mC1uqwEmWQ.json'`` -> ``'2mC1uqwEmWQ'``.
    ``os.path.basename`` is used so the platform's own path separator is
    honoured; splitting on a hard-coded backslash only works on Windows and
    yields an empty id for POSIX paths that start with ``..``.
    """
    return os.path.basename(path).split('.', 1)[0]


# Read every transcript into memory, keyed by video id.
transcripts_jsons = {}
for fl in transcript_filespath:
    with open(fl, encoding="utf8") as f:
        # Parsed as a Python literal rather than with json.load.
        # NOTE(review): presumably the files are Python-literal dumps and not
        # strict JSON -- confirm before switching parsers.
        transcript = ast.literal_eval(f.read())
    vid = video_id_from_path(fl)
    transcripts_jsons[vid] = transcript

'''Read the videos metadata to perform on them the segmentation'''
# Load dataset.csv into a DataFrame. Later cells select rows through its
# 'video id' column and read/overwrite its 'topic shifts(ends)' column.
df_videos = pd.read_csv(topic_dataset_path)

''' Transfer topic shifts time to seconds units instead HH:MM:SS'''

def topic_shifts_seconds(topic_shifts):
    """Convert topic-shift timestamps to whole seconds.

    Parameters
    ----------
    topic_shifts : iterable
        Timestamps written as ``HH:MM:SS``. The shorter ``MM:SS`` and ``SS``
        forms are accepted as well. Items that are not strings are treated as
        already being a number of seconds and are passed through ``int``, so
        running the conversion a second time on converted data is harmless.

    Returns
    -------
    list of int
        One value in seconds per input item, in input order.

    Raises
    ------
    ValueError
        If a timestamp has more than three ``:``-separated fields or a field
        is not an integer.
    """
    tp_shift_sec = []
    for tp in topic_shifts:
        if not isinstance(tp, str):
            # Already numeric (e.g. the conversion cell was re-run).
            tp_shift_sec.append(int(tp))
            continue

        fields = tp.strip().split(':')
        if len(fields) > 3:
            raise ValueError('Expected a HH:MM:SS timestamp, got %r' % (tp,))

        # Fold left to right: each further field is 60x finer than the last,
        # which handles HH:MM:SS, MM:SS and SS with the same arithmetic.
        total = 0
        for field in fields:
            total = total * 60 + int(field)
        tp_shift_sec.append(total)
    return tp_shift_sec


# Overwrite each video's topic-shift timestamps with their value in seconds.
# The rows are selected with a boolean mask, so `.loc` is the right accessor:
# `.at` addresses exactly one scalar cell and is not meant for mask/list
# assignment.
for video_id in transcripts_jsons:
    video_rows = df_videos['video id'] == video_id
    if not video_rows.any():
        # A transcript without metadata rows: nothing to convert.
        continue
    df_videos.loc[video_rows, 'topic shifts(ends)'] = topic_shifts_seconds(
        df_videos.loc[video_rows, 'topic shifts(ends)'])

In [34]:
'''Create an array of functions to optimize'''
def build_function_to_optimize(workflow_label):
    """Build the objective function for one workflow label.

    Parameters
    ----------
    workflow_label : str
        Five ``-``-separated step names, in the order
        segment-vectorize-similarity-added_filter-clustering, e.g.
        ``'sliding_window-tfidf-cosine-median_(2,2)-spectral_clustering'``.
        The filter step is either ``'None'`` or ``'<type>_<mask shape>'``,
        where the mask shape is a Python literal such as ``(2,2)``.

    Returns
    -------
    callable
        ``_f(window_size, step_size, sim_thresh, n_clusters)``, which forwards
        to ``pipeline.run_for_baye`` and returns its result. ``_f`` reads the
        module-level names ``groundbase`` and ``transcripts`` at call time, so
        they must be assigned before the function is evaluated.

    Raises
    ------
    ValueError
        If the label does not have exactly five steps, or if its
        similarity/clustering pair is not supported (only cosine similarity
        with spectral clustering is). Failing here is clearer than silently
        handing ``None`` to the optimizer.
    """
    steps_labels = ['segment', 'vectorize', 'similarity', 'added_filter', 'clustering']
    pipeline_steps_values = workflow_label.split('-')
    if len(pipeline_steps_values) != len(steps_labels):
        raise ValueError(
            'Workflow label %r must have %d "-"-separated steps (%s)'
            % (workflow_label, len(steps_labels), '-'.join(steps_labels)))
    _pipeline = dict(zip(steps_labels, pipeline_steps_values))

    # Filter settings that do not depend on the optimised parameters.
    base_filter_params = {'filter_type': None, 'mask_shape': None,
                          'sim_thresh': None, 'is_min_thresh': True}
    if _pipeline['added_filter'] != 'None':
        vals = _pipeline['added_filter'].split('_')
        base_filter_params['filter_type'] = vals[0]
        base_filter_params['mask_shape'] = ast.literal_eval(vals[1])

    # Cosine similarity + spectral clustering
    if _pipeline['similarity'] == 'cosine' and _pipeline['clustering'] == 'spectral_clustering':
        def _f(window_size, step_size, sim_thresh, n_clusters):
            # The optimizer proposes floats; these three are truncated to int
            # before being handed to the pipeline.
            window_size = int(window_size)
            step_size = int(step_size)
            n_clusters = int(n_clusters)
            # Fresh dict per evaluation so one call never alters the
            # parameters a previous call handed to the pipeline.
            filter_params = dict(base_filter_params, sim_thresh=sim_thresh)

            return pipeline.run_for_baye(groundbase, transcripts,
                                window_size=window_size, step_size=step_size,
                                vector_method=_pipeline['vectorize'],
                                similarity_method=_pipeline['similarity'],
                                filter_params=filter_params,
                                clustering_params={'algorithm': _pipeline['clustering'],
                                                   'n_clusters': n_clusters}
                               )

        return _f

    raise ValueError(
        'Unsupported workflow %r: only similarity "cosine" with clustering '
        '"spectral_clustering" is implemented' % (workflow_label,))

In [35]:
# Search ranges as (lower, upper) pairs, keyed by workflow label. Each inner
# mapping is handed to BayesianOptimization as `pbounds` for that workflow.
param_bounds = dict([
    (
        'sliding_window-tfidf-cosine-median_(2,2)-spectral_clustering',
        dict(
            window_size=(20, 200),
            step_size=(10, 60),
            sim_thresh=(0.2, 0.9),
            n_clusters=(10, 18),
        ),
    ),
    (
        'sliding_window-tfidf-cosine-median_(3,3)-spectral_clustering',
        dict(
            window_size=(20, 150),
            step_size=(10, 60),
            sim_thresh=(0.3, 0.6),
            n_clusters=(12, 18),
        ),
    ),
    (
        'sliding_window-tfidf-cosine-None-spectral_clustering',
        dict(
            window_size=(20, 150),
            step_size=(10, 60),
            sim_thresh=(0.3, 0.6),
            n_clusters=(12, 18),
        ),
    ),
])

In [38]:
models_path = '../models/bayesian_opt'
# Create the log directory up front so the per-workflow log path below is
# usable on a fresh checkout.
os.makedirs(models_path, exist_ok=True)

# Run the optimization for every configured workflow, video by video.
workflows = list(param_bounds.keys())

for workflow in workflows:
    print('Training workflow %s' %(workflow))
    function_to_optimized = build_function_to_optimize(workflow)

    # One optimizer per workflow. It is reused for every video in the inner
    # loop, so the observations of all videos accumulate in the same instance.
    optimizer = BayesianOptimization(
                f=function_to_optimized,
                pbounds=param_bounds[workflow],
                verbose=2,
                random_state=1
                )

    # Resume from an existing log if there is one, otherwise start logging.
    # NOTE(review): when a log is loaded no JSONLogger is subscribed, so the
    # steps of this run are not written to the file -- confirm whether that is
    # intended before relying on the logs for resuming.
    model_file_path = os.path.join(models_path,('%s.json' %(workflow)))
    if os.path.isfile(model_file_path):
        load_logs(optimizer,logs=[model_file_path])
    else:
        logger = JSONLogger(path=model_file_path)
        optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

    for vid in transcripts_jsons.keys():
        # The objective built above reads the module-level names `groundbase`
        # and `transcripts` when it is called, so they are rebound here to
        # point it at the current video.
        video_metadata = df_videos.loc[df_videos['video id'] == vid]
        # All of the video's topic shifts except the last one.
        groundbase = video_metadata['topic shifts(ends)'].values.tolist()[:-1]
        transcripts = transcripts_jsons[vid]

        print("Running the algorithm on %s " %(vid))
        # 1 initial point followed by 7 optimization iterations per video.
        optimizer.maximize(
            init_points = 1,
            n_iter = 7
        )

Training workflow sliding_window-tfidf-cosine-median_(2,2)-spectral_clustering
Running the algorithm on 2mC1uqwEmWQ 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------
|  1        |  0.0      |  18.0     |  0.2      |  31.16    |  46.41    |
|  2        |  0.0      |  17.24    |  0.3782   |  59.72    |  197.9    |




|  3        |  0.0      |  17.84    |  0.8651   |  35.14    |  107.5    |
|  4        |  0.3333   |  11.15    |  0.4821   |  10.13    |  45.7     |




|  5        |  0.0      |  10.0     |  0.9      |  60.0     |  44.4     |




|  6        |  0.0      |  10.0     |  0.2      |  35.17    |  20.0     |
|  7        |  0.3333   |  18.0     |  0.2      |  60.0     |  156.6    |
|  8        |  0.2      |  10.0     |  0.2      |  10.0     |  101.1    |
Running the algorithm on B-Xe7_mf2CY 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------
|  9        |  0.2      |  10.0     |  0.2      |  10.0     |  101.1    |




|  10       |  0.0      |  10.0     |  0.9      |  40.82    |  69.01    |




|  11       |  0.0      |  10.0     |  0.9      |  60.0     |  146.4    |
|  12       |  0.1667   |  10.26    |  0.8336   |  10.36    |  192.0    |
|  13       |  0.0      |  10.13    |  0.3622   |  11.55    |  156.3    |




|  14       |  0.0      |  18.0     |  0.9      |  38.78    |  168.3    |
|  15       |  0.1111   |  17.78    |  0.5598   |  59.96    |  118.0    |
|  16       |  0.1111   |  17.99    |  0.5063   |  11.81    |  55.0     |
Running the algorithm on MkiUBJcgdUY 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------
|  17       |  0.1111   |  17.99    |  0.5063   |  11.81    |  55.0     |




|  18       |  0.0      |  18.0     |  0.9      |  10.0     |  34.89    |




|  19       |  0.0      |  18.0     |  0.9      |  48.42    |  87.57    |
|  20       |  0.05556  |  10.0     |  0.9      |  10.0     |  127.6    |




|  21       |  0.2222   |  18.0     |  0.9      |  10.0     |  93.0     |
|  22       |  0.0      |  18.0     |  0.2      |  60.0     |  171.7    |




|  23       |  0.0      |  10.0     |  0.9      |  22.94    |  38.09    |




|  24       |  0.0      |  17.86    |  0.2654   |  49.13    |  34.9     |
Running the algorithm on Q-HugPvA7GQ 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------
|  25       |  0.07143  |  11.64    |  0.8147   |  11.37    |  140.7    |
|  26       |  0.0      |  18.0     |  0.2      |  17.21    |  152.1    |
|  27       |  0.0      |  10.0     |  0.2      |  43.99    |  119.0    |




|  28       |  0.0      |  10.44    |  0.6942   |  59.16    |  79.31    |
|  29       |  0.1429   |  10.0     |  0.2      |  29.05    |  168.8    |
|  30       |  0.07143  |  10.0     |  0.2      |  60.0     |  200.0    |




|  31       |  0.0      |  10.21    |  0.7945   |  20.83    |  59.47    |
|  32       |  0.07143  |  10.0     |  0.9      |  10.0     |  200.0    |
Running the algorithm on x5zLaWT5KPs 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------
|  33       |  0.4286   |  13.34    |  0.5911   |  17.02    |  55.66    |
|  34       |  0.0      |  18.0     |  0.2      |  28.35    |  125.2    |
|  35       |  0.2857   |  10.0     |  0.2      |  35.07    |  93.89    |




|  36       |  0.0      |  18.0     |  0.9      |  24.09    |  20.0     |
|  37       |  0.2857   |  10.0     |  0.2      |  23.71    |  107.8    |
|  38       |  0.0      |  18.0     |  0.9      |  10.0     |  185.5    |
|  39       |  0.0      |  18.0     |  0.2      |  25.25    |  94.27    |




|  40       |  0.2857   |  10.0     |  0.9      |  46.59    |  188.7    |
Running the algorithm on zWg7U0OEAoE 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------




|  41       |  0.0      |  16.41    |  0.8778   |  25.67    |  144.6    |




|  42       |  0.0      |  18.0     |  0.9      |  45.49    |  20.0     |




|  43       |  0.0      |  17.94    |  0.6927   |  51.3     |  52.85    |
|  44       |  0.1538   |  18.0     |  0.2      |  10.0     |  134.2    |
|  45       |  0.0      |  10.42    |  0.7398   |  10.2     |  57.99    |
|  46       |  0.0      |  18.0     |  0.2      |  19.0     |  47.73    |




|  47       |  0.0      |  10.0     |  0.9      |  44.79    |  200.0    |
|  48       |  0.0      |  18.0     |  0.2      |  32.03    |  74.24    |
Training workflow sliding_window-tfidf-cosine-median_(3,3)-spectral_clustering
Running the algorithm on 2mC1uqwEmWQ 




Running the algorithm on B-Xe7_mf2CY 




Running the algorithm on MkiUBJcgdUY 




Running the algorithm on Q-HugPvA7GQ 




Running the algorithm on x5zLaWT5KPs 




Running the algorithm on zWg7U0OEAoE 




Training workflow sliding_window-tfidf-cosine-None-spectral_clustering
Running the algorithm on 2mC1uqwEmWQ 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------
|  1        |  0.0      |  18.0     |  0.6      |  60.0     |  76.69    |




|  2        |  0.0      |  18.0     |  0.6      |  10.0     |  20.0     |




|  3        |  0.4667   |  18.0     |  0.6      |  10.0     |  98.83    |




|  4        |  0.2667   |  18.0     |  0.3      |  60.0     |  127.8    |




|  5        |  0.4      |  18.0     |  0.6      |  30.23    |  88.19    |




|  6        |  0.0      |  18.0     |  0.6      |  40.32    |  150.0    |




|  7        |  0.0      |  18.0     |  0.6      |  36.7     |  46.11    |
|  8        |  0.0      |  18.0     |  0.6      |  10.0     |  120.8    |
Running the algorithm on B-Xe7_mf2CY 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------




|  9        |  0.0      |  12.88    |  0.3277   |  19.31    |  64.92    |




|  10       |  0.0      |  18.0     |  0.6      |  60.0     |  103.1    |




|  11       |  0.1667   |  18.0     |  0.6      |  31.71    |  107.1    |




|  12       |  0.2222   |  18.0     |  0.6      |  10.0     |  80.65    |




|  13       |  0.0      |  12.0     |  0.6      |  60.0     |  47.91    |




|  14       |  0.0      |  12.19    |  0.4447   |  10.0     |  38.13    |




|  15       |  0.05556  |  12.0     |  0.6      |  60.0     |  150.0    |




|  16       |  0.0      |  12.0     |  0.6      |  34.81    |  20.0     |
Running the algorithm on MkiUBJcgdUY 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------




|  17       |  0.1667   |  14.38    |  0.4616   |  30.96    |  109.1    |




|  18       |  0.05556  |  12.0     |  0.6      |  10.0     |  95.04    |




|  19       |  0.0      |  17.79    |  0.5205   |  41.38    |  71.08    |




|  20       |  0.2222   |  18.0     |  0.6      |  40.96    |  130.1    |




|  21       |  0.0      |  12.0     |  0.6      |  43.89    |  91.5     |




|  22       |  0.0      |  18.0     |  0.3      |  19.47    |  95.41    |




|  23       |  0.0      |  18.0     |  0.6      |  51.08    |  31.99    |




|  24       |  0.2222   |  18.0     |  0.6      |  15.99    |  49.54    |
Running the algorithm on Q-HugPvA7GQ 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------
|  25       |  0.1429   |  13.23    |  0.5634   |  11.37    |  107.2    |




|  26       |  0.2143   |  17.8     |  0.5424   |  54.6     |  141.7    |




|  27       |  0.0      |  18.0     |  0.3      |  44.72    |  112.2    |




|  28       |  0.0      |  18.0     |  0.3      |  25.1     |  31.12    |




|  29       |  0.0      |  12.0     |  0.6      |  60.0     |  116.8    |




|  30       |  0.1429   |  12.0     |  0.6      |  22.18    |  138.0    |




|  31       |  0.2143   |  18.0     |  0.6      |  26.8     |  121.8    |




|  32       |  0.1429   |  18.0     |  0.6      |  29.02    |  78.32    |
Running the algorithm on x5zLaWT5KPs 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------




|  33       |  0.3571   |  14.5     |  0.4676   |  17.02    |  45.75    |




|  34       |  0.0      |  12.04    |  0.519    |  29.55    |  40.87    |




|  35       |  0.0      |  17.97    |  0.3881   |  59.78    |  57.41    |




|  36       |  0.0      |  18.0     |  0.6      |  39.71    |  86.01    |




|  37       |  0.0      |  12.0     |  0.6      |  46.8     |  59.08    |




|  38       |  0.0      |  12.0     |  0.6      |  23.33    |  150.0    |




|  39       |  0.3571   |  12.0     |  0.6      |  30.93    |  86.15    |




|  40       |  0.4286   |  12.0     |  0.6      |  10.0     |  50.86    |
Running the algorithm on zWg7U0OEAoE 
|   iter    |  target   | n_clus... | sim_th... | step_size | window... |
-------------------------------------------------------------------------




|  41       |  0.1538   |  16.8     |  0.5905   |  25.67    |  110.0    |




|  42       |  0.0      |  18.0     |  0.3      |  46.33    |  20.0     |




|  43       |  0.0      |  12.0     |  0.6      |  19.5     |  51.7     |




|  44       |  0.07692  |  12.0     |  0.3      |  10.0     |  138.0    |




|  45       |  0.1538   |  12.0     |  0.3      |  60.0     |  89.22    |




|  46       |  0.0      |  12.0     |  0.6      |  10.0     |  71.89    |




|  47       |  0.0      |  12.0     |  0.3      |  60.0     |  68.6     |




|  48       |  0.0      |  18.0     |  0.3      |  10.0     |  45.23    |


In [7]:
# Scratch check of %-style string formatting (evaluates to '1 2'); it assigns
# nothing and is unrelated to the optimization cells.
("%s %s" %(1,2))

'1 2'