#### model files

[audio-event-recognition/fsd-sinet/](https://essentia.upf.edu/models/audio-event-recognition/fsd-sinet/)    

!wget -q https://essentia.upf.edu/models/audio-event-recognition/fsd-sinet/fsd-sinet-vgg41-tlpf-1.pb

!wget -q https://essentia.upf.edu/models/audio-event-recognition/fsd-sinet/fsd-sinet-vgg41-tlpf-1.json


<https://mtg.github.io/essentia-labs/news/tensorflow/2023/02/08/fsdsinet-models/>
    
    tlpf : Trainable Low-Pass Filters
    aps : Adaptive Polyphase Sampling

    fsd-sinet-vgg42-tlpf_aps-1 - best
    fsd-sinet-vgg41-tlpf-1 - lighter


In [None]:
import essentia
print(essentia.__version__)
print(essentia.__file__)
import essentia.standard as es

# let's have a look at what is in there
#print(dir(essentia.standard))


import threading , cv2 , os , time , json
import matplotlib.pyplot as plt
from queue import Queue
import numpy as np

import utils.util as util
import moviepy.editor as mp

In [None]:
'''import pstats

# Load the profiling data from the .prof file
stats = pstats.Stats('profile.prof')

# Sort the profiling data by cumulative time
stats.sort_stats('cumulative')

# Print the profiling data to the console
stats.print_stats()'''

In [None]:
''' HELPERS '''
#print(dir(essentia.standard))
#print(help(es.AudioLoader))

In [None]:
'''
https://essentia.upf.edu/reference/streaming_TensorflowPredictFSDSINet.html

batchSize:
    integer ∈ [-1,inf) (default = 64)
    the batch size for prediction. This allows parallelization when GPUs are
    available. Set it to -1 or 0 to accumulate all the patches and run a single
    TensorFlow session at the end of the stream

graphFilename:
    string (default = "")
    the name of the file from which to load the TensorFlow graph

input:
    string (default = "x")
    the name of the input node in the TensorFlow graph

lastPatchMode:
    string ∈ {discard,repeat} (default = "discard")
    what to do with the last frames: `repeat` them to fill the last patch or
    `discard` them

normalize:
    bool ∈ {false,true} (default = true)
    whether to normalize the input audio signal. Note that this parameter is
    only available in standard mode

output:
    string (default = "model/predictions/Sigmoid")
    the name of the node from which to retrieve the output tensors

patchHopSize:
    integer ∈ [0,inf) (default = 50)
    number of frames between the beginnings of adjacent patches. 0 to avoid
    overlap

savedModel:
    string (default = "")
    the name of the TensorFlow SavedModel. Overrides parameter `graphFilename`
    '''

In [None]:
''' Predicts audio events per batch in a background thread while OpenCV displays the video. '''
class FSDSINET_live():
    def __init__(self):
        """Load the FSD-SINet model, its class metadata and the test file list.

        Builds the Essentia ``TensorflowPredictFSDSINet`` predictor from the
        frozen graph, reads the class names from the companion JSON file and
        stores the batching configuration used while playing a video.
        """
        # Loading the model
        self.graph_filename = "/raid/DATASETS/.zuble/vigia/zuwav/fsd-sinet-essentia/models/fsd-sinet-vgg41-tlpf-1.pb"
        self.model = es.TensorflowPredictFSDSINet(graphFilename=self.graph_filename)

        # Read the metadata; the context manager closes the file handle
        # deterministically instead of leaving it to the garbage collector
        self.metadata_file = "/raid/DATASETS/.zuble/vigia/zuwav/fsd-sinet-essentia/models/fsd-sinet-vgg41-tlpf-1.json"
        with open(self.metadata_file, "r") as metadata_fh:
            self.metadata = json.load(metadata_fh)
        self.labels = self.metadata["classes"]

        # Class names considered anomalous
        self.anom_labels = ["Alarm","Boom","Crowd","Dog","Drill","Explosion","Fire","Gunshot and gunfire","Hammer","Screaming","Screech",\
                            "Shatter","Shout","Siren","Slam","Squeak","Yell"]
        # presumably the positions of `anom_labels` inside `self.labels` -- TODO confirm against the JSON
        self.anom_labels_i = [4,18,51,59,65,72,78,92,94,145,146,147,148,152,154,161,198]

        # get file list
        self.mp4_fn, *_ = util.load_xdv_test(util.SERVER_TEST_AUD_MONO_PATH)

        self.test_config = {
            "batch_len_secs":2,       # length of every audio batch, in seconds
            "batch_step_secs": 1,     # hop between the starts of consecutive batches
            "audio_fs_input":22050    # sample rate the audio is converted to before prediction
        }
      
      
    def plot_predictions(self, top_preds, top_labels_w_av,top_labels_with_av):
        """Draw the activations of the selected classes as a matrix plot.

        Args:
            top_preds: 2-D array passed to ``plt.matshow`` (one row per label).
            top_labels_w_av: sequence whose length sets the number of y ticks.
            top_labels_with_av: text used as y tick labels.

        When ``self.save_plot`` is truthy the figure is also written to
        ``activations.png``.
        """
        # Generate plots and improve formatting
        matfig = plt.figure(figsize=(8, 3))
        plt.matshow(top_preds, fignum=matfig.number, aspect="auto")

        plt.yticks(np.arange(len(top_labels_w_av)), top_labels_with_av)
        locs, _ = plt.xticks()
        # x tick labels are the tick locations halved; presumably two
        # prediction columns per second -- TODO confirm against the model hop
        ticks = np.array(locs // 2).astype("int")
        plt.xticks(locs[1: -1], ticks[1: -1])
        plt.tick_params(bottom=True, top=False, labelbottom=True, labeltop=False)
        plt.xlabel("(s)")

        # Save BEFORE showing: once show() returns the figure may already be
        # released by the backend, which would write an empty image.
        if self.save_plot:
            matfig.savefig("activations.png", bbox_inches='tight')
        plt.show()
        
    def process_rslt_all(self,predictions):
        
        def top_from_average(data):
            av = np.mean(data, axis=0)
            sorting = np.argsort(av)[::-1]
            return sorting[:self.nlabels2predict], [av[i] for i in sorting] ,av

        top_labels_i, averages_sorted , averages = top_from_average(predictions)
        top_labels = [self.labels[i] for i in top_labels_i]
        top_labels_with_av = [f"{label} ({av:.3f})" for label, av in zip(top_labels, averages_sorted)]
        top_predictions = np.array([predictions[:,i] for i in top_labels_i])
        
        return top_labels, top_labels_with_av, top_predictions
    
    def process_rslt_anom(self,predictions):
        
        def top_from_anomaly(data):
            av = np.mean(data, axis=0)
            sorting = np.argsort(av)[::-1]
            sorting_anom = [x for x in sorting if x in self.anom_labels_i]
            return sorting_anom[:self.nlabels2predict],[av[i] for i in sorting_anom]

        top_labels_anom_i, averages_anom_sorted = top_from_anomaly(predictions)
        top_labels_anom = [self.labels[i] for i in top_labels_anom_i]
        top_labels_anom_with_av = [f"{label} ({av:.3f})" for label, av in zip(top_labels_anom, averages_anom_sorted)]
        top_predictions_anom = np.array([predictions[:,i] for i in top_labels_anom_i])
        
        return top_labels_anom, top_labels_anom_with_av,top_predictions_anom

    def predicition(self,audio2predict,printt=True,plott=False,save_plot=False):
        
        self.save_plot = save_plot
    
        predictions = self.model(audio2predict)
        print("predictions_shape",np.shape(predictions))

        top_labels, top_labels_with_av ,top_predictions = self.process_rslt_all(predictions)   
        top_labels_anom, top_labels_anom_with_av,top_predictions_anom = self.process_rslt_anom(predictions)     

        if printt:
            #print("\nall", top_labels_with_av)
            print("anom", top_labels_anom_with_av)
            #for label, probability in zip(self.metadata['classes'], predictions.mean(axis=0)):
            #    print(f'{label}: {100 * probability:.1f}%') 
        if plott:
            self.plot_predictions(top_predictions_anom,top_labels_anom_with_av,top_labels_anom_with_av)
                
        return top_labels_with_av,top_labels_anom_with_av


    def prediction_thread(self,batchinfo_queque , as_queque ):
        """Worker loop: predict every requested audio batch in two ways.

        For each ``(start, end)`` pair (seconds) read from ``batchinfo_queque``
        the same audio span is predicted twice: once resampled by moviepy
        (``to_soundarray(fps=...)``) and once read at the source rate and
        resampled with Essentia. Both results are pushed to ``as_queque`` as
        ``(top_mp, top_es)``. The pair ``(-1, -1)`` ends the loop.

        Relies on ``self.path`` and ``self.mp4_fs_aac`` being set before the
        thread is started.
        """
        resampler = es.Resample(inputSampleRate=self.mp4_fs_aac, outputSampleRate=self.test_config["audio_fs_input"])

        audio_mp = audio_es = None
        try:
            # get the full audio of the current mp4 (one clip per resampling path)
            audio_mp = mp.AudioFileClip(filename=self.path,fps=self.mp4_fs_aac)#.fx(mp.afx.audio_normalize)
            audio_es = mp.AudioFileClip(filename=self.path,fps=self.mp4_fs_aac)

            while True:
                start, end = batchinfo_queque.get()

                # checks for close signal
                if start == -1 and end == -1:
                    break

                # MOVIEPY: resample while extracting, then down-mix to mono float32
                audio_batch_mp = audio_mp.subclip(t_start=start,t_end=end)
                audio_batch_array = audio_batch_mp.to_soundarray(fps = self.test_config["audio_fs_input"])
                audio_batch_array_mp = np.mean(audio_batch_array, axis=1).astype(np.float32)

                top_mp = self.predicition(audio_batch_array_mp,printt=False)

                # ESSENTIA: extract at the clip rate, down-mix, then resample
                audio_batch_es = audio_es.subclip(t_start=start,t_end=end)
                audio_batch_array2 = audio_batch_es.to_soundarray()
                audio_batch_array_mono_single2 = np.mean(audio_batch_array2, axis=1).astype(np.float32)
                audio_bacth_array_essentia = resampler(audio_batch_array_mono_single2)

                top_es = self.predicition(audio_bacth_array_essentia,printt=False)

                print("mp",np.shape(audio_batch_array_mp),"\nessentia",np.shape(audio_bacth_array_essentia))
                as_queque.put((top_mp,top_es))
        finally:
            # Close the clips on the close signal AND when a prediction raises,
            # so the readers behind them are never leaked.
            for clip in (audio_mp, audio_es):
                if clip is not None:
                    clip.close()

        return
    
    
    def fdspredict_from_video(self,index,path,nlabels2predict):
        """Play a video with OpenCV while a worker thread predicts audio events.

        Each time playback reaches the end of a batch window
        (``batch_len_secs`` long, advanced by ``batch_step_secs``) the window's
        start/end times are queued for ``prediction_thread``. The most recent
        result received back is drawn over the frames. Keys: ``q`` quits,
        space pauses/resumes.

        Args:
            index: index of the video; shown in the window title and stored on
                the instance.
            path: path of the mp4 file to play and analyse.
            nlabels2predict: number of labels kept by the result processing.

        Returns:
            None. Videos whose frame rate cannot be read are skipped with a
            message instead of failing with a division by zero.
        """
        # Local import: the file only imports `Queue` from `queue`.
        from queue import Empty

        self.nlabels2predict = nlabels2predict

        # --- CV video info ---
        self.path = path; self.index = index
        video = cv2.VideoCapture(path)
        height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        self.fps = int(video.get(cv2.CAP_PROP_FPS))
        if self.fps <= 0:
            video.release()
            print("could not read a valid fps from",path,"- skipping")
            return
        total_time = total_frames/self.fps
        print("total_time",total_time,"self.fps",self.fps)

        batchinfo_queque = Queue()
        as_queque = Queue()
        prediction_thread = None
        try:
            # --- CV window info ---
            frame_time_ms = int(1000/self.fps)
            font = cv2.FONT_HERSHEY_SIMPLEX;fontScale = 0.5;thickness = 1;lineType = cv2.LINE_AA
            strap_video_name = os.path.splitext(os.path.basename(path))[0]
            wn='asVwR @ '+str(index)+strap_video_name
            cv2.namedWindow(wn)

            # --- audio stream sample rate of the mp4 (read by the worker) ---
            self.mp4_fs_aac = util.print_acodec_from_mp4([path],only_sr=True)
            print("mp4_fs_aac",self.mp4_fs_aac)

            # --- prediction worker ---
            worker = threading.Thread(target=self.prediction_thread, args=(batchinfo_queque , as_queque))
            worker.start()
            prediction_thread = worker

            # --- batch bookkeeping, expressed in video frames ---
            batch_len = self.test_config["batch_len_secs"]*int(self.fps)
            batch_step_len = self.test_config["batch_step_secs"]*int(self.fps)
            batch_step_atual = 0

            # placeholders shown until the first prediction arrives
            top_mp3 = top_es3 = [['','',''],['','','']]
            while True:

                ret, frame = video.read()
                if not ret:break
                video_atual_frame = int(video.get(cv2.CAP_PROP_POS_FRAMES))

                if video_atual_frame == batch_len + batch_step_len * batch_step_atual:
                    # start and end of the batch in secs, as used by mp.subclip
                    start = (batch_step_len * batch_step_atual)/self.fps
                    end = video_atual_frame / self.fps
                    print('\n******************************************',\
                            batch_step_atual,start,end)

                    # the queue is unbounded, so this never blocks or raises Full
                    batchinfo_queque.put_nowait((start, end))

                    batch_step_atual += 1

                # Take a new result only if one is ready, so the display never
                # waits for the model.
                try:
                    top_mp3,top_es3 = as_queque.get_nowait()
                except Empty:
                    pass

                # frames / secs
                cv2.putText(frame, '%d' % (video_atual_frame)+'/'+str(int(total_frames)), (5, int(height)-7),font,fontScale,[60,250,250],thickness,lineType)
                cv2.putText(frame, '%.2f' % (video_atual_frame/self.fps)+'s',(5,int(height)-25),font,fontScale, [80,100,250],thickness,lineType)

                # putText does not break lines on '\n', so every label gets its
                # own row: 3 moviepy labels, 3 essentia labels, batch counter.
                overlay_lines = [str(text) for text in top_mp3[1][:3]]
                overlay_lines += [str(text) for text in top_es3[1][:3]]
                overlay_lines.append(str(batch_step_atual))
                for row, text in enumerate(overlay_lines):
                    cv2.putText(frame,text,(10,15 + 15*row),font,fontScale,[0,0,255],thickness,lineType)

                cv2.imshow(wn, frame)

                key = cv2.waitKey(frame_time_ms)
                if key == ord('q'): break  # quit
                if key == ord(' '):  # pause until space is pressed again
                    while True:
                        key = cv2.waitKey(1)
                        if key == ord(' '):break
        finally:
            # Always release the capture and stop the worker, even when the
            # display loop raises; otherwise the thread would block forever.
            video.release()
            cv2.destroyAllWindows()

            if prediction_thread is not None:
                print("signal frame queue to close")
                batchinfo_queque.put_nowait((-1, -1))

                print("closing predict thread")
                prediction_thread.join()

def init_watch_live(watch_this):
    """Interactively play test videos of the given labels with live predictions.

    For every label in ``watch_this`` the user is asked which videos to play:
    empty input plays all videos of that label, ``dr`` only lists them (dry
    run) and a comma separated list (e.g. ``3,4,77``) plays those indexes of
    the test file list.

    Malformed index lists and out-of-range indexes are reported and skipped
    instead of aborting the whole session.

    Args:
        watch_this: iterable of label keys present in the per-label index map.
    """
    print("\n\nINIT WATCH LIVE")

    test_mp4_paths,*_ = util.load_xdv_test(util.SERVER_TEST_AUD_ORIG_PATH)
    print('\n  test_mp4_paths',np.shape(test_mp4_paths))

    test_labels_indexs = util.get_index_per_label_from_filelist(test_mp4_paths)

    fsdsinet = FSDSINET_live()

    print('\n  watching',watch_this)
    for labels_2_watch in watch_this:
        label_indexes = test_labels_indexs[labels_2_watch]
        print('  ',labels_2_watch,' : ',label_indexes)

        all_or_specific = input("\n\nall indxs : enter  |  specific indxs : ex 3,4,77,7  |  dry_run no as window : dr\n\n").strip()

        dry_run = all_or_specific == "dr"
        if all_or_specific == "" or dry_run:
            indexes = list(label_indexes)
        else:
            try:
                # empty tokens (e.g. a trailing comma) are ignored
                indexes = [int(token) for token in all_or_specific.split(",") if token.strip()]
            except ValueError:
                print('  invalid index list',repr(all_or_specific),'- skipping',labels_2_watch)
                continue

        for index in indexes:
            try:
                path = test_mp4_paths[index]
            except IndexError:
                print('  index',index,'is outside the test file list - skipping')
                continue
            print('\n#-------------------#$%--------------------#\n',labels_2_watch,index,path)
            if not dry_run:
                fsdsinet.fdspredict_from_video(index,path,10)

'''
    A  NORMAL  
    B1 FIGHT | B2 SHOOTING | B4 RIOT | B5 ABUSE | B6 CAR ACCIDENT | G  EXPLOSION 
    BG ALL ANOMALIES
'''
# Cell entry point: starts the interactive threaded viewer for the label keys
# given in `watch_this` (the string above lists the available keys).
init_watch_live(watch_this=['B2'])

In [None]:
''' Predicts over the whole audio track first, then per batch (synchronously, without a thread) while OpenCV displays the video. '''
class FSDSINET():
    def __init__(self):
        """Load the FSD-SINet model, its class metadata and the test file list.

        Builds the Essentia ``TensorflowPredictFSDSINet`` predictor from the
        frozen graph, reads the class names from the companion JSON file and
        stores the batching configuration used while playing a video.
        """
        # Loading the model
        self.graph_filename = "/raid/DATASETS/.zuble/vigia/zuwav/fsd-sinet-essentia/models/fsd-sinet-vgg41-tlpf-1.pb"
        self.model = es.TensorflowPredictFSDSINet(graphFilename=self.graph_filename)

        # Read the metadata; the context manager closes the file handle
        # deterministically instead of leaving it to the garbage collector
        self.metadata_file = "/raid/DATASETS/.zuble/vigia/zuwav/fsd-sinet-essentia/models/fsd-sinet-vgg41-tlpf-1.json"
        with open(self.metadata_file, "r") as metadata_fh:
            self.metadata = json.load(metadata_fh)
        self.labels = self.metadata["classes"]

        # Class names considered anomalous
        self.anom_labels = ["Alarm","Boom","Crowd","Dog","Drill","Explosion","Fire","Gunshot and gunfire","Hammer","Screaming","Screech",\
                            "Shatter","Shout","Siren","Slam","Squeak","Yell"]
        # presumably the positions of `anom_labels` inside `self.labels` -- TODO confirm against the JSON
        self.anom_labels_i = [4,18,51,59,65,72,78,92,94,145,146,147,148,152,154,161,198]

        # get file list
        self.mp4_fn, *_ = util.load_xdv_test(util.SERVER_TEST_AUD_MONO_PATH)

        self.test_config = {
            "batch_len_secs":2,       # length of every audio batch, in seconds
            "batch_step_secs": 1,     # hop between the starts of consecutive batches
            "audio_fs_input":22050    # sample rate the audio is converted to before prediction
        }
        
    def plot_predictions(self, top_preds, top_labels_w_av,top_labels_with_av):
        """Draw the activations of the selected classes as a matrix plot.

        Args:
            top_preds: 2-D array passed to ``plt.matshow`` (one row per label).
            top_labels_w_av: sequence whose length sets the number of y ticks.
            top_labels_with_av: text used as y tick labels.

        When ``self.save_plot`` is truthy the figure is also written to
        ``activations.png``.
        """
        # Generate plots and improve formatting
        matfig = plt.figure(figsize=(8, 3))
        plt.matshow(top_preds, fignum=matfig.number, aspect="auto")

        plt.yticks(np.arange(len(top_labels_w_av)), top_labels_with_av)
        locs, _ = plt.xticks()
        # x tick labels are the tick locations halved; presumably two
        # prediction columns per second -- TODO confirm against the model hop
        ticks = np.array(locs // 2).astype("int")
        plt.xticks(locs[1: -1], ticks[1: -1])
        plt.tick_params(bottom=True, top=False, labelbottom=True, labeltop=False)
        plt.xlabel("(s)")

        # Save BEFORE showing: once show() returns the figure may already be
        # released by the backend, which would write an empty image.
        if self.save_plot:
            matfig.savefig("activations.png", bbox_inches='tight')
        plt.show()
        
    def process_rslt_all(self,predictions):
        
        def top_from_average(data):
            av = np.mean(data, axis=0)
            sorting = np.argsort(av)[::-1]
            return sorting[:self.nlabels2predict], [av[i] for i in sorting] ,av

        top_labels_i, averages_sorted , averages = top_from_average(predictions)
        top_labels = [self.labels[i] for i in top_labels_i]
        top_labels_with_av = [f"{label} ({av:.3f})" for label, av in zip(top_labels, averages_sorted)]
        top_predictions = np.array([predictions[:,i] for i in top_labels_i])
        
        return top_labels, top_labels_with_av, top_predictions
    
    def process_rslt_anom(self,predictions):
        
        def top_from_anomaly(data):
            av = np.mean(data, axis=0)
            sorting = np.argsort(av)[::-1]
            sorting_anom = [x for x in sorting if x in self.anom_labels_i]
            return sorting_anom[:self.nlabels2predict],[av[i] for i in sorting_anom]

        top_labels_anom_i, averages_anom_sorted = top_from_anomaly(predictions)
        top_labels_anom = [self.labels[i] for i in top_labels_anom_i]
        top_labels_anom_with_av = [f"{label} ({av:.3f})" for label, av in zip(top_labels_anom, averages_anom_sorted)]
        top_predictions_anom = np.array([predictions[:,i] for i in top_labels_anom_i])
        
        return top_labels_anom, top_labels_anom_with_av,top_predictions_anom

    def predicition(self,audio2predict,printt=True,plott=False,save_plot=False):
        
        self.save_plot = save_plot
    
        predictions = self.model(audio2predict)
        print("predictions_shape",np.shape(predictions))

        top_labels, top_labels_with_av ,top_predictions = self.process_rslt_all(predictions)   
        top_labels_anom, top_labels_anom_with_av,top_predictions_anom = self.process_rslt_anom(predictions)     

        if printt:
            #print("\nall", top_labels_with_av)
            print("anom", top_labels_anom_with_av)
            #for label, probability in zip(self.metadata['classes'], predictions.mean(axis=0)):
            #    print(f'{label}: {100 * probability:.1f}%') 
        if plott:
            self.plot_predictions(top_predictions_anom,top_labels_anom_with_av,top_labels_anom_with_av)
                
        return top_labels_with_av,top_labels_anom_with_av

       
    def fdspredict_from_video(self,index,path,nlabels2predict):
        """Predict over a video's audio, then replay it with per-batch predictions.

        First the whole audio track is predicted and plotted twice (resampled
        by moviepy, and resampled by Essentia), timing each path. Then the
        video is played in an OpenCV window and, each time playback reaches
        the end of a batch window, that span is predicted synchronously with
        both paths and the first three anomalous labels of each are drawn on
        the frames. Keys: ``q`` quits, space pauses/resumes.

        Args:
            index: index of the video; shown in the window title and stored on
                the instance.
            path: path of the mp4 file to play and analyse.
            nlabels2predict: number of labels kept by the result processing.

        Returns:
            None. Videos whose frame rate cannot be read are skipped with a
            message instead of failing with a division by zero.
        """
        self.nlabels2predict = nlabels2predict

        # --- CV video info ---
        self.path = path; self.index = index
        video = cv2.VideoCapture(path)
        height = video.get(cv2.CAP_PROP_FRAME_HEIGHT)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        self.fps = int(video.get(cv2.CAP_PROP_FPS))
        if self.fps <= 0:
            video.release()
            print("could not read a valid fps from",path,"- skipping")
            return
        total_time = total_frames/self.fps
        print("total_time",total_time)

        audio_mp = audio_es = None
        try:
            # --- CV window info ---
            frame_time_ms = int(1000/self.fps)
            font = cv2.FONT_HERSHEY_SIMPLEX;fontScale = 0.5;thickness = 1;lineType = cv2.LINE_AA
            strap_video_name = os.path.splitext(os.path.basename(path))[0]
            wn='asVwR @ '+str(index)+strap_video_name
            cv2.namedWindow(wn)

            # --- FS converter: from the mp4 audio stream rate to the model input rate ---
            self.mp4_fs_aac = util.print_acodec_from_mp4([path],only_sr=True)
            print("mp4_fs_aac",self.mp4_fs_aac)
            resampler = es.Resample(inputSampleRate=self.mp4_fs_aac, outputSampleRate=self.test_config["audio_fs_input"])

            # --- predictions over the total audio array ---

            # MOVIEPY: resample while extracting
            tti = time.time()
            audio_mp = mp.AudioFileClip(filename=path,fps=self.mp4_fs_aac)#.fx(mp.afx.audio_normalize)
            print("audio",audio_mp,"fps",audio_mp.fps,"nch",audio_mp.nchannels)
            audio_total_array = audio_mp.to_soundarray(fps=self.test_config["audio_fs_input"])
            audio_total_array_mono_single = np.mean(audio_total_array, axis=1).astype(np.float32)

            self.predicition(audio_total_array_mono_single,printt=False,plott=True)
            ttf = time.time()
            print("total_MOVIEPY_time",ttf-tti)

            # ESSENTIA: extract at the clip rate, down-mix, then resample
            tti2 = time.time()
            audio_es = mp.AudioFileClip(filename=path,fps=self.mp4_fs_aac)
            audio2_array = audio_es.to_soundarray()
            audio2_array_mono_single = np.mean(audio2_array, axis=1).astype(np.float32)
            audio2__array_essentia = resampler(audio2_array_mono_single)

            self.predicition(audio2__array_essentia,printt=False,plott=True)
            ttf2 = time.time()
            print("total_ESSENTIA_time",ttf2-tti2)

            # --- batch bookkeeping, expressed in video frames ---
            batch_len = self.test_config["batch_len_secs"]*int(self.fps)
            batch_step_len = self.test_config["batch_step_secs"]*int(self.fps)
            batch_step_atual = 0

            # labels shown on screen; empty until the first batch is predicted
            labels_mp = ['','','']
            labels_es = ['','','']
            while True:

                ret, frame = video.read()
                if not ret:break
                video_atual_frame = int(video.get(cv2.CAP_PROP_POS_FRAMES))

                if video_atual_frame == batch_len + batch_step_len * batch_step_atual:
                    # start and end of the batch in secs, as used by mp.subclip
                    start = (batch_step_len * batch_step_atual)/self.fps
                    end = video_atual_frame / self.fps
                    print('\n******************************************',\
                        batch_step_atual,start,end)

                    # --- PREDICT batch without thread ---

                    # MOVIEPY
                    audio_batch_mp = audio_mp.subclip(t_start=start,t_end=end)
                    audio_batch_array = audio_batch_mp.to_soundarray(fps = self.test_config["audio_fs_input"])
                    audio_batch_array_mp = np.mean(audio_batch_array, axis=1).astype(np.float32)

                    print("\nPREDICT")
                    print("mp_batch")
                    # `predicition` takes no label-count argument; the count
                    # comes from self.nlabels2predict set above.
                    top_anom_mp = self.predicition(audio_batch_array_mp,printt=True)
                    # element 1 of the result holds the anomalous labels
                    labels_mp = [str(text) for text in top_anom_mp[1][:3]]

                    # ESSENTIA
                    audio_batch_es = audio_es.subclip(t_start=start,t_end=end)
                    audio_batch_array2 = audio_batch_es.to_soundarray()
                    audio_batch_array_mono_single2 = np.mean(audio_batch_array2, axis=1).astype(np.float32)
                    audio_bacth_array_essentia = resampler(audio_batch_array_mono_single2)

                    print("essentia_batch")
                    top_anom_es = self.predicition(audio_bacth_array_essentia,printt=True)
                    labels_es = [str(text) for text in top_anom_es[1][:3]]

                    print("mp",np.shape(audio_batch_array_mp),"\nessentia",np.shape(audio_bacth_array_essentia))

                    batch_step_atual += 1

                # frames / secs
                cv2.putText(frame, '%d' % (video_atual_frame)+'/'+str(int(total_frames)), (5, int(height)-7),font,fontScale,[60,250,250],thickness,lineType)
                cv2.putText(frame, '%.2f' % (video_atual_frame/self.fps)+'s',(5,int(height)-25),font,fontScale, [80,100,250],thickness,lineType)

                # putText does not break lines on '\n', so every label gets its
                # own row: moviepy labels, essentia labels, batch counter.
                overlay_lines = labels_mp + labels_es + [str(batch_step_atual)]
                for row, text in enumerate(overlay_lines):
                    cv2.putText(frame,text,(10,15 + 15*row),font,fontScale,[0,0,255],thickness,lineType)

                cv2.imshow(wn, frame)

                key = cv2.waitKey(frame_time_ms)
                if key == ord('q'): break  # quit
                if key == ord(' '):  # pause until space is pressed again
                    while True:
                        key = cv2.waitKey(1)
                        if key == ord(' '):break
        finally:
            # Release everything even when a prediction or the display raises.
            for clip in (audio_mp, audio_es):
                if clip is not None:
                    clip.close()
            video.release()
            cv2.destroyAllWindows()


def init_watch_live2(watch_this):
    """Interactively play test videos of the given labels with synchronous predictions.

    For every label in ``watch_this`` the user is asked which videos to play:
    empty input plays all videos of that label, ``dr`` only lists them (dry
    run) and a comma separated list (e.g. ``3,4,77``) plays those indexes of
    the test file list.

    Malformed index lists and out-of-range indexes are reported and skipped
    instead of aborting the whole session.

    Args:
        watch_this: iterable of label keys present in the per-label index map.
    """
    print("\n\nINIT WATCH LIVE")

    test_mp4_paths,*_ = util.load_xdv_test(util.SERVER_TEST_AUD_ORIG_PATH)
    print('\n  test_mp4_paths',np.shape(test_mp4_paths))

    test_labels_indexs = util.get_index_per_label_from_filelist(test_mp4_paths)

    fsdsinet = FSDSINET()

    print('\n  watching',watch_this)
    for labels_2_watch in watch_this:
        label_indexes = test_labels_indexs[labels_2_watch]
        print('  ',labels_2_watch,' : ',label_indexes)

        all_or_specific = input("\n\nall indxs : enter  |  specific indxs : ex 3,4,77,7  |  dry_run no as window : dr\n\n").strip()

        dry_run = all_or_specific == "dr"
        if all_or_specific == "" or dry_run:
            indexes = list(label_indexes)
        else:
            try:
                # empty tokens (e.g. a trailing comma) are ignored
                indexes = [int(token) for token in all_or_specific.split(",") if token.strip()]
            except ValueError:
                print('  invalid index list',repr(all_or_specific),'- skipping',labels_2_watch)
                continue

        for index in indexes:
            try:
                path = test_mp4_paths[index]
            except IndexError:
                print('  index',index,'is outside the test file list - skipping')
                continue
            print('\n#-------------------#$%--------------------#\n',labels_2_watch,index,path)
            if not dry_run:
                fsdsinet.fdspredict_from_video(index,path,10)

'''
    A  NORMAL  
    B1 FIGHT | B2 SHOOTING | B4 RIOT | B5 ABUSE | B6 CAR ACCIDENT | G  EXPLOSION 
    BG ALL ANOMALIES
'''
# Cell entry point: starts the interactive viewer for the label keys given in
# `watch_this` (the string above lists the available keys).
init_watch_live2(watch_this=['B2'])

In [None]:
''' predicts over entire audio array'''

class FSDSINET2():
    def __init__(self):
        """Load the FSD-SINet model, its class metadata and the test file list."""
        # Loading the model
        self.graph_filename = "/raid/DATASETS/.zuble/vigia/zuwav/fsd-sinet-essentia/models/fsd-sinet-vgg41-tlpf-1.pb"
        self.model = es.TensorflowPredictFSDSINet(graphFilename=self.graph_filename)

        # Read the metadata; the context manager closes the file handle
        # deterministically instead of leaving it to the garbage collector
        self.metadata_file = "/raid/DATASETS/.zuble/vigia/zuwav/fsd-sinet-essentia/models/fsd-sinet-vgg41-tlpf-1.json"
        with open(self.metadata_file, "r") as metadata_fh:
            self.metadata = json.load(metadata_fh)
        self.labels = self.metadata["classes"]

        # Class names considered anomalous
        self.anom_labels = ["Alarm","Boom","Crowd","Dog","Drill","Explosion","Fire","Gunshot and gunfire","Hammer","Screaming","Screech",\
                            "Shatter","Shout","Siren","Slam","Squeak","Yell"]
        # presumably the positions of `anom_labels` inside `self.labels` -- TODO confirm against the JSON
        self.anom_labels_i = [4,18,51,59,65,72,78,92,94,145,146,147,148,152,154,161,198]

        # get file list
        self.mp4_fn, *_ = util.load_xdv_test(util.SERVER_TEST_AUD_MONO_PATH)

    def plot_predictions(self, top_preds, top_labels_w_av,top_labels_with_av):
        """Render the given activations as a matrix plot, optionally saving it.

        Args:
            top_preds: 2-D array passed to ``plt.matshow``.
            top_labels_w_av: sequence whose length sets the number of y ticks.
            top_labels_with_av: text used as y tick labels.

        The figure is written to ``activations.png`` only when
        ``self.save_plot`` is truthy.
        """
        figure = plt.figure(figsize=(8, 3))
        plt.matshow(top_preds, fignum=figure.number, aspect="auto")

        # one labelled row per entry
        row_positions = np.arange(len(top_labels_w_av))
        plt.yticks(row_positions, top_labels_with_av)

        # relabel the x ticks with half their location, dropping the two outer ticks
        tick_locations = plt.xticks()[0]
        tick_text = np.array(tick_locations // 2).astype("int")
        plt.xticks(tick_locations[1: -1], tick_text[1: -1])
        plt.tick_params(bottom=True, top=False, labelbottom=True, labeltop=False)
        plt.xlabel("(s)")

        if not self.save_plot:
            return
        plt.savefig("activations.png", bbox_inches='tight')
        
    def process_rslt_all(self,predictions):
        
        def top_from_average(data, top_n):
            av = np.mean(data, axis=0)
            sorting = np.argsort(av)[::-1]
            return sorting[:top_n], [av[i] for i in sorting] ,av

        top_labels_i, averages_sorted , averages = top_from_average(predictions,self.nlabels2predict)
        top_labels = [self.labels[i] for i in top_labels_i]
        top_labels_with_av = [f"{label} ({av:.3f})" for label, av in zip(top_labels, averages_sorted)]
        top_predictions = np.array([predictions[:,i] for i in top_labels_i])
        if self.plott : self.plot_predictions(top_predictions, top_labels_with_av,top_labels_with_av)

        return top_labels, top_labels_with_av
    
    def process_rslt_anom(self,predictions):
        
        def top_from_anomaly(data):
            av = np.mean(data, axis=0)
            sorting = np.argsort(av)[::-1]
            sorting_anom = [x for x in sorting if x in self.anom_labels_i]
            return sorting_anom,[av[i] for i in sorting_anom]

        top_labels_anom_i, averages_anom_sorted = top_from_anomaly(predictions)
        top_labels_anom = [self.labels[i] for i in top_labels_anom_i]
        top_labels_anom_with_av = [f"{label} ({av:.3f})" for label, av in zip(top_labels_anom, averages_anom_sorted)]
        top_predictions_anom = np.array([predictions[:,i] for i in top_labels_anom_i])
        if self.plott : self.plot_predictions(top_predictions_anom, top_labels_anom_with_av,top_labels_anom_with_av)
        
        return top_labels_anom, top_labels_anom_with_av

    def predicition_complete(self,index,nlabels2predict,printt=True,plott=False,save_plot=False):
        """Predict audio events over the whole audio track of one test file.

        Args:
            index: position of the file inside ``self.mp4_fn``.
            nlabels2predict: number of top labels kept over all classes.
            printt: print the formatted label lists when true.
            plott: plot the activations in the result processing when true.
            save_plot: also save the plots to disk when true.

        Returns:
            None; results are printed and/or plotted.
        """
        self.plott = plott; self.save_plot = save_plot
        self.nlabels2predict = nlabels2predict

        path = self.mp4_fn[index]
        print("PATH",path)

        # Extract the audio from the video; close the clip as soon as the
        # samples are in memory so its reader is not leaked across calls.
        audio = mp.AudioFileClip(filename=path)#.fx(mp.afx.audio_normalize)
        try:
            audio_total_array = audio.to_soundarray(fps=22050)
        finally:
            audio.close()

        # down-mix the channels to mono and convert to float32 for the model
        audio2predict = np.mean(audio_total_array, axis=1).astype(np.float32)

        predictions = self.model(audio2predict)
        print("predictions_shape",np.shape(predictions))

        top_labels, top_labels_with_av = self.process_rslt_all(predictions)
        top_labels_anom, top_labels_anom_with_av = self.process_rslt_anom(predictions)

        if printt:
            print("\nall", top_labels_with_av)
            print("\nanom", top_labels_anom_with_av)

def init_watch_live2(watch_this):
    """Interactively run whole-file predictions for test videos of the given labels.

    For every label in ``watch_this`` the user is asked which files to
    process: empty input processes all files of that label, ``dr`` only lists
    them (dry run) and a comma separated list (e.g. ``3,4,77``) processes
    those indexes of the test file list.

    Malformed index lists and out-of-range indexes are reported and skipped
    instead of aborting the whole session.

    Note: this definition reuses the name of the ``init_watch_live2`` defined
    for the ``FSDSINET`` class earlier in the file and replaces it once run.

    Args:
        watch_this: iterable of label keys present in the per-label index map.
    """
    print("\n\nINIT WATCH LIVE")

    test_mp4_paths,*_ = util.load_xdv_test(util.SERVER_TEST_AUD_ORIG_PATH)
    print('\n  test_mp4_paths',np.shape(test_mp4_paths))

    test_labels_indexs = util.get_index_per_label_from_filelist(test_mp4_paths)

    fsdsinet = FSDSINET2()

    print('\n  watching',watch_this)
    for labels_2_watch in watch_this:
        label_indexes = test_labels_indexs[labels_2_watch]
        print('  ',labels_2_watch,' : ',label_indexes)

        all_or_specific = input("\n\nall indxs : enter  |  specific indxs : ex 3,4,77,7  |  dry_run no as window : dr\n\n").strip()

        dry_run = all_or_specific == "dr"
        if all_or_specific == "" or dry_run:
            indexes = list(label_indexes)
        else:
            try:
                # empty tokens (e.g. a trailing comma) are ignored
                indexes = [int(token) for token in all_or_specific.split(",") if token.strip()]
            except ValueError:
                print('  invalid index list',repr(all_or_specific),'- skipping',labels_2_watch)
                continue

        for index in indexes:
            try:
                path = test_mp4_paths[index]
            except IndexError:
                print('  index',index,'is outside the test file list - skipping')
                continue
            print('\n#-------------------#$%--------------------#\n',labels_2_watch,index,path)
            if not dry_run:
                # NOTE(review): the path printed above comes from the ORIG file
                # list while the prediction indexes the predictor's own MONO
                # list -- presumably both lists are index-aligned; confirm.
                fsdsinet.predicition_complete(index,15,plott=True)


'''
    A  NORMAL  
    B1 FIGHT | B2 SHOOTING | B4 RIOT | B5 ABUSE | B6 CAR ACCIDENT | G  EXPLOSION 
    BG ALL ANOMALIES
'''

# Cell entry point: runs the interactive whole-file prediction for the label
# keys given in `watch_this` (the string above lists the available keys).
init_watch_live2(watch_this=['G'])