In [None]:
import os
import torch
import glob
import json
from tqdm import tqdm
import pandas as pd
import math

In [None]:
import cv2

def get_video_fps(video_path):
    """Return the frame rate reported by OpenCV for the video at ``video_path``.

    Prints a message and returns ``None`` when the file cannot be opened.
    """
    capture = cv2.VideoCapture(video_path)
    if capture.isOpened():
        frame_rate = capture.get(cv2.CAP_PROP_FPS)
        capture.release()
        return frame_rate

    print("Không thể mở file video.")
    return None

In [None]:
# Index the video files of every data part that has ASR output:
#   all_video_paths[data_part][video_id] -> full path of the .mp4 file
#
# Part names are read from the recognized-ASR dataset and the videos from the
# video dataset. The previous version listed part names from /kaggle/working/asr
# and then looked for the videos under that same directory, which holds no
# videos, so the listing could not succeed on a fresh kernel.
asr_dir = "/kaggle/input/aic24-asr-v1/audio_recognized"
video_dir = "/kaggle/input/video-v2-aic2024"

all_video_paths = dict()
for data_part in sorted(os.listdir(asr_dir)):
    data_part_path = f'{video_dir}/Videos_{data_part}_a/video'
    all_video_paths[data_part] = dict()
    for video_path in sorted(os.listdir(data_part_path)):
        # The video id is the last "_"-separated token of the file name.
        video_id = video_path.replace('.mp4', '').split('_')[-1]
        all_video_paths[data_part][video_id] = f'{data_part_path}/{video_path}'

In [None]:
# Sanity check: show the data-part keys that were discovered.
all_video_paths.keys()

In [None]:
# Index the keyframe images:
#   all_keyframe_paths[data_part][frame_id] -> sorted list of .jpg paths
keyframes_dir = '/kaggle/input/keyframes-v2-aic2024'
all_keyframe_paths = dict()
for part in sorted(os.listdir(keyframes_dir)):
    # The data part is the last "_"-separated token of the directory name.
    data_part = part.split('_')[-1]
    all_keyframe_paths[data_part] = dict()
    data_part_path = f'{keyframes_dir}/Keyframes_{data_part}/keyframes'
    for frame_dir in sorted(os.listdir(data_part_path)):
        frame_id = frame_dir.split('_')[-1]
        jpg_pattern = f'{data_part_path}/{frame_dir}/*.jpg'
        all_keyframe_paths[data_part][frame_id] = sorted(glob.glob(jpg_pattern))

In [None]:
# Build the ASR-segment -> keyframe lookup.
#
# For every video, each detected audio segment is converted from seconds to a
# frame range using the video's FPS; every keyframe listed in the map-keyframes
# CSV whose `frame_idx` falls inside that range (inclusive) yields one entry:
#   map_[str(i)]   = {'image_path': ..., 'frame_idx': ...}
#   context_all[i] = transcript of the segment
# so `map_` and `context_all` stay index-aligned.
i = 0  # running global index over all (segment, keyframe) pairs
map_ = dict()
context_all = []
for key in tqdm(all_video_paths.keys()):
    for video_id in sorted(all_video_paths[key].keys()):
        video_file = f'/kaggle/input/video-v2-aic2024/Videos_{key}_a/video/{key}_{video_id}.mp4'
        audio_detect_file = f'/kaggle/input/aic24-asr-v1/audio_extracted/{key}/{video_id}.json'
        audio_recog_file = f'/kaggle/input/aic24-asr-v1/audio_recognized/{key}/{video_id}.json'
        map_keyframe_file = f'/kaggle/input/map-keyframes-v2/map-keyframes/{key}_{video_id}.csv'

        fps = get_video_fps(video_file)
        if not fps:
            # get_video_fps returns None when the file cannot be opened, and a
            # zero FPS would collapse every segment onto frame 0. Skip the
            # video instead of failing on `math.floor(x) * fps`.
            print(f'Skipping {key}_{video_id}: no usable FPS for {video_file}')
            continue

        with open(audio_detect_file, encoding='utf-8') as f:
            frames = json.load(f)
        with open(audio_recog_file, encoding='utf-8') as c:
            contexts = json.load(c)

        csvFile = pd.read_csv(map_keyframe_file)
        for frame, context in zip(frames, contexts):
            # `frame` presumably holds [start_sec, end_sec] -- TODO confirm.
            # Both bounds are floored to whole seconds before scaling by FPS.
            range_frame = [math.floor(x) * fps for x in frame]
            in_segment = csvFile['frame_idx'].between(range_frame[0], range_frame[1])
            keyframes = csvFile.loc[in_segment, ['n', 'frame_idx']]
            # .tolist() yields plain Python numbers, which keeps map_ JSON-serialisable.
            for num, frame_idx in zip(keyframes['n'].tolist(), keyframes['frame_idx'].tolist()):
                frame_path = f'/Keyframes/Keyframes_{key}/keyframes/{key}_{video_id}/{num:03d}.jpg'
                map_[str(i)] = {
                    'image_path': frame_path,
                    'frame_idx': frame_idx
                }
                context_all.append(context)
                i += 1

In [None]:
# Preview the first 10 map entries.
# Dedicated loop names are used so the global running index `i` (and `key`)
# from the map-building cell are not overwritten by this preview.
for preview_pos, (entry_id, entry) in enumerate(map_.items()):
    if preview_pos >= 10:
        break
    print(f"{entry_id}: {entry}")

In [None]:
# Total number of (segment, keyframe) entries in the map.
len(map_)

In [None]:
# One transcript is appended per map entry, so this should equal len(map_).
len(context_all)

In [None]:
# Serialise the map to a JSON file in the current working directory.
serialized_map = json.dumps(map_, indent=4)
with open('map-asr-v2.json', 'w') as json_file:
    json_file.write(serialized_map)

print("Created")

In [None]:
# Spot-check a single entry (keys are stringified running indices).
map_['1']

# Copy the recognized ASR transcripts into the working directory

In [None]:
# -p keeps the cell re-runnable; the absolute path matches the copy target used below.
!mkdir -p /kaggle/working/asr

In [None]:
# Copy every per-part directory of recognized transcripts into /kaggle/working/asr.
!cp -r /kaggle/input/aic24-asr-v1/audio_recognized/* /kaggle/working/asr

# Merge the ASR map files

In [None]:
import json

In [None]:
# Read the two index -> keyframe maps that are going to be concatenated.
with open('/kaggle/input/map-asr/map-asr.json', 'r') as f1, \
        open('/kaggle/input/map-asr/map-asr-v2.json', 'r') as f2:
    data1, data2 = json.load(f1), json.load(f2)

In [None]:
# Start the result as a shallow copy of the first map (keys and order preserved).
merged_data = dict(data1)

# Entries of the second map are renumbered so they continue right after the
# entries already present; their original keys are discarded.
next_index = len(merged_data)
for value in data2.values():
    merged_data[str(next_index)] = value
    next_index += 1



In [None]:
# Entry counts of the two input maps.
len(data1), len(data2)

In [None]:
# Equals len(data1) + len(data2) as long as no renumbered key collides with a key of data1.
len(merged_data)

In [None]:
# Write the merged map to a new file.
merged_json = json.dumps(merged_data, indent=4)
with open('map_asr.json', 'w') as f_merged:
    f_merged.write(merged_json)

# TF-IDF embedding of the ASR transcripts

In [None]:
import os
import sys
import glob
import scipy
import pickle
import numpy as np
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz, load_npz
from scipy import sparse as sp

In [None]:
def preprocess_text(text:str):
    """Strip everything except letters, digits, underscores and whitespace.

    The text is first normalised to NFC so that accented Vietnamese letters are
    single code points, then every character that is neither a Unicode word
    character nor whitespace is removed, and finally the result is trimmed.

    Args:
        text: raw transcript text.

    Returns:
        The cleaned text. Case is left unchanged.
    """
    # Local import keeps this cell self-contained.
    import unicodedata

    normalized = unicodedata.normalize('NFC', text)
    # `\w` is Unicode-aware for str patterns, so it covers the full Vietnamese
    # alphabet. The previous hand-written character class omitted some letters
    # (e.g. 'ý'), and the function returned the untouched input instead of the
    # cleaned string.
    output = re.sub(r'[^\w\s]', '', normalized)
    return output.strip()

In [None]:
def load_context(clean_data_paths, input_datatype = 'txt'):
    """Collect raw transcript strings from every directory matching a glob pattern.

    NOTE: a later cell of this notebook defines ``load_context`` again (with text
    preprocessing); once that cell has run, its definition shadows this one.

    Args:
        clean_data_paths: glob pattern selecting the per-part directories.
        input_datatype: ``'txt'`` reads each ``*.txt`` file line by line
            (lines are stripped); ``'json'`` reads each ``*.json`` file as a
            list whose items are joined with ``''.join``.

    Returns:
        A flat list of strings, ordered by sorted directory and then by the
        3-digit number that ends each file name.

    Raises:
        ValueError: if ``input_datatype`` is neither ``'txt'`` nor ``'json'``.
    """
    if input_datatype not in ('txt', 'json'):
        # Raise instead of calling sys.exit(): a helper must not terminate the session.
        raise ValueError(f'not support reading the {input_datatype}')

    context = []
    for cxx_data_path in sorted(glob.glob(clean_data_paths)):
        if input_datatype == 'txt':
            paths = glob.glob(cxx_data_path + '/*.txt')
            # File names end in "<3 digits>.txt"; order numerically on those digits.
            paths.sort(key=lambda s: int(s[-7:-4]))
            for path in paths:
                with open(path, 'r', encoding='utf-8') as f:
                    context += [item.strip() for item in f.readlines()]
        else:
            paths = glob.glob(cxx_data_path + '/*.json')
            # File names end in "<3 digits>.json"; order numerically on those digits.
            paths.sort(key=lambda x: int(x[-8:-5]))
            for path in paths:
                # Explicit UTF-8 so Vietnamese text does not depend on the locale.
                with open(path, encoding='utf-8') as f:
                    context += [''.join(line) for line in json.load(f)]
    return context

def TfIdfTransform(data_path, save_tfids_object_path,  update=False , all_datatype=None, input_datatype='txt'):
    """Fit (or extend) one TF-IDF model per datatype and save model and matrix.

    Args:
        data_path: mapping ``datatype -> glob pattern`` handed to ``load_context``.
        save_tfids_object_path: directory the artifacts are read from and written to.
        update: when False a new ``TfidfVectorizer`` is fitted on the loaded
            context. When True the previously saved vectorizer and matrix are
            loaded, the context is transformed with the existing vocabulary and
            the new rows are stacked below the old matrix.
        all_datatype: iterable of datatypes to process; defaults to every key
            of ``data_path``.
        input_datatype: file format forwarded to ``load_context``
            (``'txt'`` or ``'json'``).

    Returns:
        ``(tfidf_transform, context_matrix)``: two dicts keyed by datatype.

    Outputs are written with a ``_test`` suffix, whereas update mode reads the
    un-suffixed files, so existing artifacts are never overwritten.
    """
    # PROJECT_ROOT is not defined in the visible part of this notebook; fall
    # back to '' so that paths are used as given when it is absent.
    project_root = globals().get('PROJECT_ROOT', '')
    if all_datatype is None:
        all_datatype = list(data_path)

    tfidf_transform = {}
    context_matrix = {}
    ngram_range = (1, 1)
    for datatype in all_datatype:
        print(f'processing {datatype}')
        data_type_path = os.path.join(project_root, data_path[datatype])
        print(f'load {datatype} context data from {data_type_path}')
        context = load_context(data_type_path, input_datatype)
        if update:
            print(f'load {datatype} tfidf object and matrix')
            tfidf_transform_path = os.path.join(project_root, save_tfids_object_path, f'tfidf_transform_{datatype}.pkl')
            context_matrix_path = os.path.join(project_root, save_tfids_object_path, f'sparse_context_matrix_{datatype}.npz')

            # SECURITY: pickle.load executes arbitrary code; only load files you produced yourself.
            with open(tfidf_transform_path, 'rb') as f:
                old_tfidf_transformer = pickle.load(f)
            old_tfidf_matrix = load_npz(context_matrix_path)

            print(f'update {datatype} tfidf object and matrix')
            new_tfidf_matrix = old_tfidf_transformer.transform(context)
            context_matrix[datatype] = scipy.sparse.vstack([old_tfidf_matrix, new_tfidf_matrix])
            # Keep the loaded vectorizer so it can be saved below; it was
            # previously never stored, so saving raised KeyError in update mode.
            tfidf_transform[datatype] = old_tfidf_transformer

        else:
            print(f'create {datatype} tfidf object and matrix')
            tfidf_transform[datatype] = TfidfVectorizer(input = 'content', ngram_range = ngram_range, token_pattern=r"(?u)\b[\w\d]+\b")
            context_matrix[datatype] = tfidf_transform[datatype].fit_transform(context)

        tfidf_transform_path = os.path.join(project_root, save_tfids_object_path, f'tfidf_transform_{datatype}_test.pkl')
        context_matrix_path = os.path.join(project_root, save_tfids_object_path, f'sparse_context_matrix_{datatype}_test.npz')

        # Both artifacts live in the same directory, so one makedirs is enough.
        os.makedirs(os.path.dirname(tfidf_transform_path), exist_ok=True)
        print(f'save tfidf object to : {tfidf_transform_path}')
        with open(tfidf_transform_path, 'wb') as f:
            pickle.dump(tfidf_transform[datatype], f)

        save_npz(context_matrix_path, context_matrix[datatype])

    return tfidf_transform, context_matrix



In [None]:
def load_context(clean_data_paths, input_datatype='txt'):
    """Collect transcript strings from every directory matching a glob pattern.

    Args:
        clean_data_paths: glob pattern selecting the per-part directories.
        input_datatype: ``'txt'`` reads each ``*.txt`` file line by line
            (lines are stripped); ``'json'`` reads each ``*.json`` file as a
            list whose items are joined with ``''.join`` and passed through
            ``preprocess_text``. Defaults to ``'txt'`` so that one-argument
            calls keep working.

    Returns:
        A flat list of strings, ordered by sorted directory and then by the
        3-digit number that ends each file name.

    Raises:
        ValueError: if ``input_datatype`` is neither ``'txt'`` nor ``'json'``.
    """
    if input_datatype not in ('txt', 'json'):
        # Raise instead of calling sys.exit(): a helper must not terminate the session.
        raise ValueError(f'not support reading the {input_datatype}')

    context = []
    for part_dir in sorted(glob.glob(clean_data_paths)):
        if input_datatype == 'txt':
            paths = glob.glob(part_dir + '/*.txt')
            # File names end in "<3 digits>.txt"; order numerically on those digits.
            paths.sort(key=lambda s: int(s[-7:-4]))
            for path in paths:
                with open(path, 'r', encoding='utf-8') as f:
                    context += [item.strip() for item in f.readlines()]
        else:
            paths = glob.glob(part_dir + '/*.json')
            # File names end in "<3 digits>.json"; order numerically on those digits.
            paths.sort(key=lambda x: int(x[-8:-5]))
            for path in paths:
                # Explicit UTF-8 so Vietnamese text does not depend on the locale.
                with open(path, encoding='utf-8') as f:
                    context += [preprocess_text(''.join(line)) for line in json.load(f)]
    return context

In [None]:
# Configuration for the TF-IDF stage.
clean_data_path = {'asr': '/kaggle/working/asr/*'}  # glob pattern per datatype
save_tfids_object_path = '/kaggle/working/'         # output directory for the artifacts
update = False
context_data = None
ngram_range = (1, 1)                                # n-gram range given to TfidfVectorizer
input_datatype = 'json'                             # file format read by load_context

In [None]:
# Load every ASR transcript segment (one string per JSON entry) from all parts.
# NOTE(review): `context` holds one item per ASR segment, whereas `map_` and
# `context_all` built earlier hold one item per (segment, keyframe) pair --
# confirm which of the two the TF-IDF row indices are meant to line up with.
clean_data_paths = clean_data_path['asr']
context = load_context(clean_data_paths, input_datatype)

In [None]:
# Fit the TF-IDF vocabulary on the whole corpus and keep the document-term
# matrix in CSR format.
s = context

tf = TfidfVectorizer(
    input='content',
    ngram_range=ngram_range,
    token_pattern=r"(?u)\b[\w\d]+\b",
)
cm = tf.fit_transform(s).tocsr()

In [None]:
# Vocabulary size next to the number of distinct terms; the two should match.
len(tf.get_feature_names_out()),len(np.unique(tf.get_feature_names_out()))

In [None]:
# Persist the fitted vectorizer (pickle) and the sparse document-term matrix (npz).
tfidf_object_path = os.path.join(save_tfids_object_path, 'tfidf_transform_asr.pkl')
context_matrix_path = os.path.join(save_tfids_object_path, 'sparse_context_matrix_asr.npz')

with open(tfidf_object_path, 'wb') as f:
    pickle.dump(tf, f)
scipy.sparse.save_npz(context_matrix_path, cm)