In [1]:
!pip install elasticsearch==7.0.1

Collecting elasticsearch==7.0.1
  Downloading elasticsearch-7.0.1-py2.py3-none-any.whl.metadata (6.8 kB)
Downloading elasticsearch-7.0.1-py2.py3-none-any.whl (83 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.2/83.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: elasticsearch
Successfully installed elasticsearch-7.0.1


In [2]:
import os
import json
from elasticsearch import Elasticsearch

In [3]:
def create_index_if_not_exists(es, index_name):
    """
    Create an Elasticsearch index unless one with that name already exists.

    The outcome (created, already present, or creation failed) is reported
    with print(); a failure while creating the index is caught and printed
    instead of being raised.

    Parameters:
    - es: an initialised Elasticsearch client object.
    - index_name: name of the Elasticsearch index to create.

    Returns:
    - None
    """
    # Guard clause: nothing to do when the index is already there.
    if es.indices.exists(index=index_name):
        print(f"Index '{index_name}' already exists.")
        return

    try:
        es.indices.create(index=index_name)
        print(f"Index '{index_name}' created successfully.")
    except Exception as exc:
        # Best-effort: report the failure and let the caller carry on.
        print(f"Error creating index '{index_name}': {str(exc)}")


In [4]:
def load_asr_backup_to_elasticsearch(backup_file_path, es, index_name):
    """
    Load the documents stored in an ASR backup JSON file into Elasticsearch.

    The file must contain a JSON list of dictionaries. Each dictionary is
    indexed under an id built from its 'video_id', 'start_frame' and
    'end_frame' fields (falling back to 'unknown', 0 and 0 when a field is
    missing). A document whose id is already present in the index is skipped.

    Every problem (missing file, unreadable or malformed JSON, wrong JSON
    structure, per-document Elasticsearch errors) is reported with print()
    rather than raised, so a caller looping over many files is not aborted
    by a single bad one.

    Parameters:
    - backup_file_path: path of the JSON backup file to load.
    - es: an initialised Elasticsearch client object.
    - index_name: name of the target index.

    Returns:
    - None
    """
    # Ensure the index exists
    create_index_if_not_exists(es, index_name)

    # Check if the backup file exists
    if not os.path.exists(backup_file_path):
        print(f"Error: Backup file {backup_file_path} does not exist.")
        return

    # Load the backup data. json.JSONDecodeError and UnicodeDecodeError are
    # both ValueError subclasses; OSError covers files that cannot be opened
    # or read. Report and bail out like the other failure modes here.
    try:
        with open(backup_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        print(f"Error: Could not read backup file {backup_file_path}: {str(e)}")
        return

    # The top-level JSON value must be a list of documents
    if not isinstance(data, list):
        print("Error: Loaded data is not a list. Check the JSON structure.")
        return

    for doc in data:
        # Only dictionaries can be indexed as documents
        if not isinstance(doc, dict):
            print(f"Error: Document is not a dictionary. Skipping: {doc}")
            continue

        # Build the document id from the identifying fields
        doc_id = f"{doc.get('video_id', 'unknown')}_frame_{doc.get('start_frame', 0)}_{doc.get('end_frame', 0)}"
        try:
            # Check if the document already exists in Elasticsearch
            if es.exists(index=index_name, id=doc_id):
                print(f"Document {doc_id} already exists. Skipping...")
                continue

            print(f"Document {doc_id} does not exist. Indexing new document...")
            es.index(index=index_name, id=doc_id, body=doc)
        except Exception as e:
            # Best-effort: one failing document must not stop the rest
            print(f"Error checking/adding document {doc_id}: {str(e)}")


# Cách sử dụng cloudflare
- B1: Cài đặt cloudflared về máy
```
  winget install --id Cloudflare.cloudflared
```
  - Note: Kiểm tra cloudflared đã được cài đặt chưa
  ```
    cloudflared --version
  ```
- B2: Chạy Tunnel
```
  cloudflared tunnel --url http://localhost:9200
```
- B3: Lấy host (URL `trycloudflare.com` mà cloudflared in ra) rồi dán vào `Elasticsearch([...])` ở cell bên dưới:
```
    Ví dụ:
    Your quick Tunnel has been created! Visit it at (it may take some time to be reachable):
    https://dome-disciplines-privilege-exhibitions.trycloudflare.com
```

In [5]:
# Example usage: push the transcription backups of videos L01_V001..L01_V031
# through the Cloudflare quick tunnel into a single index.
es = Elasticsearch(['https://nvidia-paintball-futures-mortality.trycloudflare.com'])  # fill in the tunnel host here
index_name = 'asr_video_1'
for i in range(1, 32):
    # Zero-padded video tag shared by the input file name and the progress line
    video_tag = f"L01_V0{str(i).zfill(2)}"
    backup_file_path = f'/kaggle/input/output-asr/asr/output/{video_tag}_transcription.json'
    load_asr_backup_to_elasticsearch(backup_file_path, es, index_name)
    print(f"Done {video_tag}")

Index 'asr_video_1' already exists.
Document L01_V001_frame_0_125 already exists. Skipping...
Document L01_V001_frame_225_350 already exists. Skipping...
Document L01_V001_frame_375_475 already exists. Skipping...
Document L01_V001_frame_475_575 already exists. Skipping...
Document L01_V001_frame_650_825 already exists. Skipping...
Document L01_V001_frame_825_900 already exists. Skipping...
Document L01_V001_frame_900_1050 already exists. Skipping...
Document L01_V001_frame_1050_1150 already exists. Skipping...
Document L01_V001_frame_1150_1250 already exists. Skipping...
Document L01_V001_frame_1250_1350 already exists. Skipping...
Document L01_V001_frame_1350_1450 already exists. Skipping...
Document L01_V001_frame_1450_1525 already exists. Skipping...
Document L01_V001_frame_1525_1600 already exists. Skipping...
Document L01_V001_frame_1600_1675 already exists. Skipping...
Document L01_V001_frame_1675_1750 already exists. Skipping...
Document L01_V001_frame_1750_1825 already exists.