In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Setup notebook
import sys
from os import chdir
from pathlib import Path

# Work from the repository root even when the kernel was started in notebooks/.
if Path.cwd().name == "notebooks":
    chdir("..")
base_path = Path.cwd()

# Put the project's src directory at the front of the import path (once only).
src_path = base_path / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Directory layout under data/; everything is created up front so that later
# cells can write into it without checking.
data_path = base_path / "data"
model_path = data_path / "models"
videos_path = data_path / "videos"
for directory in (data_path, model_path, videos_path):
    directory.mkdir(parents=True, exist_ok=True)

sample_video_path = videos_path / "sample_video.mp4"
transcript_file = data_path / "transcript.json"

# Set to True to fetch the artifact again even if a local copy already exists.
force_download_model = False
force_download_video = False

In [None]:
# Download model
# See https://github.com/ggml-org/whisper.cpp/blob/master/models/README.md
import urllib.request

# Whisper model to use; the file is cached under data/models between runs.
model_name = "ggml-medium-q5_0.bin"
model_file = model_path / model_name
if force_download_model or not model_file.exists():
    hf_base_url = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main"
    # Download into a temporary sibling and rename only after the transfer
    # finished. Writing straight to model_file would leave a truncated file
    # behind on an interrupted download, and the exists() check above would
    # then accept that broken file on every later run.
    partial_model_file = model_file.with_name(model_file.name + ".part")
    try:
        urllib.request.urlretrieve(f"{hf_base_url}/{model_name}", partial_model_file)
        partial_model_file.replace(model_file)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial_model_file.unlink(missing_ok=True)

In [None]:
# Download sample video
import urllib.request

if force_download_video or not sample_video_path.exists():
    url = "https://archive.org/download/of-science-and-scientists/Of%20Science%20And%20Scientists%2001%20Introduction.mp4"
    # Download into a temporary sibling and rename only after the transfer
    # finished, so an interrupted download can never be mistaken for a
    # complete video by the exists() check on a later run.
    partial_video_path = sample_video_path.with_name(sample_video_path.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial_video_path)
        partial_video_path.replace(sample_video_path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial_video_path.unlink(missing_ok=True)


In [None]:
# Transcribe Video
import json
from transcribe.transcriber import WhisperTranscriber

# Working directory handed to the transcriber; created if it does not exist.
work_dir = data_path / "work"
work_dir.mkdir(parents=True, exist_ok=True)

transcriber = WhisperTranscriber(
    work_dir=work_dir,
    whisper_bin=base_path / "development/whisper",
    model_path=data_path / "models",
    model_name=model_name,
)
transcript = transcriber.transcribe(video_path=sample_video_path)

# Keep a JSON copy of the transcript on disk next to the other data files.
with transcript_file.open("w") as out_file:
    out_file.write(json.dumps(transcript))

In [None]:
# Extract keywords via LLM
import os
from explain.openai import OpenAIExplainer

# Join the per-word transcript entries into a single space-separated text.
transcript_ft = " ".join(w["word"] for w in transcript["words"])

# Resolve the OpenAI API key exactly once: the local key file wins, and the
# OPENAI_API_KEY environment variable is used when the file is missing or
# contains only whitespace.
api_key_file = data_path / "openai_api_key.txt"
api_key = api_key_file.read_text().strip() if api_key_file.exists() else ""
if not api_key:
    api_key = os.environ.get("OPENAI_API_KEY", "")
if not api_key:
    # Fail here with an actionable message instead of a bare KeyError or an
    # authentication failure later inside the explainer.
    raise RuntimeError(
        f"No OpenAI API key found: write it to {api_key_file} "
        "or set the OPENAI_API_KEY environment variable."
    )
explainer = OpenAIExplainer(api_key=api_key)

explanation = explainer.explain(
    course_name="science",
    full_text=transcript_ft,
)
print(explanation)

In [None]:
from search.index import ElasticsearchConceptIndex

import os

# Connection settings default to the local development stack. They can be
# overridden through environment variables so that credentials for any other
# cluster never have to be written into the notebook.
index = ElasticsearchConceptIndex(
    hosts=[os.environ.get("ELASTICSEARCH_URL", "http://localhost:19200")],
    username=os.environ.get("ELASTICSEARCH_USERNAME", "elastic"),
    password=os.environ.get(
        "ELASTICSEARCH_PASSWORD", "dev-password-dont-use-in-production"
    ),
)
# NOTE(review): drop_index=True is passed on every run; presumably this
# deletes any existing index contents. Confirm before pointing the variables
# above at a cluster that holds data worth keeping.
index.setup(drop_index=True)

In [None]:
from explain.openai import Concept

# Identifiers passed to the index for every concept of this video.
video_id = "12345"
course_id = "science"
# Total length of the video as reported by the transcript (presumably in
# milliseconds, judging by the key name); used to derive per-concept durations.
video_length = transcript["duration_ms"]


def idx(
    concept: Concept,
    newly_introduced: bool = False,
    further_discussed: bool = False,
    just_mentioned: bool = False,
):
    """Store one concept in the search index for the current video.

    The course and video identifiers as well as the video length are read
    from the notebook globals ``course_id``, ``video_id`` and
    ``video_length``.

    Args:
        concept: Concept whose name, description, keywords and share are
            indexed.
        newly_introduced: Forwarded unchanged as the flag of the same name.
        further_discussed: Forwarded unchanged as the flag of the same name.
        just_mentioned: Forwarded unchanged as the flag of the same name.
    """
    document = dict(
        course_id=course_id,
        video_id=video_id,
        concept_name=concept.name,
        description=concept.description,
        keywords=concept.keywords,
        share=concept.share,
        # The concept's share of the video length, truncated to an integer.
        duration_ms=int(video_length * concept.share),
        newly_introduced=newly_introduced,
        further_discussed=further_discussed,
        just_mentioned=just_mentioned,
    )
    index.index(**document)

# Each category attribute on the explanation shares its name with the idx()
# flag that marks it, so one loop covers all three categories in order.
for category in ("newly_introduced", "further_discussed", "just_mentioned"):
    for concept in getattr(explanation, category):
        idx(concept, **{category: True})

In [None]:
# Fetch every keyword the index holds for this course and show them.
all_keywords = index.get_all_keywords(course_ids=[course_id])
print(all_keywords)

In [None]:
# Example query; as the cell's last expression the result is displayed.
index.search(keywords=["inference"])