In [1]:
import requests
from bs4 import BeautifulSoup

url = "https://www.youtube.com/live/H8S9xg8iYuc?si=B_NMiiS7DPvN99-_"

# Download the video page. The timeout keeps the cell from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors instead of
# letting an error page be parsed as if it were the video page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# A page without a usable <title> would otherwise fail with an opaque
# AttributeError on `.string` / `.replace`; fail with a clear message instead.
if soup.title is None or soup.title.string is None:
    raise ValueError(f"Could not find a <title> element in the page at {url}")

# Drop the " - YouTube" suffix from the page title and trim whitespace.
title = soup.title.string.replace(" - YouTube", "").strip()

In [2]:
import re

# get video title name
def clean_title(text):
    """Return `text` reduced to a compact, file-name-friendly form.

    Keeps only ASCII letters, digits, dots, and characters in the Thai
    Unicode block (U+0E00-U+0E7F); everything else, including whitespace,
    is removed.
    """
    disallowed = re.compile(r'[^0-9a-zA-Z\u0E00-\u0E7F\.]')
    return disallowed.sub('', text)
title = clean_title(text=title)

# get video id: the 11-character token that follows "v=" or a "/" in the URL
match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", url)
if match is None:
    # Fail here, with a clear message, rather than later with a NameError
    # (or, worse, silently reusing a stale `video_id` from an earlier run).
    raise ValueError(f"Could not extract a YouTube video id from URL: {url}")
video_id = match.group(1)

In [3]:
from youtube_transcript_api import YouTubeTranscriptApi

# Fetch transcript (auto-captions or uploaded), requesting Thai first, then English
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['th', 'en'])

# Save to file, one "<start>s: <text>" line per transcript entry
file_name = f'{title}_{video_id}'
with open(f"{file_name}_subtitle.txt", "w", encoding="utf-8") as f:
    for entry in transcript:
        # Caption text may contain line breaks. Collapse all whitespace so each
        # entry occupies exactly one line; otherwise the continuation lines
        # carry no timestamp and are dropped when the file is parsed line by line.
        text = " ".join(entry['text'].split())
        f.write(f"{entry['start']:.2f}s: {text}\n")


In [4]:
import re

def load_subtitles(file_path):
    """Parse a subtitle text file into a list of timed entries.

    Each line is stripped and expected to look like ``<seconds>s: <text>``.
    Lines that do not match this shape are skipped.

    Args:
        file_path: path of the UTF-8 subtitle file to read.

    Returns:
        List of ``{'start': float, 'text': str}`` dicts in file order.
    """
    line_pattern = re.compile(r'([0-9.]+)s:\s(.+)')
    entries = []
    with open(file_path, encoding='utf-8') as handle:
        for raw_line in handle:
            hit = line_pattern.match(raw_line.strip())
            if not hit:
                continue
            stamp, body = hit.groups()
            entries.append({'start': float(stamp), 'text': body})
    return entries

def chunk_subtitles(subtitles, chunk_size=60, overlap=20):
    """
    Chunk subtitles into segments of `chunk_size` seconds with `overlap` seconds.

    Windows start at 0 and advance by ``chunk_size - overlap`` seconds until the
    window start passes the latest subtitle start time. A subtitle belongs to a
    window when ``window_start <= subtitle['start'] < window_end``. Windows that
    contain no subtitle are skipped.

    Args:
        subtitles: list of dicts with a numeric 'start' (seconds) and a 'text' string.
        chunk_size: window length in seconds.
        overlap: seconds shared by consecutive windows; must be smaller than
            `chunk_size`.

    Returns:
        List of dicts with 'start' and 'end' (window bounds) and 'text' (the
        window's subtitle texts joined by single spaces). Empty when
        `subtitles` is empty.

    Raises:
        ValueError: if ``chunk_size - overlap`` is not positive; the window
            would never advance and the loop would not terminate.
    """
    step = chunk_size - overlap
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    if not subtitles:
        return []

    chunks = []
    # Use the latest start time rather than the last element's, so input that
    # is not sorted by time is still fully covered.
    max_time = max(entry['start'] for entry in subtitles)
    start_time = 0

    while start_time <= max_time:
        end_time = start_time + chunk_size
        chunk_text = [
            entry['text']
            for entry in subtitles
            if start_time <= entry['start'] < end_time
        ]
        if chunk_text:
            chunks.append({
                'start': start_time,
                'end': end_time,
                'text': ' '.join(chunk_text)
            })
        start_time += step
    return chunks

# Read the saved subtitle file back in as a list of timed entries.
subtitles = load_subtitles(
    file_path=f"{file_name}_subtitle.txt",
)
# Group the entries into 30-second windows with 10 seconds of overlap.
# Despite its name, `chunk_dict` is a list of dicts (start / end / text).
chunk_dict = chunk_subtitles(
    subtitles=subtitles,
    chunk_size=30,
    overlap=10,
)

In [5]:
# Keep only the text of each window; these strings are what gets embedded.
chunks = [segment['text'] for segment in chunk_dict]

In [6]:
from sentence_transformers import SentenceTransformer

# NOTE(review): all-MiniLM-L6-v2 appears to be an English-centric model, while
# the transcript is requested with Thai as the first language. Confirm
# retrieval quality on Thai text, or consider a multilingual model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# An empty chunk list would otherwise fail below with an opaque indexing
# error on `embeddings[0]`; stop early with an actionable message instead.
if not chunks:
    raise ValueError("No subtitle chunks to embed; check the transcript and chunking steps.")

# Embed text: one vector per chunk
embeddings = embedding_model.encode(chunks)
print(f"Vector length: {len(embeddings[0])}")

  from .autonotebook import tqdm as notebook_tqdm


Vector length: 384


In [7]:
import chromadb

# Local vector DB backed by files under ./vector_database
# (PersistentClient is the newer client style).
chroma_client = chromadb.PersistentClient(
    path="./vector_database",
)

# Reuse the collection if it already exists, otherwise create it.
collection = chroma_client.get_or_create_collection(
    name="my_local_collection",
)

In [8]:
# Store the chunks in the collection (cloud collections work the same way).
#
# The store is persistent, so ids must be unique per video and writes must be
# repeatable. With fixed ids such as "doc_0", re-running this cell (or running
# it for another video) collides with records already on disk; depending on
# the Chroma version the new chunks are then rejected or silently ignored,
# leaving stale text to be retrieved. Scoping ids by `video_id` and using
# upsert makes the cell safe to re-run.
collection.upsert(
    ids=[f"{video_id}_{i}" for i in range(len(chunks))],  # unique per video + chunk
    embeddings=embeddings.tolist(),                        # must be list of lists!
    documents=chunks,                                      # optional, but useful
    metadatas=[{"source": video_id} for _ in chunks]       # records which video a chunk came from
)

In [9]:
# Recover the title part of `file_name`: everything before its first underscore.
title = file_name.partition('_')[0]

In [14]:
# Alternative, shorter question kept for reference:
# query = "what are the growth driven strategy in the future?"
query = """Analyze the company's revenue and profit performance over the past 4 quarters.
1. Compare YoY (year-over-year) and QoQ (quarter-over-quarter) changes.
2. Check whether the actual results met, exceeded, or missed previous guidance.
3. Highlight any unexpected spikes, drops, or anomalies.
4. If possible, explain the reasons behind these surprises based on management's comments."""

# Embed the question locally with the same model object used for the chunks.
query_embedding = embedding_model.encode([query])

# Ask the Chroma collection for the 20 closest stored chunks.
results = collection.query(n_results=20, query_embeddings=query_embedding.tolist())

# A single query was sent, so take the first entry of the returned documents
# and merge those chunks into one newline-separated context string.
contexts = results['documents'][0]
context_text = "\n".join(contexts)

In [15]:
import ollama

# Persona prompt. It is defined but not sent: the system message in the
# message list below is commented out.
system_prompt = """
you are the professional investor
"""

# Prompt that restricts the model to the retrieved context when answering.
user_prompt = f"""Answer the question below using ONLY the context below.

Context:
{context_text}

Question:
{query}

Answer:"""

# Single-turn conversation sent to the llama3 model through ollama.
chat_messages = [
    # {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]
response = ollama.chat(model="llama3", messages=chat_messages)

# The generated answer is under message -> content in the response.
summary = response["message"]["content"]

print(summary)

Based on the context provided, here is an analysis of the company's revenue and profit performance over the past 4 quarters:

1. YoY (year-over-year) changes:
The text does not provide specific numbers for revenue and profit growth year-over-year. However, it mentions that in Q4 of the previous year, the Redemption rate was higher than expected due to issues with app development.

2. QoQ (quarter-over-quarter) changes:
The text suggests that there was a significant increase in revenue and profit from Q1 to Q4 of the same year. In Q4, revenue growth reached 33%, which is a notable jump compared to previous quarters.

3. Actual results vs. guidance:
There is no specific mention of whether the actual results met, exceeded, or missed previous guidance. However, it seems that there were some unexpected changes in the business performance during certain periods.

4. Reasons behind surprises:
According to management's comments, one reason for the surprise in Q4 was issues with app development