In [1]:
from qdrant_client import QdrantClient
from zep_cloud.client import Zep
import openai

In [2]:
from dotenv import load_dotenv
import tiktoken
import os

load_dotenv()

True

In [3]:
client = QdrantClient(
    url="https://3973cdf9-4ba6-40b1-ae92-b2f952f82fb9.europe-west3-0.gcp.cloud.qdrant.io:6333", 
    api_key=os.getenv("QDRANT_CLOUD_API_KEY"),
)

In [None]:
zep_client = Zep(
    api_key=os.environ.get('ZEP_API_KEY'),
)

In [None]:
# create user if user not created

zep_client.user.add(
    email="test@email.com",
    first_name="Test",
    last_name="User",
    user_id="user_1", # do not change the id
)

In [4]:
print(client.get_collections())

collections=[]


In [5]:
import os
import re

In [6]:
def load_transcripts(data_dir):
    transcripts = []
    for file_name in os.listdir(data_dir):
        if file_name.endswith(".txt"):
            with open(os.path.join(data_dir, file_name), 'r', encoding='utf-8') as f:
                transcripts.append(f.read())
    return transcripts

In [7]:
def parse_transcript(transcript):
    title_match = re.search(r"Title: (.+)", transcript)
    url_match = re.search(r"URL Source: (.+)", transcript)
    content_match = re.search(r"Markdown Content:(.+)", transcript, re.DOTALL)

    return {
        "title": title_match.group(1) if title_match else None,
        "url": url_match.group(1) if url_match else None,
        "content": content_match.group(1).strip() if content_match else None
    }

In [8]:
# Initialize the tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")

def get_token_count_by_subtopic(subtopics):
    token_counts = []
    for subtopic in subtopics:
        content = ' '.join(subtopic['content'])
        tokens = tokenizer.encode(content)
        token_counts.append({
            'subtopic': subtopic['subtopic'],
            'token_count': len(tokens)
        })
    return token_counts

In [9]:
def chunk_text(text, max_tokens=500, min_tokens=300):
    # Tokenize the input text
    tokens = tokenizer.encode(text)
    chunks = []
    current_chunk = []
    for token in tokens:
        current_chunk.append(token)
        # If the current chunk exceeds the max token limit
        if len(current_chunk) >= max_tokens:
            chunks.append(current_chunk)
            current_chunk = []
    # Handle the last chunk, ensure it meets the minimum size requirement
    if current_chunk:
        if len(current_chunk) < min_tokens and chunks:
            # If the last chunk is smaller than the minimum, merge it with the previous chunk
            chunks[-1].extend(current_chunk)
        else:
            chunks.append(current_chunk)
    return [tokenizer.decode(chunk) for chunk in chunks]

In [10]:
def parse_and_chunk_transcript_by_subtopic(data):
    transcript = data["content"]
    # Regex to find subtopics (e.g., Introduction, Education)
    subtopic_pattern = re.compile(r"^(.*)\n-+\n", re.MULTILINE)
    # Regex to capture speaker dialogue (e.g., Destiny [(00:00:00)]...)
    dialogue_pattern = re.compile(r"(?P<speaker>\w+)\s\[\((?P<timestamp>\d{2}:\d{2}:\d{2})\)\]\((?P<url>https:\/\/youtube\.com\/watch\?v=[^&]+&t=\d+)\)\s(?P<text>.+)")
    
    chunks = []

    subtopics = subtopic_pattern.split(transcript)

    for i in range(1, len(subtopics), 2):
        subtopic = subtopics[i].strip()
        # print(subtopic)

        content_block = subtopics[i + 1] if i + 1 < len(subtopics) else ""

        # update the current subtopic
        current_subtopic = subtopic
        
        # Find all dialogues within this subtopic
        dialogues = dialogue_pattern.findall(content_block)

        # print(dialogues)

        formatted_text = []
        speakers = []
        tstamp = None
        for dialogue in dialogues:
            speaker, timestamp, url, text = dialogue
            while tstamp == None:
                tstamp = f"[({timestamp})]({url})"

            if speaker not in speakers:
                speakers.append(speaker)

            formatted_text.append(f"{speaker}: {text} \n")
        
        # token count
        tokens_enc = tokenizer.encode(' '.join(formatted_text))
        tok_count = len(tokens_enc)

        if tok_count > 500:
            token_chunks = chunk_text(' '.join(formatted_text))
            for chunk in token_chunks:
                current_chunk = {
                    "subtopic": subtopic,
                    "content": chunk,
                    "metadata": {
                        "speakers": speakers,
                        "dialogue_count": len(chunk),
                        "title": data["title"],
                        "url": data["url"],
                        "timestamp": tstamp
                    }
                }
                chunks.append(current_chunk)
        else:
            current_chunk = {
                "subtopic": subtopic,
                "content": formatted_text,
                "metadata": {
                    "speakers": speakers,
                    "dialogue_count": len(formatted_text),
                    "title": data["title"],
                    "url": data["url"],
                    "timestamp": tstamp
                }
            }
            chunks.append(current_chunk)
    return chunks

In [11]:
from qdrant_client.models import Distance, VectorParams, Batch
from qdrant_client import models
import requests

In [12]:
from typing import List

In [13]:
openai_client = openai.Client()

openai.api_key = os.getenv("OPENAI_API_KEY")

VECTOR_SIZE = 1536  # Size of OpenAI's text-embedding-3-small model output

def get_embedding(text: str) -> List[float]:
    """Get OpenAI embedding for the given text."""

    response = openai_client.embeddings.create(input=text, model="text-embedding-3-small")
    return response.data[0].embedding


In [14]:
embe = get_embedding("hello world")

In [15]:
len(embe)

1536

In [16]:
def create_collections(collection_name: str, vector_size = 1536):
    "Create new collection in qdrant cloud"
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=vector_size, 
            distance=models.Distance.COSINE,
            hnsw_config=models.HnswConfigDiff(
                m=16,
                ef_construct=100,
                full_scan_threshold=10000,
                max_indexing_threads=0
            )
        )
    )
    
    # Create indexes on metadata fields and full text
    client.create_payload_index(
        collection_name=collection_name,
        field_name="subtopic",
        field_schema=models.PayloadSchemaType.KEYWORD
    )
    client.create_payload_index(
        collection_name=collection_name,
        field_name="speakers",
        field_schema=models.PayloadSchemaType.KEYWORD
    )
    client.create_payload_index(
        collection_name=collection_name,
        field_name="title",
        field_schema=models.PayloadSchemaType.KEYWORD
    )
    client.create_payload_index(
        collection_name=collection_name,
        field_name="content",
        field_schema=models.PayloadSchemaType.TEXT
    )


In [17]:
import uuid

In [18]:
transcripts = load_transcripts("../data")

In [19]:
COLLECTION_NAME = "podcasts"
create_collections(COLLECTION_NAME, VECTOR_SIZE)

In [20]:
for transcript in transcripts:
    data = parse_transcript(transcript)
    chunks = parse_and_chunk_transcript_by_subtopic(data)
    for chunk in chunks:
        try:
            vector = get_embedding(chunk["content"])
        except Exception as e:
            # drop empty chunks
            print(f"Error getting embedding for {chunk['content']}")
            print(e)
            continue
        client.upsert(
            collection_name="podcasts",
            points=[
                models.PointStruct(
                    id=str(uuid.uuid4()),
                    vector=vector,
                    payload={
                        "subtopic": chunk["subtopic"],
                        "speakers": chunk["metadata"]["speakers"],
                        "content": chunk["content"],
                        "title": chunk["metadata"]["title"],
                        "url": chunk["metadata"]["url"],
                        "timestamp": chunk["metadata"]["timestamp"]
                    }
                )
            ]
        )

Error getting embedding for []
Error code: 400 - {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.", 'type': 'invalid_request_error', 'param': None, 'code': None}}


In [49]:
cnt = 0
chunk = None
for transcript in transcripts:
    data = parse_transcript(transcript)
    chunks = parse_and_chunk_transcript_by_subtopic(data)
    cnt += len(chunks)
    if cnt == 10:
        chunk = chunks

In [50]:
cnt

180