In [1]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from fastembed import TextEmbedding
from qdrant_client import QdrantClient, models

In [3]:
import os
from huggingface_hub import login

huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
login(token=huggingface_token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Embed into Database

In [4]:
model = TextEmbedding(model_name="BAAI/bge-base-en-v1.5")
print("The model BAAI/bge-small-en-v1.5 is ready to use.")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/218M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

The model BAAI/bge-small-en-v1.5 is ready to use.


In [3]:
df = pd.read_csv('podcast_data.csv')

In [4]:
df.head()

Unnamed: 0,episode,title,summary,song_recommendation
0,1,沒錄到後半部的NMSL前世今生,NMSL你媽死了 最早是在CS看到的 msl是一個丹麥二線玩家（通常是一線貼紙比較貴）一個團...,
1,2,我實在是想不到比多人運動更委婉的說法,A 片打炮算偷情 戴 VR 做愛舉報通姦罪 做公投 究極 no 選擇 用 VR 跟老婆做愛 ...,
2,3,你聽過我爸連戰嗎?,強烈要求清大人社把學費還給艾莉莎莎，宿舍費不退 友人爭論時很愛拿綽號倒過來念來 不好笑的人都...,
3,4,抄襲仔求生指南與周遊記挑戰失敗,Lofi house 便當怎麼抄襲都沒差 破鍋配爛蓋 不流汗的室內設計 風格師 5566 抄...,
4,5,傑出人士表揚，李來希先生。,podcast 比 yt 好賺的原因 頻率 光頭葛格 抖內30元 祝生日快樂 先射箭再畫靶 ...,


In [6]:
def dataframe_to_points(model, df):
    points = []

    for _, row in tqdm(df.iterrows(), desc="Embedding rows of DataFrame"):
        # Create a list of "column:value" pairs for each row
        id_value = row['episode']
        
        pairs = [f"{col}:{val}" for col, val in row.items() if col != 'episode']
        
        # Join these pairs with commas
        row_text = ";".join(pairs)

        embedding = list(model.embed(row_text))[0]

        points.append(
            models.PointStruct(
                id=id_value,
                vector=embedding
            )
        )

    return points

In [8]:
embed_points = dataframe_to_points(model, df)

Embedding rows of DataFrame: 444it [02:11,  3.37it/s]


In [6]:
client = QdrantClient(url="http://localhost:6333")

In [7]:
client.collection_exists(collection_name="podcast_collection")

False

In [8]:
with open('podcast_topic.json', 'r', encoding='utf-8') as f:
    podcast_topic = json.load(f)

In [22]:
def json_to_points(json_data):
    """
    Converts the podcast episode JSON data into separate text strings for each episode.
    
    Args:
        json_data (dict): JSON dictionary containing episode data
        
    Returns:
        points: 
    """
    points = []
    
    for episode_id, content in tqdm(json_data.items()):
        episode_text = []
        
        # Add topics if available
        if 'topic' in content and content['topic']:
            topics_text = ', '.join(content['topic'])
            episode_text.append(topics_text)
        
        # Add song if available
        if 'song' in content and content['song']:
            episode_text.append(f"Song: {content['song']}")

        embedding = list(model.embed('\n'.join(episode_text)))[0]
        
        points.append(
            models.PointStruct(
                id=int(episode_id),
                vector=embedding
            )
        )
        
    return points

In [18]:
# Try to create collection (ignore if exists)
try:
    client.create_collection(
        collection_name="podcast_topic_collection",
        vectors_config=models.VectorParams(
            size=768,
            distance=models.Distance.COSINE
        )
    )
    print(f"Created new collection: podcast_topic_collection")
except Exception as e:
    print(f"Collection may already exist: {e}")

Created new collection: podcast_topic_collection


In [23]:
embed_points = json_to_points(podcast_topic)

100%|███████████████████████████████████████████████████████████████████████| 411/411 [00:57<00:00,  7.11it/s]


In [24]:
client.upsert(
    collection_name="podcast_topic_collection",
    points=embed_points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [4]:
client = QdrantClient(url="http://localhost:6333")

# Get vector dimension
sample_text = f"title:{df.iloc[0]['title']}; summary:{df.iloc[0]['summary']}"
sample_vector = list(model.embed(sample_text))[0]
vector_size = len(sample_vector)

# Try to create collection (ignore if exists)
try:
    client.create_collection(
        collection_name="podcast_collection",
        vectors_config=models.VectorParams(
            size=vector_size,
            distance=models.Distance.COSINE
        )
    )
    print(f"Created new collection: podcast_collection")
except Exception as e:
    print(f"Collection may already exist: {e}")

NameError: name 'model' is not defined

In [13]:
client.upsert(
    collection_name="podcast_collection",
    points=embed_points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
# def setup_qdrant(podcast_df, model, collection_name="podcast_collection"):
#     """Create a Qdrant collection and upload podcast vectors."""
#     # Initialize Qdrant client
#     client = QdrantClient(url="http://localhost:6333")
    
#     # Get vector dimension
#     sample_text = f"title:{podcast_df.iloc[0]['title']}; summary:{podcast_df.iloc[0]['summary']}"
#     sample_vector = list(model.embed(sample_text))[0]
#     vector_size = len(sample_vector)
    
#     # Try to create collection (ignore if exists)
#     try:
#         client.create_collection(
#             collection_name=collection_name,
#             vectors_config=models.VectorParams(
#                 size=vector_size,
#                 distance=models.Distance.COSINE
#             )
#         )
#         print(f"Created new collection: {collection_name}")
#     except Exception as e:
#         print(f"Collection may already exist: {e}")
    
#     # Embed and upload each podcast episode
#     points = []
#     for _, row in tqdm(podcast_df.iterrows(), total=len(podcast_df), desc="Embedding podcast episodes"):
#         episode_id = row['episode']
#         title = row['title'] if not pd.isna(row['title']) else ""
#         summary = row['summary'] if not pd.isna(row['summary']) else ""
        
#         # Create a text representation of the episode
#         text_representation = f"title:{title}; summary:{summary}"
        
#         # Embed the text
#         embedding = list(model.embed(text_representation))[0]
        
#         # Create a point
#         points.append(
#             models.PointStruct(
#                 id=int(episode_id),
#                 vector=embedding,
#                 payload={
#                     'episode': int(episode_id),
#                     'title': title,
#                     'summary': summary
#                 }
#             )
#         )
    
#     # Upload points in batches
#     batch_size = 100
#     for i in range(0, len(points), batch_size):
#         batch = points[i:i+batch_size]
#         client.upsert(
#             collection_name=collection_name,
#             points=batch
#         )
    
#     print(f"Uploaded {len(points)} episodes to Qdrant.")
#     return client

In [29]:
with open('podcast_ground_truth.json', 'r', encoding='utf-8') as f:
    ground_truth = json.load(f)

In [31]:
# Step 4: Evaluate retrieval using ground truth sentences
def evaluate_retrieval(client, model, ground_truth, collection_name="podcast_topic_collection", k=5):
    """
    Evaluate retrieval performance using ground truth sentences.
    
    For each sentence in the ground truth data, embed it and query Qdrant.
    Check if the correct episode is in the top k results.
    """
    # Prepare to store results
    all_relevance = []
    
    # Process each episode and its sentences
    for episode_id, episode_data in tqdm(ground_truth.items(), desc="Evaluating episodes"):
        sentences = episode_data.get('sentence', [])
        
        for sentence in sentences:
            # Embed the sentence
            query_vector = list(model.embed(sentence))[0]
            
            # Query Qdrant
            search_results = client.search(
                collection_name=collection_name,
                query_vector=query_vector,
                limit=k
            )
            
            # Check if the correct episode is in the results
            result_ids = [result.id for result in search_results]
            relevance = [int(episode_id) == result_id for result_id in result_ids]
            
            all_relevance.append(relevance)
    
    return all_relevance

In [26]:
# Step 5: Calculate metrics
def calculate_metrics(relevance_results):
    """Calculate hit rate and MRR from relevance results."""
    # Hit rate (Recall@k)
    hit_count = sum(1 for relevance in relevance_results if True in relevance)
    hit_rate = hit_count / len(relevance_results) if relevance_results else 0
    
    # Mean Reciprocal Rank (MRR)
    mrr_sum = 0
    for relevance in relevance_results:
        for i, is_relevant in enumerate(relevance):
            if is_relevant:
                mrr_sum += 1 / (i + 1)
                break
    
    mrr = mrr_sum / len(relevance_results) if relevance_results else 0
    
    return {
        'hit_rate': hit_rate,
        'mrr': mrr,
        'total_queries': len(relevance_results),
        'successful_queries': hit_count
    }


In [33]:
# Step 6: Evaluate for different k values
def evaluate_multiple_k(client, model, ground_truth, collection_name="podcast_topic_collection", k_values=[3, 5, 10]):
    """Evaluate retrieval for different k values."""
    results = {}
    
    for k in k_values:
        print(f"\nEvaluating for k={k}")
        relevance_results = evaluate_retrieval(client, model, ground_truth, collection_name, k)
        metrics = calculate_metrics(relevance_results)
        
        results[k] = metrics
        print(f"Hit Rate: {metrics['hit_rate']:.4f}, MRR: {metrics['mrr']:.4f}")
    
    return results

In [34]:
# Evaluate for different k values
print("Evaluating search performance...")
results = evaluate_multiple_k(client, model, ground_truth)

# Print final results
print("\nFinal Evaluation Results:")
print("========================")
for k, metrics in results.items():
    print(f"k={k}:")
    print(f"  Hit Rate: {metrics['hit_rate']:.4f}")
    print(f"  MRR:      {metrics['mrr']:.4f}")
    print(f"  ({metrics['successful_queries']} out of {metrics['total_queries']} queries successful)")


Evaluating search performance...

Evaluating for k=3


  search_results = client.search(
Evaluating episodes: 100%|██████████████████████████████████████████████████| 411/411 [03:50<00:00,  1.78it/s]


Hit Rate: 0.2092, MRR: 0.1672

Evaluating for k=5


Evaluating episodes: 100%|██████████████████████████████████████████████████| 411/411 [04:06<00:00,  1.67it/s]


Hit Rate: 0.2511, MRR: 0.1768

Evaluating for k=10


Evaluating episodes: 100%|██████████████████████████████████████████████████| 411/411 [03:07<00:00,  2.20it/s]

Hit Rate: 0.3148, MRR: 0.1855

Final Evaluation Results:
k=3:
  Hit Rate: 0.2092
  MRR:      0.1672
  (430 out of 2055 queries successful)
k=5:
  Hit Rate: 0.2511
  MRR:      0.1768
  (516 out of 2055 queries successful)
k=10:
  Hit Rate: 0.3148
  MRR:      0.1855
  (647 out of 2055 queries successful)



