In [None]:
import polars as pl
from openai import OpenAI
from dotenv import load_dotenv
import os

import minsearch

from qdrant_client import QdrantClient, models
import re
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fastembed import TextEmbedding
import numpy as np
from typing import List, Dict

In [None]:
# initialise gpt
load_dotenv()  # loads from .env
client = OpenAI()

In [103]:
qd_client = QdrantClient("http://localhost:6333")

In [None]:
# sample dataset due to limits for llm
df = pl.read_csv("../data/japantravel_posts_with_comments.csv")

df = df.with_columns(pl.col('selftext').fill_null(''),
                     pl.col('comment1').fill_null(''),
                     pl.col('comment2').fill_null(''),
                     pl.col('comment3').fill_null(''),
                     pl.col('comment4').fill_null(''),
                     pl.col('comment5').fill_null(''))
df = df.with_columns(
    (
        "Comment 1: " + pl.col("comment1") + "\n" +
        "Comment 2: " + pl.col("comment2") + "\n" +
        "Comment 3: " + pl.col("comment3") + "\n" +
        "Comment 4: " + pl.col("comment4") + "\n" +
        "Comment 5: " + pl.col("comment5")
    ).alias("comments_combined")
    ).drop(["comment1", "comment2", "comment3", "comment4", "comment5"])

RANDOM_SEED = 42
n_samples = 80
df_sampled = df.sample(n=n_samples, seed=RANDOM_SEED)
documents = df_sampled.to_dicts()

In [None]:
df_ground_truth = pl.read_csv('../data/ground_truth.csv')

ids_in_sample = df_sampled['id'].to_list()
df_ground_truth_sampled = df_ground_truth.filter(pl.col("id").is_in(ids_in_sample))
ground_truth = df_ground_truth_sampled.to_dicts()

In [212]:
df_sampled.head(5)

id,title,selftext,url,comments_combined
str,str,str,str,str
"""1n7nuhb""","""2 Week Itinerary Feedback""","""Hi guys, I’m looking for some …","""https://www.reddit.com/r/Japan…","""Comment 1: This feels like a l…"
"""1fgrwdx""","""Back-to-back Kusatsu Onsen and…","""In Nagano, we didn't find much…","""https://www.reddit.com/r/Japan…","""Comment 1: There are some grea…"
"""1n0490k""","""Itinerary and Suggestions - Si…","""Hello, My sister and I are go…","""https://www.reddit.com/r/Japan…","""Comment 1: I see some amount o…"
"""1nd0k84""","""3 week Itinerary feedback requ…","""# Day 1 — Arrival & Shibuya/S…","""https://www.reddit.com/r/Japan…","""Comment 1: Comment 2: Commen…"
"""1n8j5t2""","""Itinerary for relaxed first ti…","""Hello everyone! My friend an…","""https://www.reddit.com/r/Japan…","""Comment 1: I would move all yo…"


# Minsearch

In [97]:
def search(query):
    boost = {'title':1.3, 'selftext': 1.2}

    results = index.search(
        query = query,
        boost_dict = boost,
        num_results = 5
    )

    return results

index = minsearch.Index(
        text_fields=["title", "selftext", "comments_combined"],
        keyword_fields = ["id", "url"]
        )
index.fit(documents)

<minsearch.minsearch.Index at 0x1895a640bb0>

In [92]:
def build_prompt(search_results, query):
    prompt_template = """"
    You are a helpful travel guide for people visiting Japan. 
    Answer the QUESTION based on the CONTEXT from travel discussions and experiences.  
    Use only the facts from the CONTEXT when you are answering the QUESTION.  
    If the CONTEXT does not contain enough information, say that you don’t know.  

    QUESTION: {question}  

    CONTEXT: {context}

    """.strip()

    max_docs = len(search_results)
    context_parts = []
    for i, doc in enumerate(search_results[:max_docs], start=1):
        part = f"""
            Document {i}:
            Title: {doc.get('title', '')}
            Post: {doc.get('selftext', '')}
            Comments: {doc.get('comments_combined', '')}  
        """
        context_parts.append(part.strip())
        
    context = "\n\n".join(context_parts)
    prompt = prompt_template.format(question=query, context=context)
    return prompt


In [13]:
def llm(prompt):
    
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

In [14]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(search_results, query)
    answer = llm(prompt)
    return answer

In [None]:
relevance_total = []
llm_answers = []

for r in ground_truth: 
    doc_id = r['id']

    results = search(r['question'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

    llm_ans = rag(r['question'])
    llm_answers.append(llm_ans)

In [99]:
print(f"number of llm-generated questions answered: {len(llm_answers)}")
print(f"total number of llm-generated questions: {len(df_ground_truth)}")

number of llm-generated questions answered: 400
total number of llm-generated questions: 2225


In [100]:
def mrr(relevance_total):
    total_score = 0.0
    
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank+1)

    return total_score / len(relevance_total)

mrr(relevance_total)

0.30008333333333337

In [101]:
# calculate cosine similarity
final_answer_number = len(llm_answers)

corpus = [doc["comments_combined"] for doc in documents] 

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

ground_truth_ans = df_ground_truth.select(pl.col('answer')).to_series().to_list()[:final_answer_number]
gt_vectors = vectorizer.transform(ground_truth_ans)
llm_vectors = vectorizer.transform(llm_answers[:final_answer_number])

# gt_vectors and llm_vectors are sparse matrices of shape (n, vocab_size)
similarities = []
for i in range(len(ground_truth_ans)):
    sim = cosine_similarity(gt_vectors[i], llm_vectors[i])[0][0]
    similarities.append(sim)

print(f"Cosine similarity: {round(sum(similarities)/len(similarities), 4)}")

Cosine similarity: 0.0366


In [102]:
for i in range(0,5):
    print(f"Ground truth ans: {ground_truth_ans[i]}")
    print(f"LLM ans retrieval: {llm_answers[i]}")
    print()

Ground truth ans: The traveler unexpectedly joined a local Japanese family for dinner, sharing stories and laughter with them for several hours.
LLM ans retrieval: If you develop a UTI during your trip to Kyoto, you should consider visiting Kajita Urology, which has been highly recommended by others. Make sure to bring your passport or a valid driver's license for identification. While they may have a wait time of about 2 hours, the process is generally efficient. The clinic is open from 9:00-13:00 and 16:30-19:30, so plan accordingly. Additionally, to manage pain and symptoms before your appointment, you could try a herbal medication drink called "JinSenSan," which has been reported to help alleviate symptoms. Be prepared to pay around 5,700 yen ($37) for the consultation, urine test, and antibiotics.

Ground truth ans: Despite minimal Japanese language skills and the locals' limited English, they were able to communicate and form a connection through simple conversations and translat

## Vector Search

In [None]:
collection_name = "travel-rec-sparse"
qd_client.delete_collection(collection_name)

EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

qd_client.create_collection(
    collection_name=collection_name,
    vectors_config = models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  
        distance=models.Distance.COSINE  
    )
)

True

In [None]:
# ----------------
# Chunking Function
# ----------------
def chunk_text(
    text: str, 
    doc_id: str, 
    chunk_size: int = 200, 
    overlap: int = 20
) -> List[Dict]:
    """Split text into overlapping chunks of words with unique IDs."""
    words = re.split(r"\s+", text.strip())
    
    chunks = []
    start = 0
    chunk_num = 0

    while start < len(words):
        end = min(start + chunk_size, len(words))
        chunk_words = words[start:end]
        chunk_text = " ".join(chunk_words)

        chunks.append({
            "id": f"{doc_id}_{chunk_num}",
            'doc_id': doc_id,
            "text": chunk_text
        })

        chunk_num += 1

        if end == len(words) or len(words) - end <= overlap:
            break
        start = end - overlap
    
    return chunks


# ----------------
# Embedding + Upsert
# ----------------
def upsert_documents(collection_name, docs):

    all_chunks = []
    for doc in docs:
        text = f"Title: {doc['title']}\n\nContent: {doc['selftext']}\n\nComments:\n{doc['comments_combined']}"
        chunks = chunk_text(text, doc['id'], chunk_size=200, overlap=20)
        all_chunks.extend(chunks)

    # Build Qdrant points
    points = [
        models.PointStruct(
            id=i, 
            vector=models.Document(text = doc['text'], model = model_handle),
            payload={"id": doc["id"], "doc_id": doc["doc_id"], "text": doc["text"]}
        )
        for i, doc in enumerate(all_chunks)
    ]

        # Upsert into Qdrant
    qd_client.upsert(
        collection_name=collection_name,
        points=points
    )

In [108]:
upsert_documents(collection_name, documents)

In [109]:
def vector_search(query, limit = 1):
    results = qd_client.query_points(
        collection_name=collection_name,
        query = models.Document(
            text = query,
            model = model_handle
        ),
        limit = limit,
        with_payload = True
    )
    return results.points

In [125]:
def build_prompt_vector_search(search_results, query):
    prompt_template = """"
    You are a helpful travel guide for people visiting Japan. 
    Answer the QUESTION based on the CONTEXT from travel discussions and experiences as concise as possible.  
    Use only the facts from the CONTEXT when you are answering the QUESTION.  
    If the CONTEXT does not contain enough information, say that you don’t know.  

    QUESTION: {question}  

    CONTEXT: {context}

    """.strip()

    max_docs = len(search_results)
    context_parts = []
    for i, result in enumerate(search_results[:max_docs], start=1):
        doc = result.payload
        part = f"""
            Document {i}:
            Text: {doc.get('text', '')}
            Doc_id: {doc.get('doc_id', '')}
            Id: {doc.get('id', '')}  
        """
        context_parts.append(part.strip())
        
    context = "\n\n".join(context_parts)
    prompt = prompt_template.format(question=query, context=context)
    return prompt


In [112]:
# test
query = 'give me a feasible itinerary'
limit = 3
search_results = vector_search(query, limit)
highest_score = search_results[0].score
print(f"The highest score is {highest_score}")
print(f"Doc_id: {search_results[0].payload['id']}")
print(search_results[0].payload['text'])

The highest score is 0.84673715
Doc_id: 1nbp784_0
Title: Sep. - Oct. itinerary (Kanazawa/Tokyo with day trips) Content: Apologies in advance for the novel. My dad (55M) and I (28F) will be visiting Japan for the first time in a couple of weeks (Sep. 24 - Oct. 3) and I’d love to get some feedback on our itinerary. Other than the specific questions peppered throughout, I’d also appreciate input on the general feasibility of these plans. I figure some things will need to go but don't yet know how to choose. Notes: * I’ve been asked why we’re spending so much time in Kanazawa, and the answer is that my dad has to be there for work/educational reasons. Some activities will be on my own and others we will do together. The Tokyo plans are all for us to do together. * We’re interested in arts, culture, stationery, and handicrafts; I’m also interested in history, anime, and video games. * I have a chronic illness that impacts my energy levels and will be renting a power wheelchair to have as ba

In [117]:
def rag_vector_search(query):
    search_results = vector_search(query, 1)
    prompt = build_prompt_vector_search(search_results, query)
    answer = llm(prompt)
    return answer

In [130]:
relevance_total = []
llm_answers = []

for r in ground_truth:
    doc_id = r['id']

    # get list for MRR
    results = vector_search(r['question'], 5)
    relevance = [d.payload['doc_id'] == doc_id for d in results]
    relevance_total.append(relevance)

    # get responses from llm
    llm_ans = rag_vector_search(r['question'])
    llm_answers.append(llm_ans)


In [131]:
def mrr(relevance_total):
    total_score = 0.0
    
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank+1)

    return total_score / len(relevance_total)

mrr(relevance_total)

0.6246249999999999

In [132]:
# calculate cosine similarity

# embed the vectors
model_handle = "jinaai/jina-embeddings-v2-small-en"
embedding_model = TextEmbedding(model_name = model_handle)
ground_truth_answers = df_ground_truth.select(pl.col('answer')).to_series().to_list()


# Encode ground truth and LLM answers
def cosine_similarity(ground_truth_ans, llm_ans):
    gt_emb = list(embedding_model.embed(ground_truth_ans))
    llm_emb = list(embedding_model.embed(llm_ans))

    # calculate cosine similarity
    def cosine(u, v):
        u_norm = np.sqrt(u.dot(u))
        v_norm = np.sqrt(v.dot(v))
        return u.dot(v) / (u_norm * v_norm)

    similarity_lst = []
    for i in range(len(llm_emb)):
        similarity = cosine(gt_emb[i], llm_emb[i])
        similarity_lst.append(similarity)

    return sum(similarity_lst) / len(similarity_lst)

In [None]:
cos_similarity = cosine_similarity(ground_truth_answers, llm_answers)
print(f"The cosine similarity is {round(cos_similarity, 4)}")

The cosine similarity is 0.7023


In [135]:
for i in range(10,15):
    print(f"Ground truth ans: {ground_truth_ans[i]}")
    print(f"LLM ans retrieval: {llm_answers[i]}")
    print()

Ground truth ans: You can refer to Japan-Guide.com, Accessible Japan, Accessible Travel Japan, and the Japan Accessible Tourism Center for information.
LLM ans retrieval: Watadzumi Shrine has prohibited entry to its grounds for all individuals except registered parishioners and devoted worshippers, due to a disrespectful incident by a foreign visitor. This includes banning photography, video recording, and participation in sightseeing tours.

Ground truth ans: He was last seen at Shin Kiba station on Platform 1.
LLM ans retrieval: The incident that prompted the shrine to restrict access to its grounds was an extremely serious and unacceptable act of disrespect committed within the sacred grounds by a foreign visitor on March 22.

Ground truth ans: He is 62 years old, about 5 feet 2 inches tall, and has a mustache.
LLM ans retrieval: The impact of disrespectful tourist behaviors on Watadzumi Shrine's operations has been significant. The shrine has prohibited entry to all tourists, restr

In [None]:
# save llm answers
# df_vector_search_results = df_ground_truth_sampled.with_columns(pl.Series(name="llm_answers", values=llm_answers)) 
# df_vector_search_results.write_csv("../data/vector_search_results.xlsx")

## Hybrid search

In [151]:
# Create the collection with specified sparse vector parameters
collection_name = "travel-rec-dense-and-sparse"
qd_client.delete_collection(collection_name)

EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"


qd_client.create_collection(
    collection_name=collection_name,
    vectors_config = {
        "jina-small": models.VectorParams(
            size=512,
            distance=models.Distance.COSINE,
        ),        
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        ),
    }
)

True

In [152]:
def upsert_documents_hybrid(collection_name, docs):

    all_chunks = []
    for doc in docs:
        text = f"Title: {doc['title']}\n\nContent: {doc['selftext']}\n\nComments:\n{doc['comments_combined']}"
        chunks = chunk_text(text, doc['id'], chunk_size=200, overlap=20)
        all_chunks.extend(chunks)

        # Build Qdrant points
    points = [
        models.PointStruct(
            id=i, 
            vector={
                "jina-small": models.Document(text = doc['text'], model = "jinaai/jina-embeddings-v2-small-en"),
                "bm25":models.Document(text = doc['text'], model = "Qdrant/bm25")},
            payload={"id": doc["id"], "doc_id": doc["doc_id"], "text": doc["text"]}
            
        )
        for i, doc in enumerate(all_chunks)
    ]

        # Upsert into Qdrant
    qd_client.upsert(
        collection_name=collection_name,
        points=points
    )

In [153]:
upsert_documents_hybrid(collection_name, documents)

In [154]:
# search
def hybrid_search(query, limit):
    results = qd_client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(query=models.Document(
                text=query,
                model='jinaai/jina-embeddings-v2-small-en',
            ),
            using='jina-small',
            limit=limit,
            ),
            models.Prefetch(query=models.Document(
                text=query,
                model='Qdrant/bm25',
            ),
            using='bm25',
            limit=limit)
        ],
        
        query = models.FusionQuery(fusion=models.Fusion.RRF),
        with_payload=True
    )
    return results.points

In [155]:
# test
query = 'give me a reasonably-paced itinerary for Tokyo'
limit = 1

results = hybrid_search(query,1)
results[0].payload['text']
results[0].payload['id']

'1n7nuhb_1'

In [None]:
def rag_hybrid_search(query):
    search_results = hybrid_search(query, 1)
    prompt = build_prompt_vector_search(search_results, query) 
    answer = llm(prompt)
    return answer

In [193]:
relevance_total = []
llm_answers = []
document_retrieved = []

for r in ground_truth:
    doc_id = r['id']

    # get list for MRR
    results = hybrid_search(r['question'], 5)
    relevance = [d.payload['doc_id'] == doc_id for d in results]
    relevance_total.append(relevance)

    # get responses from llm
    llm_ans = rag_hybrid_search(r['question'])
    llm_answers.append(llm_ans)

    # save id of documents retrieved
    result_docs = [d.payload['doc_id'] for d in results]
    document_retrieved.append(result_docs)

In [208]:
retrieved_docs_json = [json.dumps(sublist) for sublist in document_retrieved]


In [None]:
# use this to read the json string (replace retrieved_docs_json with the pd column)
retrieved_docs_back = [json.loads(x) for x in retrieved_docs_json]


In [None]:
# df_hybrid_search= df_ground_truth_sampled.with_columns(
#     pl.Series(name = "retrieved_doc", values = retrieved_docs_json),
#     pl.Series(name = "llm_answers", values = llm_answers))
# df_hybrid_search.write_csv("../data/hybrid_search_resuls.csv")

In [197]:
print(f"MRR: {mrr(relevance_total)}")

MRR: 0.8100277777777772


In [198]:
cos_similarity = cosine_similarity(ground_truth_answers, llm_answers)
print(f"The cosine similarity is {round(cos_similarity, 4)}")

The cosine similarity is 0.7101
