In [1]:
import html
import uuid

import ipywidgets as widgets
import pandas as pd
from IPython.display import display, HTML
from qdrant_client import models, QdrantClient
from qdrant_client.http import models
from sentence_transformers import SentenceTransformer

from utils.document_parser import get_chunks

In [2]:
# Sentence-embedding model used to encode both the document chunks (ingest
# cell) and the search queries (search form).
encoder = SentenceTransformer('all-MiniLM-L6-v2')
# Model names noted by the author (presumably candidates to swap in — confirm):
# all-MiniLM-L6-v2
# all-mpnet-base-v2


In [3]:
# Source documents to index. Each entry is handed to `get_chunks`, which
# receives the whole dict ("url" plus a "type" of either "gdoc" or "md").
# Every URL appears exactly once: a repeated entry would only make the ingest
# loop fetch, chunk and embed the same document a second time.
docs = [
    {
        "url": "https://docs.google.com/document/d/1SxTsELQQafYLNXU7q4dcgR-1K0XbX29i",
        "type": "gdoc",
    },
    {
        "url": "https://github.com/virtual-labs/app-exp-create-web/blob/master/docs/developer-doc.md",
        "type": "md",
    },
    {
        "url": "https://github.com/virtual-labs/app-exp-create-web/blob/master/docs/user-doc.md",
        "type": "md",
    },
    {
        "url": "https://github.com/virtual-labs/app-vlabs-pwa/blob/main/docs/tech_guide.md",
        "type": "md",
    },
    {
        "url": "https://github.com/virtual-labs/app-vlabs-pwa/blob/main/docs/user_manual.md",
        "type": "md",
    },
    {
        "url": "https://docs.google.com/document/d/1lGm88N-Z6fQM6v04k9NZTd-STZ0XYV6YRwIYT1JiSP8/",
        "type": "gdoc",
    },
]

In [4]:
# Qdrant client opened with the ":memory:" location.
# NOTE(review): this presumably keeps the index inside the kernel process only,
# so it has to be rebuilt after a restart — confirm against the qdrant-client docs.
qdrant = QdrantClient(":memory:")

In [5]:
# Name of the collection that stores the document-chunk vectors.
collection_name = "my_doc"

# (Re)create the collection. The vector size is read from the encoder so it
# follows whichever embedding model is loaded.
# NOTE(review): dot-product scoring ranks like cosine only when the embeddings
# are unit-length — confirm the chosen model normalises its output.
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        distance=models.Distance.DOT,
        size=encoder.get_sentence_embedding_dimension(),
    ),
)

True

In [6]:
# Ingest every source document: chunk it, drop any chunks previously stored
# for the same page, then embed and upsert the fresh chunks.
for doc in docs:
    chunks = get_chunks(doc)
    # Skip documents that produced nothing (empty list or None) instead of
    # failing on len()/indexing.
    if not chunks:
        continue

    # Remove earlier points of this page so re-running the cell does not
    # accumulate duplicates.
    # Assumes all chunks of one document share the page_title of the first
    # chunk — TODO confirm against get_chunks.
    page_title = chunks[0]["payload"]["page_title"]
    qdrant.delete(
        collection_name=collection_name,
        points_selector=models.FilterSelector(
            filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="page_title",
                        match=models.MatchValue(value=page_title),
                    ),
                ],
            )
        ),
    )

    # Encode all chunk texts in one batched call rather than one call per
    # chunk; the result has one row per chunk.
    vectors = encoder.encode([chunk["content"] for chunk in chunks]).tolist()
    payloads = [chunk["payload"] for chunk in chunks]
    # Random point ids: the upper 64 bits of a UUID4, so each id fits in an
    # unsigned 64-bit integer.
    ids = [uuid.uuid4().int >> 64 for _ in chunks]

    qdrant.upsert(
        collection_name=collection_name,
        points=models.Batch(
            ids=ids,
            payloads=payloads,
            vectors=vectors,
        ),
    )

[
    {
        "content": "Background  of CMS - Experiments :: CMS - Experiments :: All experiments in VLab are made/ stored in form of git repository (Link). Each experiment contains a predefined template constituting components such as Aim, Procedure, Theory etc. which are stored as markdown files in `. /experiment` folder. `Pretest` and `Posttest` are stored in Json file as they have question/answer format. This experiment folder is then converted into a static site by another program (Link)",
        "payload": {
            "page_title": "CMS - Experiments",
            "heading": "Background",
            "text": "Background  of CMS - Experiments :: CMS - Experiments :: All experiments in VLab are made/ stored in form of git repository (Link). Each experiment contains a predefined template constituting components such as Aim, Procedure, Theory etc. which are stored as markdown files in `. /experiment` folder. `Pretest` and `Posttest` are stored in Json file as they have question

In [7]:
# Sample search queries, numbered for easy reference.
# NOTE(review): nothing in the visible cells reads this list; presumably it is
# kept for manual experiments with the search form — confirm.
queries = [
    "what are future update and releases for content development platform",  # 0
    "how to modify experiment after creating it",  # 1
    "getting authentication error in content development platform",  # 2
    "how do i search experiments in pwa",  # 3
    "how do i apply filters in vlabs pwa",  # 4
    "how do i apply filters on search in vlabs pwa",  # 5
    "what is google's revenue in 2022",  # 6
    "what is my name",  # 7
    "Virtual Labs Content Development Platform - Developer Document",  # 8
    "what was the problem which lead to cms development",  # 9
    "tell about shauryagarh in board game",  # 10
]


In [8]:
# --- Search form widgets -------------------------------------------------
# Free-text query box and the maximum number of hits to request.
query_input = widgets.Text(
    description="Search",
    placeholder="Enter your query...",
)
limit_input = widgets.Text(
    description="Limit",
    placeholder="Enter limit...",
    value="10",
)

# Action buttons; their click handlers are attached further down.
search_button = widgets.Button(description="Search")
clear_button = widgets.Button(description="Clear")

# Document-type filter; 'Any' means no filter is applied.
filter_options = ['Any', 'md', 'gdoc']
filter_dropdown = widgets.Dropdown(
    description='Doc Type:',
    options=filter_options,
)
# Last filter value used by a search; set by perform_search.
selected_filter = None

# Output area that receives the rendered result table.
results_table = widgets.Output()

def perform_search(b):
    """Click handler for the Search button: query Qdrant and render the hits.

    Reads the query text, result limit and document-type filter from the
    module-level widgets, runs a vector search against ``collection_name``
    and shows the hits as an HTML table inside ``results_table``.

    Args:
        b: The argument supplied by the button's ``on_click`` callback; unused.

    Side effects:
        Updates the module-level ``selected_filter`` and replaces the
        contents of ``results_table``. Does nothing when the query is blank.
    """
    global selected_filter

    query = query_input.value.strip()
    if not query:
        return
    selected_filter = filter_dropdown.value

    # The limit comes from a free-text box; an empty or non-numeric value
    # would otherwise raise inside the widget callback.
    try:
        limit = int(limit_input.value)
    except (TypeError, ValueError):
        limit = 0
    if limit < 1:
        results_table.clear_output()
        with results_table:
            print("Limit must be a positive integer.")
        return

    # Restrict by document type on the Qdrant side; 'Any' disables the filter.
    query_filter = None
    if selected_filter != 'Any':
        query_filter = models.Filter(must=[
            models.FieldCondition(key="type", match=models.MatchValue(value=selected_filter)),
        ])

    hits = qdrant.search(
        collection_name=collection_name,
        query_vector=encoder.encode(query).tolist(),
        limit=limit,
        query_filter=query_filter,
    )

    search_results = []
    for hit in hits:
        payload = hit.payload or {}
        # Judging by the sample chunk output, stored text has the form
        # "<heading> of <title> :: <title> :: <body>". maxsplit=2 keeps any
        # "::" that occurs inside the body, and [-1] avoids an IndexError when
        # the prefix is missing.
        body = str(payload.get("text", "")).split("::", 2)[-1].strip()
        url = html.escape(str(payload.get("url", "")), quote=True)
        # The table is rendered with escape=False (needed for the link and
        # <br> markup), so every document-derived string is escaped here.
        search_results.append({
            "type": html.escape(str(payload.get("type", ""))),
            "url": f'<a href="{url}">Visit Link</a>',
            "score": hit.score,
            "heading": html.escape(str(payload.get("heading", ""))),
            "document": html.escape(str(payload.get("page_title", ""))),
            "text": html.escape(body).replace("\n", "<br>"),
        })

    # No Python-side type filtering is needed: query_filter already applied
    # the same condition inside Qdrant.
    results_table.clear_output()
    with results_table:
        if search_results:
            display(HTML(pd.DataFrame(search_results).to_html(escape=False)))
        else:
            print("No results found.")

def perform_clear(b):
    """Click handler for the Clear button: empty the results output area.

    ``b`` is the argument supplied by the button's ``on_click`` callback; it
    is not used.
    """
    results_table.clear_output() 

# Attach the click handlers to their buttons.
clear_button.on_click(perform_clear)
search_button.on_click(perform_search)

# Stack the inputs, the buttons and the result area vertically, then show the form.
form = widgets.VBox(
    children=[
        query_input,
        limit_input,
        filter_dropdown,
        search_button,
        clear_button,
        results_table,
    ]
)

display(form)

VBox(children=(Text(value='', description='Search', placeholder='Enter your query...'), Text(value='10', descr…