# Qdrant Vector Database Creation

In [None]:
import os
import requests

from getpass import getpass
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain_community.vectorstores import Qdrant
from llama_cpp import Llama
#import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
from qdrant_client import QdrantClient

## GitHub Documents

In [None]:
# Prompt for the GitHub Personal Access Token without echoing it to the screen;
# the fetch helpers below send it in the Authorization header.
ACCESS_TOKEN = getpass("GitHub Personal Access Token: ")

In [1]:
#@limits(calls=, period=60)
def fetch_file(file_url, timeout=30):
    """
    Download a single file with an authenticated GET request.

    The request asks for the raw media type and authenticates with the
    module-level ``ACCESS_TOKEN``.

    Args:
        file_url: URL of the file to download.
        timeout: Seconds to wait for the server before giving up (passed
            straight to ``requests.get``). Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The ``requests.Response`` of the successful request.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    headers = {
        'Accept': 'application/vnd.github.v3.raw',
        'Authorization': f'token {ACCESS_TOKEN}',
    }
    response = requests.get(file_url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Fail loudly instead of returning an error page
    return response

#@limits(calls=100, period=60)
def fetch_folder(base_url, headers, timeout=30):
    """
    Perform a GET request against ``base_url`` and return the response.

    Args:
        base_url: URL to request (the caller passes a GitHub contents API URL).
        headers: HTTP headers to send with the request.
        timeout: Seconds to wait for the server before giving up (passed
            straight to ``requests.get``). Without a timeout a stalled
            connection would block the notebook indefinitely.

    Returns:
        The ``requests.Response`` of the successful request.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(base_url, headers=headers, timeout=timeout)
    response.raise_for_status()  # This will raise an error for failed requests
    return response


def fetch_and_process_rst_files(repo, branch, path):
    """
    Recursively fetch and process RST files from a GitHub repository.

    Args:
        repo: Repository in ``owner/name`` form (e.g. ``'boto/boto3'``).
        branch: Branch (or other git ref) to read from.
        path: Path inside the repository to start from.

    Returns:
        A list of ``Document`` objects, one per ``.rst`` file found under
        ``path``. Each document holds the file text as ``page_content`` and
        a ``title`` (derived from the file name) and ``url`` in its metadata.

    Raises:
        requests.HTTPError: If any GitHub request fails.
    """
    rst_suffix = '.rst'
    base_url = f"https://api.github.com/repos/{repo}/contents/{path}?ref={branch}"
    headers = {'Accept': 'application/vnd.github.v3+json', 'Authorization': f'token {ACCESS_TOKEN}'}
    listing = fetch_folder(base_url, headers).json()

    # The contents API answers with a single JSON object (not a list) when
    # `path` names a file; normalise so the loop below handles both shapes.
    if isinstance(listing, dict):
        listing = [listing]

    documents = []
    for entry in listing:
        if entry['type'] == 'dir':  # This is a directory; recurse into it
            documents.extend(fetch_and_process_rst_files(repo, branch, entry['path']))
        elif entry['name'].endswith(rst_suffix):
            file_url = entry['download_url']
            file_response = fetch_file(file_url)
            # Strip only the trailing extension; str.replace would also remove
            # any '.rst' that happens to appear in the middle of the name.
            stem = entry['name'][:-len(rst_suffix)]
            title = stem.replace('_', ' ').title()
            documents.append(
                Document(
                    page_content=file_response.text,
                    metadata={"title": title, "url": file_url},
                )
            )

    return documents
        

In [None]:
# Usage example: which repository, branch and folder to crawl for .rst files
docs_path = "docs"
branch = "develop"
repository = "boto/boto3"

In [None]:
# Crawl the configured repository folder and collect every .rst file as a Document
github_documents = fetch_and_process_rst_files(repository, branch, docs_path)

In [None]:
# Number of documents returned by the crawl
len(github_documents)

In [None]:
# Inspect the first fetched document
github_documents[0]

In [None]:
# Persist the fetched documents as a pickled pandas Series; the
# "Qdrant Creation" section reloads this file with pd.read_pickle.
gh_s = pd.Series(github_documents)
# to_pickle does not create missing parent directories, so make sure it exists.
os.makedirs('resources/data', exist_ok=True)
gh_s.to_pickle('resources/data/boto3_docs.pkl')

In [None]:
# Show the Series contents as a plain list. `gh_df` is only created further
# down (when the pickle is reloaded), so use the Series built in the cell above.
gh_s.to_list()

### Documents Loader

LangChain helps load different documents (.txt, .pdf, .docx, .csv, .xlsx, .json) to feed into the LLM. The Document Loader even allows YouTube audio parsing and loading as part of unstructured document loading.

Once loaded into LangChain, the documents can be pre-processed in different ways, as required by the LLM application.

In [None]:
from langchain_community.document_loaders import DataFrameLoader

In [None]:
# Load the dataframe full of abstracts
# to memory in the form of LangChain Document objects
# NOTE(review): `astro_df` is not defined anywhere in the visible notebook, so
# this cell raises NameError as written. It looks like a leftover from a
# different (astrophysics) notebook — confirm whether it should be removed or
# whether the dataframe needs to be loaded first.
loader = DataFrameLoader(astro_df, page_content_column="abstract") 
astrophysics_abstracts_documents = loader.load()

In [None]:
# Report how many documents the DataFrameLoader cell produced
print("Number of astrophysics papers: ", len(astrophysics_abstracts_documents))

In [None]:
# Only the GitHub documents are used here. Note this is an alias for the same
# list object, not a copy.
all_documents = github_documents
print("Total Number of Documents: ", len(all_documents))

## Qdrant Creation

In [None]:
# Reload the documents pickled earlier in this notebook. Despite the `_df`
# name, the object written there is a pandas Series of Document objects.
# Only unpickle files you created yourself: loading a pickle can execute
# arbitrary code.
gh_df = pd.read_pickle('resources/data/boto3_docs.pkl')

In [None]:
# Name of the collection that will hold the embedded boto3 documentation
qdrant_collection = "boto3_docs"
# On-disk location of the local Qdrant storage
# TODO: Fix the path
qdrant_path = "resources/data/qdrant/usrse_qdrant/"

In [None]:
# Embedding model handed to Qdrant below to vectorize the text chunks
model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# The fetched corpus consists of reStructuredText (.rst) files, so split on
# RST structure rather than on Markdown headings.
# To tune chunking, pass e.g. chunk_size=512, chunk_overlap=0 to from_language().
text_splitter = RecursiveCharacterTextSplitter.from_language(language=Language.RST)
# gh_df is a pandas Series of Documents; hand the splitter a plain list.
texts = text_splitter.split_documents(gh_df.to_list())

print("Number of text chunks: ", len(texts))

In [None]:
print(f"Creating new Qdrant collection '{qdrant_collection}' from {len(texts)} documents")
    
# Embed the text chunks with `model` and load them into a Qdrant vector
# database collection. Because `path` is given, the data is stored on local
# disk under `qdrant_path` (not in the current directory).
qdrant = Qdrant.from_documents(
    documents=texts,
    embedding=model,
    path=qdrant_path,
    collection_name=qdrant_collection,
)

In [None]:
# Set up the retriever for the later steps: MMR (maximal marginal relevance)
# search, returning k=2 documents per query
retriever = qdrant.as_retriever(search_type="mmr", search_kwargs={"k": 2})

In [None]:
# Sample query against the boto3 documentation collection
retriever.invoke("How can I create an SQS queue?")

In [None]:
# NOTE(review): this question is unrelated to the boto3 docs indexed above —
# presumably an off-topic sanity check or a leftover from an astrophysics
# notebook; confirm it is intentional.
retriever.invoke("How can I perform celestial coordinate transformations?")

In [None]:
# Post-processing
def format_docs(docs):
    """
    Merge retrieved documents into one string.

    Takes the ``page_content`` of every document in ``docs`` and joins the
    pieces with a blank line between them.
    """
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

In [None]:
# Retrieve documents for a sample question and print their combined text
print(format_docs(retriever.invoke("How do I create a DynamoDB table?")))

In [None]:
# Drop the notebook's reference to the vector store.
# NOTE(review): presumably done so the local Qdrant storage is released and
# can be opened elsewhere — confirm this is the intent.
qdrant = None