In [None]:
import nest_asyncio
# Patch asyncio so an event loop can be re-entered from inside the notebook's
# already-running loop. NOTE(review): presumably needed by the async loaders
# used in later cells -- confirm which call actually requires it.
nest_asyncio.apply()

In [None]:
from tqdm.auto import tqdm

In [None]:
import os

from llama_index.readers.github import GithubRepositoryReader, GithubClient

# Read the token from the environment instead of hard-coding it in the
# notebook, so the secret never ends up in a saved/committed cell. This matches
# how the Snowflake credentials are supplied further down. A missing variable
# raises KeyError here rather than failing later during the GitHub fetch.
github_token = os.environ["GITHUB_TOKEN"]

# `client` and `github_client` are two names for the same client object.
client = github_client = GithubClient(github_token=github_token, verbose=False)

# Reader restricted to Markdown files under the `content` directory of the
# streamlit/docs repository.
reader = GithubRepositoryReader(
    github_client=github_client,
    owner="streamlit",
    repo="docs",
    use_parser=False,
    verbose=True,
    filter_directories=(
        ["content"],
        GithubRepositoryReader.FilterType.INCLUDE,
    ),
    filter_file_extensions=(
        [".md"],
        GithubRepositoryReader.FilterType.INCLUDE,
    ),
)

In [None]:
# Load the files selected by the reader's directory/extension filters from the
# repository's `main` branch.
documents = reader.load_data(branch="main")

In [None]:
import re

def clean_up_text(content: str) -> str:
    """
    Remove unwanted characters and patterns in text input.

    The steps run in this order:

    1. Re-join words that were hyphenated across a line break.
    2. Drop backslash-prefixed slug entries (``\\slug: <value>``) entirely.
    3. Delete the literal markers ``---\\nvisible: false``, ``---``, ``#``
       and ``slug:``.
    4. Collapse every run of whitespace into a single space.

    The result is not stripped, so a leading or trailing space may remain.

    :param content: Text input.

    :return: Cleaned version of original text input.
    """

    # Fix hyphenated words broken by newline
    content = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)

    # Remove all slugs starting with a \ and stopping at the first space.
    # This has to run BEFORE the literal 'slug:' marker is deleted below;
    # otherwise the marker is already gone and this pattern can never match.
    content = re.sub(r'\\slug: [^\s]*', '', content)

    # These are plain literals, not regular expressions, so use str.replace.
    # The longer '---\nvisible: false' must come before the bare '---'.
    unwanted_patterns = ['---\nvisible: false', '---', '#', 'slug:']
    for pattern in unwanted_patterns:
        content = content.replace(pattern, "")

    # normalize whitespace
    return re.sub(r'\s+', ' ', content)

In [None]:
# Clean each loaded document's text. The documents are modified in place, so
# `cleaned_documents` holds the very same objects as `documents`.
cleaned_documents = []

for d in documents:
    d.text = clean_up_text(d.text)
    cleaned_documents.append(d)

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SemanticSplitterNodeParser

# Embedding model shared by the semantic splitter and the Pinecone pipeline.
embed_model = HuggingFaceEmbedding(model_name="Snowflake/snowflake-arctic-embed-m")

# Semantic chunker driven by the embedding model above.
splitter = SemanticSplitterNodeParser(
    embed_model=embed_model,
    buffer_size=1,
    breakpoint_percentile_threshold=85,
)

# Pinecone

In [None]:
import os

from pinecone import Pinecone, ServerlessSpec

# Read the API key from the environment instead of hard-coding it in the
# notebook, so the secret never ends up in a saved/committed cell. This matches
# how the Snowflake credentials are supplied further down.
pinecone_api_key = os.environ["PINECONE_API_KEY"]
pc = Pinecone(api_key=pinecone_api_key)

index_name = "streamlit-docs"

# Create the index only on the first run so re-running the cell is harmless.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): must equal the output size of `embed_model`
        # (assumed to be 768 for snowflake-arctic-embed-m) -- confirm.
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

pinecone_index = pc.Index(index_name)

In [None]:
from llama_index.vector_stores.pinecone import PineconeVectorStore

# Wrap the Pinecone index handle in a LlamaIndex vector store object.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [None]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex

# Pinecone ingestion: chunk the cleaned documents with the semantic splitter,
# embed each chunk, and hand the result to the Pinecone-backed vector store.
pipeline = IngestionPipeline(
    transformations=[
        splitter,
        embed_model,
    ],
    vector_store=vector_store,
)

# Keep the returned nodes in a variable (as the Cortex Search cell does with
# `res`) instead of leaving the call as the cell's last bare expression, which
# would make the notebook render the entire node list as cell output.
pinecone_nodes = pipeline.run(show_progress=True, documents=cleaned_documents)

# Cortex search

In [None]:
from llama_index.core.ingestion import IngestionPipeline

# Chunk-only pipeline: no embedding step and no vector store are configured
# here; the resulting nodes are kept in `res` for the cells that follow.
cortex_search_pipeline = IngestionPipeline(transformations=[splitter])

res = cortex_search_pipeline.run(
    documents=cleaned_documents,
    show_progress=True,
)

In [None]:
import numpy as np

# Fraction of chunks whose whitespace-separated word count exceeds 385, used
# as a rough stand-in for "more than 512 tokens".
oversized_chunk_share = np.mean([len(curr.text.split()) > 385 for curr in res])
print(f"Roughly the proportion of chunks that are bigger than 512 tokens (approx 385 English words): {oversized_chunk_share}")

In [None]:
import os
import snowflake.connector
from tqdm.auto import tqdm

# Connection credentials come from the environment; database and schema are
# fixed for this demo.
conn = snowflake.connector.connect(
    user=os.environ["SF_USER"],
    password=os.environ["SF_PASSWORD"],
    account=os.environ["SF_ACCOUNT"],
    warehouse=os.environ["SF_WAREHOUSE"],
    database="dkurokawa",
    schema="trulens_demo",
)

# Number of rows sent per INSERT round trip.
INSERT_BATCH_SIZE = 1000

# One parameter tuple per chunk, as the DB-API expects a sequence of params.
rows = [(node.text,) for node in res]

# Use a single cursor for the whole load and let the context manager close it,
# instead of opening a new, never-closed cursor for every row. `conn` itself is
# left open so that later cells can keep using it.
with conn.cursor() as cur:
    # Recreates the table on every run, so re-running the cell does not
    # accumulate duplicate rows.
    cur.execute("CREATE OR REPLACE TABLE streamlit_docs(doc_text VARCHAR)")

    with tqdm(total=len(rows)) as progress:
        for start in range(0, len(rows), INSERT_BATCH_SIZE):
            batch = rows[start:start + INSERT_BATCH_SIZE]
            # Batched insert: far fewer round trips than one INSERT per chunk.
            cur.executemany("INSERT INTO streamlit_docs VALUES (%s)", batch)
            progress.update(len(batch))