# Instructions

1. In `Notebook Settings > External Access`, turn on `ST_ASSISTANT_EXTERNAL_INTEGRATIONS`
1. Try running the notebook once to make sure it works
1. If everything works, click the calendar icon to schedule this notebook to rerun every day.

In [None]:
import json
import pandas as pd
import re
import requests
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter, RecursiveJsonSplitter
from packaging import version

from snowflake.snowpark.context import get_active_session
# Active Snowpark session; used by later cells for write_pandas() and the
# sanity-check SQL queries.
session = get_active_session()

In [None]:
-- Run the remaining cells as the st_assistant_pipeline role.
-- NOTE(review): presumably this role owns or can write to the ST_ASSISTANT.PUBLIC chunk tables; confirm.
use role st_assistant_pipeline;

In [None]:
def get_docs_pages_df():
    """Download the Streamlit docs dump and split it into per-page text chunks.

    Fetches ``llms-full.txt`` from docs.streamlit.io, splits it into pages on
    lines that consist solely of ``---``, takes each page's URL from its first
    ``Source: <url>`` line, and breaks the page text into chunks with
    ``RecursiveCharacterTextSplitter`` (default settings).

    Returns:
        A DataFrame with one row per chunk and the columns ``PAGE_URL``
        (``None`` when the page has no ``Source:`` line) and ``PAGE_CHUNK``.
        The columns are present even when no chunks were produced.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    PAGE_SEP_RE = re.compile(r"^---$", flags=re.MULTILINE)
    URL_RE = re.compile(r"^Source: (.*)$", flags=re.MULTILINE)

    docs_url = "https://docs.streamlit.io/llms-full.txt"

    # Bound the wait and fail loudly on HTTP errors: without this, a hung
    # connection blocks the scheduled run forever and an error page would be
    # chunked as if it were documentation.
    response = requests.get(docs_url, timeout=60)
    response.raise_for_status()
    page_strs = PAGE_SEP_RE.split(response.text)

    text_splitter = RecursiveCharacterTextSplitter()
    page_table_rows = []

    for page_str in page_strs:
        # Only the first "Source:" line of a page identifies it.
        url_match = URL_RE.search(page_str)
        page_url = url_match[1] if url_match else None

        for chunk in text_splitter.split_text(page_str):
            page_table_rows.append(
                dict(
                    PAGE_URL=page_url,
                    PAGE_CHUNK=chunk,
                )
            )

    # Naming the columns keeps the frame's shape stable when there are no rows.
    return pd.DataFrame(page_table_rows, columns=["PAGE_URL", "PAGE_CHUNK"])

# Built here and loaded into STREAMLIT_DOCS_PAGES_CHUNKS by a later cell.
docs_pages_df = get_docs_pages_df()

In [None]:
def update_dict_with_latest_streamlit_version(docstrings_dict):
    """Add a ``"latest"`` entry that aliases the newest version's docstrings.

    Every key of ``docstrings_dict`` is parsed with ``packaging.version``;
    keys that are not valid version strings are ignored. The value stored
    under the highest version is then also stored under ``"latest"``. The
    dict is modified in place and the detected version is printed.

    Args:
        docstrings_dict: Mapping whose keys are version strings.

    Raises:
        ValueError: If none of the keys is a valid version string.
    """
    parsed_by_key = {}

    for v_str in docstrings_dict:
        try:
            parsed = version.parse(v_str)
        except version.InvalidVersion:
            continue

        parsed_by_key[v_str] = parsed

    if not parsed_by_key:
        raise ValueError("No valid version keys found in docstrings dict")

    # Look the entry up by its ORIGINAL key. str(Version) is the normalized
    # form (e.g. "v1.2.0" or "1.02.0" become "1.2.0"), so it is not guaranteed
    # to match the key as it appears in the dict.
    latest_key = max(parsed_by_key, key=parsed_by_key.get)
    latest_version = parsed_by_key[latest_key]
    docstrings_dict["latest"] = docstrings_dict[latest_key]

    print("Detected latest Streamlit version as ", latest_version)


def get_docstrings_df():
    """Download Streamlit's docstrings JSON and split it into chunks.

    Fetches ``streamlit.json`` from the streamlit/docs repository, adds a
    ``"latest"`` alias via ``update_dict_with_latest_streamlit_version``, and
    splits every command's docstring object with ``RecursiveJsonSplitter``.
    The JSON is iterated as version -> command name -> docstring object.

    Returns:
        A DataFrame with one row per chunk and the columns
        ``STREAMLIT_VERSION``, ``COMMAND_NAME`` and ``DOCSTRING_CHUNK``.
        The columns are present even when no chunks were produced.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    json_splitter = RecursiveJsonSplitter()

    url = "https://raw.githubusercontent.com/streamlit/docs/refs/heads/main/python/streamlit.json"

    # Bound the wait and surface HTTP errors as such, instead of hanging or
    # failing later with a confusing JSON parse error on an error page.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    docstrings_dict = json.loads(response.text)

    # Must run before the loop so the "latest" alias is emitted as its own
    # STREAMLIT_VERSION value.
    update_dict_with_latest_streamlit_version(docstrings_dict)

    docstrings_table_rows = []

    for st_version, version_docs in docstrings_dict.items():
        for command_name, command_docstring_obj in version_docs.items():
            for chunk in json_splitter.split_text(command_docstring_obj):
                docstrings_table_rows.append(
                    dict(
                        STREAMLIT_VERSION=st_version,
                        COMMAND_NAME=command_name,
                        DOCSTRING_CHUNK=chunk,
                    )
                )

    # Naming the columns keeps the frame's shape stable when there are no rows.
    return pd.DataFrame(
        docstrings_table_rows,
        columns=["STREAMLIT_VERSION", "COMMAND_NAME", "DOCSTRING_CHUNK"],
    )

# Built here and loaded into STREAMLIT_DOCSTRINGS_CHUNKS by a later cell.
docstrings_df = get_docstrings_df()

In [None]:
-- Empty both chunk tables before the next cell reloads them with write_pandas().
-- NOTE(review): truncate-then-write is not atomic; if the write cell fails,
-- the tables stay empty until the next successful run. Confirm that is acceptable.
TRUNCATE TABLE ST_ASSISTANT.PUBLIC.STREAMLIT_DOCSTRINGS_CHUNKS;
TRUNCATE TABLE ST_ASSISTANT.PUBLIC.STREAMLIT_DOCS_PAGES_CHUNKS;

In [None]:
# Load each freshly built DataFrame into its chunk table in ST_ASSISTANT.PUBLIC.
chunk_tables = (
    (docs_pages_df, "STREAMLIT_DOCS_PAGES_CHUNKS"),
    (docstrings_df, "STREAMLIT_DOCSTRINGS_CHUNKS"),
)

for chunks_df, target_table in chunk_tables:
    session.write_pandas(
        chunks_df,
        database="ST_ASSISTANT",
        schema="PUBLIC",
        table_name=target_table,
    )

# Kept as the cell's final expression so it is what the cell evaluates to
# once both writes have finished.
"Done!"

In [None]:
st.write("# Quick check")

# For each chunk table, show its row count and a sample of up to 100 rows.
for table in ("STREAMLIT_DOCS_PAGES_CHUNKS", "STREAMLIT_DOCSTRINGS_CHUNKS"):
    st.write(f"## Table `{table}`")

    count_query = session.sql(f"SELECT COUNT(1) FROM ST_ASSISTANT.PUBLIC.{table}")
    row_count = count_query.to_pandas().iat[0, 0]
    st.metric("Number of rows", row_count)

    st.write("#### Data sample")
    sample_query = session.sql(f"SELECT * FROM ST_ASSISTANT.PUBLIC.{table} LIMIT 100")
    st.write(sample_query.to_pandas())