In [1]:
# ! pip install pyvisa
# ! pip install toml

In [2]:
%load_ext autoreload
%autoreload 2
import os    
import sys    
import time
import pprint
import json
from bs4 import BeautifulSoup
import requests



# QGIS help page crawler

## How to use?
1. Update the `url` variable using the links in [https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/index.html](https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/index.html), e.g., [https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/qgis/rasteranalysis.html](https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/qgis/rasteranalysis.html).
2. Run the program. 

**Please check the results manually to ensure they are correct!**

In [3]:
import toml
import os
import re 
import io

In [4]:



##### These urls were processed.
# url = "https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/qgis/rasteranalysis.html"
# This page has 156 <section> elements; 33 of them contain an <h2> tag, of which 31 are tool sections.

# url = "https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/qgis/interpolation.html"
# url = "https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/qgis/vectoranalysis.html"
# url = "https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/qgis/vectorgeneral.html"
# url = "https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/qgis/vectorgeometry.html"
# url = "https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/gdal/rasteranalysis.html"
# url = "https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/qgis/plots.html"
# url = "https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/qgis/rasterterrainanalysis.html"
url = "https://docs.qgis.org/3.34/en/docs/user_manual/processing_algs/qgis/vectoroverlay.html"

# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)

# Stop here on a failed download. Merely printing the failure would leave `soup`
# undefined (or stale from an earlier run of this cell), so the following cells
# would fail with a NameError or silently process the wrong page.
if response.status_code != 200:
    raise RuntimeError(f"Failed to retrieve the page. Status code: {response.status_code}")

html_content = response.content
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Print the page title as a quick check that the expected page was fetched.
print(soup.title.get_text())

28.1.23. Vector overlay — QGIS Documentation  documentation


In [5]:
# Collect every <section> element of the parsed page (find_all searches all
# descendants, so nested sections are included), then display how many were found.
raw_sections = soup.find_all('section')
len(raw_sections)

60

In [6]:
# for idx, section in enumerate(sections):
    # print(idx)
    # print(section)

In [7]:
# Keep only the sections that contain an <h2> heading somewhere inside them.
# `find` searches all descendants, so an enclosing section is kept as well and
# reports the first <h2> nested within it.
cnt = 0
sections = []
for idx, section in enumerate(raw_sections):
    section_id = section.get('id')  # Extract the section ID
    tag = section.find('h2')  # First <h2> inside the section, or None
    if tag:
        cnt += 1
        # The message names <h2>, the tag that is actually searched for.
        print(idx, cnt, f"Section with ID '{section_id}' contains an <h2> tag: '{tag.get_text()}'")
        sections.append(section)
len(sections)

0 1 Section with ID 'None' contains an <h3> tag: '28.1.23.1. Clip'
1 2 Section with ID 'vector-overlay' contains an <h3> tag: '28.1.23.1. Clip'
2 3 Section with ID 'clip' contains an <h3> tag: '28.1.23.1. Clip'
6 4 Section with ID 'difference' contains an <h3> tag: '28.1.23.2. Difference'
12 5 Section with ID 'difference-multiple' contains an <h3> tag: '28.1.23.3. Difference (multiple)'
16 6 Section with ID 'extract-clip-by-extent' contains an <h3> tag: '28.1.23.4. Extract/clip by extent'
20 7 Section with ID 'intersection' contains an <h3> tag: '28.1.23.5. Intersection'
26 8 Section with ID 'intersection-multiple' contains an <h3> tag: '28.1.23.6. Intersection (multiple)'
32 9 Section with ID 'line-intersections' contains an <h3> tag: '28.1.23.7. Line intersections'
38 10 Section with ID 'split-with-lines' contains an <h3> tag: '28.1.23.8. Split with lines'
42 11 Section with ID 'symmetrical-difference' contains an <h3> tag: '28.1.23.9. Symmetrical difference'
48 12 Section

13

In [8]:
def get_section_by_heading(section, level, heading):
    """Return the <section> enclosing the first heading whose text contains ``heading``.

    Args:
        section: Parsed element to search in (must offer ``find_all``).
        level: Tag name of the headings to inspect, e.g. ``'h3'``.
        heading: Substring looked for in each heading's text (case-sensitive).

    Returns:
        The result of ``find_parent('section')`` on the first matching heading,
        or ``None`` when no heading matches.
    """
    first_match = next(
        (candidate for candidate in section.find_all(level) if heading in candidate.get_text()),
        None,
    )
    if first_match is None:
        return None
    return first_match.find_parent('section')
    
def extract_table(section):
    """Flatten every table inside ``section`` into a list of row dictionaries.

    For each table, the text of all its ``<th>`` cells is used as the column
    names, the first ``<tr>`` is skipped as the header row, and every remaining
    row becomes one dictionary mapping column name to cell text.

    Args:
        section: Parsed element to search in (must offer ``find_all``).

    Returns:
        A list of dictionaries, one per data row, across all tables in document
        order. Empty when the section holds no tables.
    """
    table_data = []
    for table in section.find_all('table'):
        headers = [th.get_text(strip=True) for th in table.find_all('th')]
        for row in table.find_all('tr')[1:]:  # Skip header row
            cols = row.find_all('td')
            if not cols:
                # A row without data cells carries nothing to record.
                continue
            # zip() pairs headers with the cells that exist, so a row with fewer
            # cells than headers yields a partial record instead of an IndexError.
            table_data.append({
                header: col.get_text(separator=' ', strip=True)
                for header, col in zip(headers, cols)
            })
    return table_data
    
# Helper that squeezes whitespace in a string
def normalize_whitespace(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    # split() with no argument cuts on whitespace runs and drops empty edge pieces,
    # so re-joining with a single space gives the collapsed, trimmed string.
    return ' '.join(text.split())
    
# Collects the paragraphs that precede a given sub-section, keeping spaces around <a> and <code> elements
def extract_paragraphs_before(section, stop_section_id):
    """Return the text of the direct ``<p>`` children of ``section``, in order.

    Only direct children are inspected (``recursive=False``). Iteration stops at
    the first direct ``<h3>`` child whose ``id`` attribute contains
    ``stop_section_id``.

    Args:
        section: Parsed element whose direct children are scanned.
        stop_section_id: Substring of an ``<h3>`` id that ends the scan.

    Returns:
        A list of whitespace-normalized paragraph strings.
    """

    def _inline_text(child):
        # Links and inline code are padded with spaces so they do not fuse with
        # neighbouring words; the padding is squeezed again by normalization.
        if child.name in ('a', 'code'):
            return f" {child.get_text(strip=True)} "
        # Plain text nodes are taken verbatim; any other element contributes its text.
        if isinstance(child, str):
            return child
        return child.get_text(strip=True)

    collected = []
    for node in section.find_all(['p', 'h3'], recursive=False):
        # Stop if we reach the sub-section heading (like "Parameters")
        if node.name == 'h3' and stop_section_id in node.get('id', ''):
            break
        if node.name != 'p':
            continue
        pieces = [_inline_text(child) for child in node.children]
        collected.append(normalize_whitespace(''.join(pieces)))

    return collected
    
def make_parameters_for_TOML(tool_info):
    """Render a tool's parameter table as one text line per parameter.

    Each line has the form ``"<Name>: <Label>. <Description>. Type: <Type>"``,
    with newlines removed from the label and the description.

    Args:
        tool_info: Dictionary whose ``'parameters'`` entry is a list of
            dictionaries with the keys ``'Name'``, ``'Label'``, ``'Description'``
            and ``'Type'``.

    Returns:
        The lines joined with ``"\\n"``; an empty string when there are no parameters.

    Raises:
        KeyError: If ``'parameters'`` or one of the per-parameter keys is missing.
    """
    lines = []
    for parameter in tool_info['parameters']:
        # The newline stripping is done outside the f-string: a backslash inside
        # an f-string expression is a SyntaxError on Python versions before 3.12.
        label = parameter['Label'].replace('\n', '')
        description = parameter['Description'].replace('\n', '')
        lines.append(f"{parameter['Name']}: {label}. {description}. Type: {parameter['Type']}")

    return "\n".join(lines)

def make_outputs_for_TOML(tool_info):
    """Render a tool's output table as one text line per output.

    Each line has the form ``"<Name>: <Label>. <Description>. Type: <Type>"``,
    with newlines removed from the label and the description. An output that
    cannot be rendered (for example one lacking a key) is reported with a
    printed message and skipped.

    Args:
        tool_info: Dictionary whose ``'outputs'`` entry is a list of dictionaries
            with the keys ``'Name'``, ``'Label'``, ``'Description'`` and ``'Type'``.

    Returns:
        The lines joined with ``"\\n"``; an empty string when nothing was rendered.

    Raises:
        KeyError: If ``tool_info`` has no ``'outputs'`` entry.
    """
    lines = []
    for output in tool_info['outputs']:
        try:
            # The newline stripping is done outside the f-string: a backslash inside
            # an f-string expression is a SyntaxError on Python versions before 3.12.
            label = output['Label'].replace('\n', '')
            description = output['Description'].replace('\n', '')
            lines.append(f"{output['Name']}: {label}. {description}. Type: {output['Type']}")
        except Exception as e:
            # Best effort: one malformed row must not discard the other outputs.
            print("Error in make_outputs_for_TOML():", e)

    return "\n".join(lines)
def make_TOML_file(tool_info, fname):
    """Write one tool's documentation to ``fname`` as a TOML document.

    The document holds, in this order: ``tool_ID``, ``tool_name``,
    ``brief_description``, ``full_description`` (the paragraphs joined with
    newlines), ``parameters``, ``outputs`` and an empty ``code_example``.

    Args:
        tool_info: Dictionary with the keys ``'algorithm_id'``, ``'tool_name'``,
            ``'brief_description'``, ``'paragraphs'``, ``'parameters'`` and
            ``'outputs'``.
        fname: Path of the TOML file to create or overwrite.

    Returns:
        Whatever ``toml.dump`` returns for the written document.
    """
    document = {
        'tool_ID': tool_info['algorithm_id'],
        'tool_name': tool_info['tool_name'],
        'brief_description': tool_info['brief_description'],
        'full_description': '\n'.join(tool_info['paragraphs']),
        'parameters': make_parameters_for_TOML(tool_info),
        'outputs': make_outputs_for_TOML(tool_info),
        'code_example': "",
    }
    # UTF-8 is requested explicitly; the original note here records a
    # "'charmap' codec can't encode character" error.
    with open(fname, 'w', encoding='utf-8') as handle:
        return toml.dump(document, handle)
    
def extract_tool_info(section):
    """Collect the documentation of one tool from its ``<section>`` element.

    Args:
        section: Parsed section element of a single tool.

    Returns:
        A dictionary that may contain:

        * ``'brief_description'``, ``'tool_name'``, ``'paragraphs'`` - only set
          when the section contains a ``<p>``;
        * ``'algorithm_id'``, ``'python_code_snippet'`` - ``None`` when no
          "Python code" sub-section exists;
        * ``'parameters'`` / ``'outputs'`` - only set when the matching
          sub-section exists.

        The dictionary is empty when the section has no ``<h2>``. When the
        "Python code" sub-section yields an empty algorithm id, the dictionary
        is returned early without the id, snippet, parameter and output entries.
    """
    tool_info = {}

    h2 = section.find("h2")
    if not h2:
        # No h2 found in this section: nothing to extract.
        return tool_info

    if section.find("p"):
        paragraphs = extract_paragraphs_before(section, 'parameters')
        tool_info['brief_description'] = paragraphs[0] if len(paragraphs) > 0 else ""
        # Heading text minus its final character, then the part after the last '.'.
        # NOTE(review): presumably the dropped character is the heading's
        # permalink glyph - confirm against the page markup.
        heading_text = h2.get_text(strip=True)
        tool_info['tool_name'] = heading_text[:-1].split(".")[-1]
        tool_info['paragraphs'] = paragraphs

    algorithm_id = None
    python_code_snippet = None

    code_section = get_section_by_heading(section, 'h3', 'Python code')
    if code_section:
        # The first <code> element of the sub-section is taken as the algorithm id.
        algorithm_id = code_section.find('code').get_text(strip=True)
        if not algorithm_id:
            return tool_info
        pre_tag = code_section.find('pre')
        if pre_tag:
            python_code_snippet = pre_tag.get_text()

    # store algorithm_id
    tool_info['algorithm_id'] = algorithm_id
    tool_info['python_code_snippet'] = python_code_snippet

    # All tables of the "Parameters" sub-section are merged into one list.
    # (An earlier variant, used only for the Raster analysis page, read the
    # basic and advanced parameter sub-sections separately.)
    parameters_section = get_section_by_heading(section, 'h3', 'Parameters')
    if parameters_section:
        tool_info['parameters'] = extract_table(parameters_section)

    outputs_section = get_section_by_heading(section, 'h3', 'Outputs')
    if outputs_section:
        tool_info['outputs'] = extract_table(outputs_section)

    return tool_info


# Folder that receives one TOML file per tool.
save_dir = r'D:\OneDrive_PSU\OneDrive - The Pennsylvania State University\Research_doc\LLM_Geo\QGIS_plugin\toml'

# Number of tools written so far; only successful writes are counted.
count = 0

# The first two entries of `sections` are skipped.
for section in sections[2:]:
    try:
        tool_info = extract_tool_info(section)
        # ':' in the algorithm id is replaced by '_' to form the file name.
        tool_name = tool_info['algorithm_id'].replace(":", "_")
        fname = os.path.join(save_dir, tool_name + ".toml")
        toml_str = make_TOML_file(tool_info, fname)
        count = count + 1
        print(f"{count} tool_name:", tool_name)
    except Exception as e:
        # Best effort: report the failure and carry on with the next section.
        print("error:", e)

    # if count > 0: 
    #     break

1 tool_name: qgis_clip
2 tool_name: qgis_difference
3 tool_name: qgis_multidifference
4 tool_name: qgis_extractbyextent
5 tool_name: qgis_intersection
6 tool_name: qgis_multiintersection
7 tool_name: qgis_lineintersections
8 tool_name: qgis_splitwithlines
9 tool_name: qgis_symmetricaldifference
10 tool_name: qgis_union
11 tool_name: qgis_multiunion


## CODE SAMPLE INTEGRATION AND REFORMATTING TOML FILE

In [None]:
# ! pip install tomllib
# ! pip install tomli_w
# ! pip install langchain_openai

In [None]:
import asyncio
import os
import re
import tomllib
# import tomli_w
from langchain_openai import ChatOpenAI
import QGIS_tool_creation_Helper

In [None]:
import asyncio
import os
import re
import sys
import tomllib
import tomli_w
from langchain_openai import ChatOpenAI



# Get the directory of the current script. `__file__` is not defined when this
# code runs as a notebook cell, so fall back to the current working directory
# there instead of failing with a NameError.
try:
    current_script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_script_dir = os.getcwd()
# Add the directory to sys.path
if current_script_dir not in sys.path:
    sys.path.append(current_script_dir)

import  QGIS_tool_creation_Helper as Helper


# Obtain the OpenAI key through the project helper module
# (presumably the API key string - confirm in QGIS_tool_creation_Helper).
OpenAI_key = Helper.get_OpenAI_key()


# Folder holding the per-tool TOML files; it is the default `tool_dir` of the
# functions defined below and the directory processed at the end of this cell.
# Tools_Documentation_dir = r"D:\Onedrive\OneDrive - The Pennsylvania State University\PhD Work\SpatialAnalysisAgent_Reasearch\Plugin\toml\TOML_10"
Tools_Documentation_dir = r"D:\Onedrive\OneDrive - The Pennsylvania State University\PhD Work\SpatialAnalysisAgent_Reasearch\Plugin\toml\TOML2_30"

def tool_documentation_collection(tool_ID, tool_dir=Tools_Documentation_dir):
    """Load a tool's TOML file and return its parameters as a numbered list.

    Args:
        tool_ID: Base name (without ``.toml``) of the tool file.
        tool_dir: Directory that contains the tool files.

    Returns:
        A tuple ``(numbered_parameters, algorithm_id)``. ``numbered_parameters``
        holds one line per parameter in the form ``"<n>. <text>\\n"`` with
        numbering starting at 1; ``algorithm_id`` is the file's ``tool_ID``
        value. Both are empty strings when the file does not exist.

    Raises:
        KeyError: If the file lacks the ``parameters`` or ``tool_ID`` key.
    """
    tool_file = os.path.join(tool_dir, f'{tool_ID}.toml')

    # Check if the file exists
    if not os.path.exists(tool_file):
        # Same shape as the normal result, so that callers unpacking two values
        # do not fail with "not enough values to unpack".
        return "", ""

    with open(tool_file, "rb") as f:
        tool = tomllib.load(f)
    tool_parameter_str = tool['parameters']
    algorithm_id = tool['tool_ID']

    tool_parameter_lines = tool_parameter_str.strip().split('\n')
    numbered_tool_parameter_str = ''.join(
        f"{number}. {line.strip(' ')}\n"
        for number, line in enumerate(tool_parameter_lines, start=1)
    )

    return numbered_tool_parameter_str, algorithm_id



def append_code_to_toml(tool_ID, code_sample, tool_dir=Tools_Documentation_dir):
    """Append ``code_sample`` to the ``code_example`` entry of a tool's TOML file.

    When the file already has a ``code_example`` entry, a newline and the sample
    are appended to it; otherwise the entry is created. The file is then
    rewritten with ``tomli_w``. When the file does not exist, a message is
    printed and nothing is written.

    Args:
        tool_ID: Base name (without ``.toml``) of the tool file.
        code_sample: Text to store in or append to ``code_example``.
        tool_dir: Directory that contains the tool files.
    """
    toml_path = os.path.join(tool_dir, f'{tool_ID}.toml')

    # Nothing to update without an existing file.
    if not os.path.exists(toml_path):
        print(f"TOML file for tool {tool_ID} does not exist.")
        return

    # Read the current document.
    with open(toml_path, "rb") as source:
        document = tomllib.load(source)

    # Create the entry on first use, extend it afterwards.
    if 'code_example' not in document:
        document['code_example'] = code_sample
    else:
        document['code_example'] += f"\n{code_sample}"

    # Persist the updated document using tomli_w.
    with open(toml_path, "wb") as target:
        tomli_w.dump(document, target)


def formatting_toml_file(tool_ID, tool_dir=Tools_Documentation_dir):
    """Rewrite a tool's TOML file so multi-line values use triple-quoted strings.

    The file is parsed and written back by the TOML libraries rather than edited
    as text. Text substitution broke in two ways: a value containing an escaped
    double quote was cut at that quote, and un-escaping every ``\\n`` in the file
    put raw newlines into single-line strings that were not converted to
    triple-quoted form (for example ``outputs``), leaving the file unparsable.

    Args:
        tool_ID: Base name (without ``.toml``) of the tool file.
        tool_dir: Directory that contains the tool files.

    Raises:
        FileNotFoundError: If the tool file does not exist.
        tomllib.TOMLDecodeError: If the file is not valid TOML.
    """
    tool_file = os.path.join(tool_dir, f'{tool_ID}.toml')

    with open(tool_file, "rb") as f:
        tool_data = tomllib.load(f)

    # multiline_strings=True makes tomli_w emit strings that contain newlines as
    # triple-quoted blocks, with all necessary escaping handled by the library.
    with open(tool_file, "wb") as f:
        tomli_w.dump(tool_data, f, multiline_strings=True)

def process_toml_files_in_directory(directory):
    """Generate a code sample for every ``.toml`` tool file in ``directory``.

    For each file, the numbered parameter list and the algorithm id are read, a
    prompt is built with the project helper module, the model reply is turned
    into a code sample, and the sample is appended to the file, which is then
    reformatted.

    Args:
        directory: Folder whose ``.toml`` files are processed. The same folder
            is used for reading, updating and reformatting each file.

    NOTE(review): ``asyncio.run`` raises ``RuntimeError`` when an event loop is
    already running, which is the case inside a Jupyter kernel - confirm how
    this function is meant to be executed.
    """
    for file_name in os.listdir(directory):
        if not file_name.endswith(".toml"):
            continue

        # splitext removes only the extension, so dotted base names stay intact.
        tool_file_name = os.path.splitext(file_name)[0]
        # `directory` is passed on explicitly; otherwise the helpers would fall
        # back to their default folder regardless of the argument given here.
        numbered_tool_parameter_str, algorithm_id = tool_documentation_collection(
            tool_ID=tool_file_name, tool_dir=directory
        )
        Tooldoc_prompt_str = Helper.create_CodeSample_prompt(
            tool_doc=numbered_tool_parameter_str, algorithm_id=algorithm_id
        )

        # Generate code sample using OpenAI model
        OpenAI_key = Helper.get_OpenAI_key()

        model = ChatOpenAI(api_key=OpenAI_key, model="gpt-4o", temperature=1)
        CodeSample_prompt_str_chunks = asyncio.run(Helper.fetch_chunks(model, Tooldoc_prompt_str))
        LLM_reply_str = Helper.convert_chunks_to_str(chunks=CodeSample_prompt_str_chunks)

        # Extract Python code and clean it
        fenced_block = re.search(r"```python\s*\n(.*?)```", LLM_reply_str, re.DOTALL)
        if fenced_block:
            # Only the body of the fenced block is kept, without the closing fence.
            sample_code = fenced_block.group(1).strip()
        elif 'python' in LLM_reply_str:
            sample_code = LLM_reply_str.split('python', 1)[1].strip()
        else:
            # Without this branch `sample_code` would be unbound, or would still
            # hold the previous tool's code and be appended to the wrong file.
            print(f"No Python code found in the model reply for {tool_file_name}.toml; file left unchanged.")
            continue
        sample_code = sample_code.replace("\\n", "\n")

        # Append the sample code to the TOML file
        append_code_to_toml(tool_ID=tool_file_name, code_sample=sample_code, tool_dir=directory)

        # Reformat the TOML file for multiline strings
        formatting_toml_file(tool_ID=tool_file_name, tool_dir=directory)

        print(f"Processed file: {tool_file_name}.toml")

# Run the processing function for all TOML files in the directory.
# This runs at cell execution and makes one model request per TOML file found.
process_toml_files_in_directory(Tools_Documentation_dir)