In [9]:

import os

import geopandas as gpd
import numpy as np
# import json

import subprocess
import requests

from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import re

import folium
import csv
import concurrent.futures

import datetime

In [None]:
import dwae_utils

In [None]:
def create_folder (out_path, folder_name='extracted_data'):
    """
    Create a folder in the specified output path.

    Args:
        out_path (str): Output path where the folder will be created.
        folder_name (str, optional): Name of the folder to be created.
            Default is 'extracted_data'.

    Returns:
        str: Path of the folder, whether it was just created or already existed.

    Raises:
        OSError: If the folder cannot be created for any reason other than
            it already existing as a directory (e.g. permission denied, or
            a regular file occupying the path).

    """
    export_path = os.path.join(out_path, folder_name)
    try:
        # Create the folder (and any missing parents)
        os.makedirs(export_path)
    except FileExistsError:
        # Only an existing *directory* is acceptable; a file in the way is a real error
        if not os.path.isdir(export_path):
            raise
        print('Folder already exists', export_path)
    else:
        print(f"Folder created: {export_path}")

    return export_path


def bbox_shp (shp, crs=7844):
    """
    Compute the bounding box of a shapefile in a given CRS.

    Args:
        shp (str): Path to the shapefile.
        crs (int, optional): Coordinate reference system (CRS) code the
            geometry is reprojected to before the bounds are taken.
            Default is 7844.

    Returns:
        tuple: (bbox, crs) where bbox is the list [minx, miny, maxx, maxy]
            of the reprojected data and crs is the code that was passed in.

    """
    # Load the shapefile and reproject it in one go
    reprojected = gpd.read_file(shp).to_crs(crs)

    # total_bounds is ordered (minx, miny, maxx, maxy)
    minx, miny, maxx, maxy = reprojected.total_bounds

    return [minx, miny, maxx, maxy], crs

    
def run_esri2geojson (url, bbox, crs, layer_name, export_path):
    """
    Runs the 'esri2geojson' command-line tool to convert data from an Esri service to GeoJSON format.

    The command is passed to the operating system as an argument list (no
    shell), so URLs or paths containing spaces, '&' or other shell
    metacharacters are handed to the tool unchanged.

    Args:
        url (str): The URL of the Esri service layer.
        bbox (list): Four values [minx, miny, maxx, maxy] used as the
            'geometry' query parameter.
        crs (int or str): CRS identifier sent as the 'inSR' query parameter.
        layer_name (str): Base name (without extension) of the output file.
        export_path (str): Folder containing the 'geojson' sub-folder the
            output file is written to.

    Returns:
        str: Path of the GeoJSON file, '<export_path>/geojson/<layer_name>.geojson'.
            The path is returned even when the tool fails; a message is
            printed in that case and the file may not exist.

    Example:
        url = 'https://services.slip.wa.gov.au/public/rest/services/SLIP_Public_Services/Boundaries/MapServer/2'
        bbox = [115.8444528, -31.98380876, 116.15245686, -31.70508152]
        crs = '4326'
        run_esri2geojson(url, bbox, crs, 'boundaries', 'E:/Scripts/extracted_data')
    """

    geojson_out_path = os.path.join(export_path, 'geojson', f'{layer_name}.geojson')

    # Query parameters forwarded to the service through '-p'
    geometry_str = 'geometry=' + ','.join(str(num) for num in bbox)
    crs_str = f'inSR={crs}'

    # Argument list instead of a shell string: nothing needs quoting
    command = ['esri2geojson', url, '-p', geometry_str, '-p', crs_str, geojson_out_path]

    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except OSError as exc:
        # Raised when the executable cannot be started (e.g. not installed / not on PATH)
        print(f'{url} An error occurred while executing the run_esri2geojson.')
        print(exc)
        return geojson_out_path

    if result.returncode != 0:
        print(f'{url} An error occurred while executing the run_esri2geojson.')
        if result.stderr:
            # Surface the tool's own diagnostics to make failures debuggable
            print(result.stderr.strip())

    return geojson_out_path


def clip_geojson_export_shp (shp, crs,  geojson_out_path, shp_out_path):
    """
    Clip a GeoJSON layer to a shapefile's extent and export the result as a shapefile.

    Args:
        shp (str): Path to the shapefile used as the clipping mask.
        crs (int): CRS code both datasets are reprojected to before clipping.
        geojson_out_path (str): Path of the input GeoJSON file.
        shp_out_path (str): Folder the clipped shapefile is written to. The
            output file reuses the GeoJSON file's base name with a '.shp'
            extension.

    Returns:
        None. Nothing is written when the GeoJSON layer has no features.

    Example:
        clip_geojson_export_shp('path/to/aoi.shp', 7844,
                                'path/to/input.geojson', 'path/to/shp_folder')
    """

    layer_gdf = gpd.read_file(geojson_out_path).to_crs(crs)

    # Nothing to clip or export for an empty layer
    if layer_gdf.empty:
        return

    mask_gdf = gpd.read_file(shp).to_crs(crs)

    # Clip, then truncate column names to the 10 characters shapefiles allow
    clipped_gdf = gpd.clip(layer_gdf, mask_gdf).rename(columns=lambda name: name[:10])

    # Output keeps the GeoJSON base name, swapping the extension for .shp
    stem, _ = os.path.splitext(os.path.basename(geojson_out_path))
    target = os.path.join(shp_out_path, f'{stem}.shp')

    clipped_gdf.to_file(target, driver='ESRI Shapefile')


In [None]:
def retrieve_links (url, cache):
    """
    Retrieve all links found inside <li> elements of a webpage.

    Args:
        url (str): The URL of the webpage.
        cache (dict): Maps a URL to the list of links previously retrieved
            from it. A successful retrieval is stored here so the same page
            is not downloaded twice; failed requests are not cached.

    Returns:
        list: Absolute URLs of the links found on the page, or an empty
            list when the request does not return HTTP 200.

    """

    if url in cache:
        return cache[url]

    # Send a GET request to the URL
    response = requests.get(url)

    # Anything other than HTTP 200 is reported and treated as "no links"
    if response.status_code != 200:
        print(url, "\nRequest failed with status code:", response.status_code)
        return []

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    links = []
    for li in soup.find_all('li'):
        # href=True skips anchors without a target, which would otherwise
        # resolve to the page's own URL
        for a in li.find_all('a', href=True):
            links.append(urljoin(url, a['href']))

    # Remember the result so the cache lookup above can actually hit
    cache[url] = links
    return links

def process_links(url, cache, visited):
    """
    Walk the link tree below a starting URL, depth first.

    Args:
        url (str): The starting URL to process.
        cache (dict): Link cache handed through to retrieve_links.
        visited (set): URLs already handled; updated in place.

    Returns:
        list: Every newly discovered link, each followed immediately by
            the links discovered beneath it. Empty if `url` was already
            visited.

    """

    # Guard: never handle the same page twice
    if url in visited:
        return []
    visited.add(url)

    found = []
    for candidate in retrieve_links(url, cache):
        if candidate in visited:
            continue
        # Record the link itself, then everything reachable from it
        found.append(candidate)
        found += process_links(candidate, cache, visited)

    return found

def check_if_vector (soup):
    """
    Check if the HTML page contains information about a vector geometry type.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object representing the parsed HTML.

    Returns:
        bool: True if the text directly following a <b> tag containing
            'Geometry Type:' mentions 'esriGeometry', False otherwise
            (including when the label is absent or is not followed by text).

    """

    # Find all <b> tags whose text contains the "Geometry Type:" label
    cells = soup.find_all(lambda tag: tag.name == 'b' and 'Geometry Type:' in tag.text)

    for cell in cells:
        sibling = cell.next_sibling
        # The sibling may be missing or be another tag; only plain text
        # can carry the geometry type value
        if isinstance(sibling, str) and 'esriGeometry' in sibling:
            return True

    return False


In [None]:
import threading

# Serialises appends: without it, concurrent workers can each observe
# "file does not exist yet" and each write their own header row.
_CSV_WRITE_LOCK = threading.Lock()


def write_csv (file_path, columns, data):
    """
    Append one row to a CSV file, writing the header row first when needed.

    Args:
        file_path (str): Path of the CSV file; created if it does not exist.
        columns (list): Header row, written only when the file is missing
            or empty.
        data (list): The row of values to append.

    """
    with _CSV_WRITE_LOCK:
        # A missing or zero-length file has no header yet
        needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0

        # newline='' lets the csv module control line endings
        with open(file_path, 'a', newline='') as csvfile:
            writer = csv.writer(csvfile)

            if needs_header:
                writer.writerow(columns)
            # Write the new row of data
            writer.writerow(data)

def info_to_sheets (export_path, soup, layer_name, url, csv_name):
    """
    Write extracted layer information to a CSV file.

    One row is appended to '<export_path>/<csv_name>.csv' with the columns
    Source, Name, Geometry Type, Description, URL and Extraction Date.

    Args:
        export_path (str): Folder containing the CSV file.
        soup (BeautifulSoup): The BeautifulSoup object representing the parsed HTML.
        layer_name (str): The name of the layer; the part before the first
            underscore is recorded as the source.
        url (str): The URL of the layer.
        csv_name (str): File name of the CSV, without the '.csv' extension.

    """

    def _text_after_label (label):
        # Text that directly follows the <b> tag holding `label`; empty
        # when the label is absent or is followed by a tag instead of text.
        tag = soup.find('b', string=label)
        sibling = tag.next_sibling if tag is not None else None
        return sibling.strip() if isinstance(sibling, str) else ''

    geometry_type = _text_after_label('Geometry Type:')

    # The label is matched exactly as given, trailing space included
    description_text = _text_after_label('Description: ')

    # First underscore-separated token of the layer name
    source = layer_name.split('_')[0]

    # Get the current date
    extraction_date = datetime.date.today()

    # Define the file path
    file_path = os.path.join(export_path, f'{csv_name}.csv')

    columns = ['Source', 'Name', 'Geometry Type', 'Description', 'URL', 'Extraction Date']
    data = [source, layer_name, geometry_type, description_text, url, extraction_date]

    write_csv (file_path, columns, data)


def filter_layer_name (soup):
    """
    Build a file-system friendly layer name from a layer's HTML page.

    The name comes from the text after the first ':' in the page's <h2>
    heading. When the page has a 'Parent Layer:' label, the parenthesised
    part of the parent link's text is prefixed to it. Runs of
    non-word characters become underscores and the last two
    underscore-separated tokens are moved to the front.

    Args:
        soup (BeautifulSoup): The BeautifulSoup object representing the parsed HTML.

    Returns:
        str: The reformatted layer name.

    """

    # Everything after the first ':' of the <h2> heading
    heading = soup.find('h2')
    title = heading.text.split(':')[1]

    parent_tag = soup.find('b', string='Parent Layer:')
    if parent_tag:
        # Sub-layer: keep only the text before the first '('
        name = title.split('(')[0].strip()
        parent_anchor = parent_tag.find_next_sibling('a')
        if parent_anchor:
            parent_label = parent_anchor.text.split('(')[1].split(')')[0].strip()
            name = parent_label + ' ' + name
    else:
        # Top-level layer: cut the trailing '(ID...' part and stray ')'
        name = title.split('(ID')[0].replace(')', '').strip()

    # Collapse every run of non-word characters into one underscore
    tokens = re.sub(r'\W+', '_', name).split('_')

    # Rotate the last two tokens to the front
    head = '_'.join(tokens[-2:])
    tail = '_'.join(tokens[:-2])
    return f'{head}_{tail}'


In [None]:
def download_data (url, shp, export_path, crs=7844):
    """
    Download one vector layer as GeoJSON, restricted to a shapefile's bounding box.

    The page at `url` is handled only when it has at least one <li> element
    without links and check_if_vector reports a vector geometry type. For
    such a page a row is appended to 'extracted_data.csv' in `export_path`
    and the layer is fetched with run_esri2geojson.

    Args:
        url (str): URL of the page to inspect.
        shp (str): Path to the shapefile whose bounding box limits the download.
        export_path (str): Folder receiving the CSV row and, in its
            'geojson' sub-folder, the downloaded file.
        crs (int, optional): CRS code used for the bounding box. Default is 7844.

    Returns:
        str or None: Path of the GeoJSON file, or None when the request
            failed or the page is not a downloadable vector layer.

    """
    # Send a GET request to the URL
    response = requests.get(url)

    if response.status_code != 200:
        print(url, "\nRequest failed with status code:", response.status_code)
        return None

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # A page qualifies only if some <li> carries no links at all
    has_plain_item = any(not li.find_all('a') for li in soup.find_all('li'))
    if not (has_plain_item and check_if_vector (soup)):
        return None

    layer_name = filter_layer_name (soup)

    csv_name = 'extracted_data'
    info_to_sheets (export_path, soup, layer_name, url, csv_name)

    bbox, bbox_crs = bbox_shp (shp, crs)

    return run_esri2geojson (url, bbox, bbox_crs, layer_name, export_path)


# Create a function to process each URL
def process_url(url):
    """
    Download the layer at `url` and clip it to the area of interest.

    Reads the module-level `shp`, `export_path`, `crs` and `shp_out_path`.
    The clip step is skipped when the download step returns no path or the
    expected GeoJSON file was not produced.

    Args:
        url (str): URL of the page to process.

    """
    geojson_out_path = download_data(url, shp, export_path)

    # No path, or the converter did not produce the file: nothing to clip
    if not geojson_out_path or not os.path.isfile(geojson_out_path):
        return

    clip_geojson_export_shp(shp, crs, geojson_out_path, shp_out_path)


In [None]:
#MAIN

# Path to the shapefile (.shp) file
shp = r'E:\Scripts\SHP\AOI_sample.shp'
out_path = r'E:\Scripts'
# url_base = 'https://services.slip.wa.gov.au/public/rest/services'

url_base = 'https://services.slip.wa.gov.au/public/rest/services/SLIP_Public_Services/Education/MapServer'
crs=7844

## transfer files to python file
# import utils

#create folders to export

##main folder
export_path = create_folder (out_path)
##geojson
geojson_out_path = create_folder (export_path, 'geojson')
##shp
shp_out_path = create_folder (export_path, 'shp')

#retrieve all links
cache = {}  # Dictionary to cache retrieved links
visited = set()  # Set to track visited URLs

all_links = process_links (url_base, cache, visited)
# Drop every link whose URL contains 'FS/MapServer'
filtered_data = [value for value in all_links if 'FS/MapServer' not in value]

#retrieve bbox
bbox, crs = bbox_shp (shp, crs)

# #download data
# Set the number of threads you want to use
num_threads = 10  # Choose the desired number of threads

# Use ThreadPoolExecutor to execute the function concurrently.
# Each URL is submitted individually so that an exception raised in a
# worker is reported instead of being silently discarded; one failing
# URL does not stop the others.
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    pending = {executor.submit(process_url, link): link for link in filtered_data}
    for finished in concurrent.futures.as_completed(pending):
        error = finished.exception()
        if error is not None:
            print(pending[finished], "\nProcessing failed:", repr(error))



In [None]:
csv_file = r'E:\Scripts\extracted_data\extracted_data.csv'

def remove_duplicate_headings(csv_file):
    """
    Remove repeated header rows and empty rows from a CSV file, in place.

    The first non-empty row is taken as the header. Every later row that
    is identical to it is dropped, as are rows with no non-empty value.
    An empty file is left untouched.

    Args:
        csv_file (str): Path of the CSV file to clean.

    """
    # newline='' is what the csv module expects for both reading and writing
    with open(csv_file, "r", newline="") as file:
        rows = list(csv.reader(file))

    # Remove empty rows
    rows = [row for row in rows if any(row)]
    if not rows:
        return

    headings = rows[0]

    # Keep the header once, then every row that is not a copy of it
    cleaned = [headings]
    cleaned.extend(row for row in rows[1:] if row != headings)

    # Rewrite the file with the cleaned rows
    with open(csv_file, "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerows(cleaned)

In [None]:
# NOTE(review): `clipped` is not defined in any visible cell; it must be a
# GeoDataFrame created earlier in the session. Confirm before running.
gdf_temp = clipped

# Reproject to WGS84 before anything else: folium expects [lat, lon] in
# EPSG:4326, so the centre must come from the reprojected bounds rather
# than from the data's native CRS.
gdf_wgs84 = gdf_temp.to_crs(epsg='4326')

# total_bounds is ordered (minx, miny, maxx, maxy), i.e. lon/lat after reprojection
minx, miny, maxx, maxy = gdf_wgs84.total_bounds

# Create a Folium map centered on the middle of the bounding box
map_center = [np.mean([miny, maxy]),
              np.mean([minx, maxx])]

m = folium.Map(location=map_center, zoom_start=10)

# Convert the GeoPandas data to GeoJSON format
geojson_data = gdf_wgs84.to_json()

# Add the GeoJSON data as a GeoJSON layer to the Folium map
folium.GeoJson(geojson_data).add_to(m)

# Last expression of the cell: renders the map as the cell output
m

In [None]:
# Bare expression: shows the GeoDataFrame as this cell's output
gdf_temp

In [None]:
import requests
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import re

# Starting URL for the crawl. The commented-out assignments are alternative
# starting points; enable exactly one `url = ...` line.

# Single map service:
# url = "https://services.slip.wa.gov.au/public/rest/services/SLIP_Public_Services/Boundaries/MapServer"

# One services folder:
# url = "https://services.slip.wa.gov.au/public/rest/services/SLIP_Public_Services"


# Root of the REST services directory (currently active)
url = 'https://services.slip.wa.gov.au/public/rest/services'

# Geocoder folder:
# url = 'https://services.slip.wa.gov.au/public/rest/services/Geocoder'


def retrieve_links (url):
    """
    Retrieve all links from a webpage, printing diagnostics along the way.

    Every <li> element is printed. Links inside <li> elements are
    collected. When an <li> without links is met on a page that
    check_if_vector identifies as a vector layer, the layer name derived
    from the page is printed and the scan stops.

    Args:
        url (str): The URL of the webpage.

    Returns:
        list: Absolute URLs of the links collected, or an empty list when
            the request does not return HTTP 200.

    """

    # Send a GET request to the URL
    response = requests.get(url)

    if response.status_code != 200:
        print(url, "\nRequest failed with status code:", response.status_code)
        return []

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    links = []

    for li in soup.find_all('li'):

        print(li)

        a_tags = li.find_all('a')
        if a_tags:
            for a in a_tags:
                # Resolve the href against the page URL
                links.append(urljoin(url, a.get('href')))

        # check_if_vector inspects the parsed page, not the URL string
        elif check_if_vector (soup) and url not in links:

            try:
                print('EXECUTE', url)

                # Retrieve the <h2> name ('Layer')
                h2_tag = soup.find('h2')
                layer_name = h2_tag.text.split(':')[1].split('(ID')[0].replace(')', '').strip()

                # Check if the page has a 'Parent Layer' section
                parent_layer_tag = soup.find('b', string='Parent Layer:')
                if parent_layer_tag:
                    layer_name = h2_tag.text.split(':')[1].split('(')[0].strip()
                    parent_layer_link = parent_layer_tag.find_next_sibling('a')
                    if parent_layer_link:
                        parent_layer_name = parent_layer_link.text.split('(')[1].split(')')[0].strip()
                        print(parent_layer_name)
                        layer_name = layer_name + ' ' + parent_layer_name

                # Replace non-alphanumeric characters with underscores
                layer_name = re.sub(r'\W+', '_', layer_name)

                print(layer_name)

                break

            except (AttributeError, IndexError):
                # Missing <h2> or a heading/link text without the expected
                # ':' / '(' separators
                print('except')

    return links



In [None]:
%load_ext autoreload
%autoreload 2

# from dwae_utils import dwae

from dwea import dwae

In [None]:
# Inputs for a run of `dwae` (imported from the `dwea` module in the cell above).

# Path to the shapefile (.shp) file
shp = r'E:\Scripts\SHP\AOI_sample.shp'
# Output folder passed to dwae
out_path = r'E:\Scripts\canning'
# Alternative starting point: the root of the REST services directory
# url_base = 'https://services.slip.wa.gov.au/public/rest/services'

# Starting URL: a single map service
url_base = 'https://services.slip.wa.gov.au/public/rest/services/SLIP_Public_Services/Education/MapServer'
# CRS code handed to dwae
crs=7844
# presumably the number of worker threads dwae uses; verify against dwae's signature
num_threads = 10


dwae (url_base, shp, out_path, crs, num_threads)

In [8]:
# %load_ext autoreload
# %autoreload 2

# from dwea import dwae

# # Path to the shapefile (.shp) file

# shp = r'E:\Scripts\SHP\AOI_sample.shp'
# # shp = r'E:\Scripts\canning\SHP\AOI.shp'
# out_path = r'E:\canning_river'
# # url_base = 'https://services.slip.wa.gov.au/public/rest/services'


# url_base = 'https://services.slip.wa.gov.au/public/rest/services/Landgate_Public_Maps/Map_of_Bush_Fire_Prone_Areas_3/MapServer/1'

# crs=7844
# num_threads = 10


# dwae (url_base, shp, out_path, crs, num_threads)

In [16]:
import concurrent.futures


def retrieve_links (url, cache):
    """
    Retrieve all links from a webpage.
    Args:
        url (str): The URL of the webpage.
        cache (dict): A dictionary to cache previously retrieved links.
            Successful retrievals are stored in it under `url`.
    Returns:
        list: A list of links found on the webpage (empty when the request
            does not return HTTP 200; failures are not cached).
    """

    if url in cache:
        return cache[url]

    # Send a GET request to the URL
    response = requests.get(url)
    # Check if the request was successful (HTTP status code 200)
    if response.status_code != 200:
        print(url, "\nRequest failed with status code:", response.status_code)
        return []

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Collect the target of every anchor nested in an <li>; anchors
    # without an href are skipped because they would resolve to `url` itself
    links = []
    for li in soup.find_all('li'):
        links.extend(urljoin(url, a['href']) for a in li.find_all('a', href=True))

    # Populate the cache so the lookup at the top can hit next time
    cache[url] = links
    return links

def process_links(url, cache, visited):
    """
    Collect every link reachable from a starting URL, fetching pages concurrently.

    Pages are crawled level by level: all pages of one level are fetched
    in parallel by a single thread pool, and their links form the next
    level. `visited` is only touched from the calling thread, so no page
    is processed twice.

    Args:
        url (str): The starting URL to process.
        cache (dict): A dictionary to cache previously retrieved links.
        visited (set): A set to keep track of visited URLs; updated in place.

    Returns:
        list: Every newly discovered link, in breadth-first order. Empty
            if `url` was already visited.

    """

    if url in visited:
        return []
    visited.add(url)  # Mark the current URL as visited

    processed_links = []
    frontier = [url]

    # One pool for the whole crawl instead of a new pool per page
    with concurrent.futures.ThreadPoolExecutor() as executor:
        while frontier:
            next_frontier = []
            # map() yields results in the order of `frontier`, keeping the
            # output deterministic
            pages = executor.map(lambda page: retrieve_links(page, cache), frontier)
            for links in pages:
                for link in links:
                    if link not in visited:
                        visited.add(link)
                        # Record the link itself, then queue it for the next level
                        processed_links.append(link)
                        next_frontier.append(link)
            frontier = next_frontier

    return processed_links

In [17]:
# Starting URL for the crawl: a single map service
url_base = 'https://services.slip.wa.gov.au/public/rest/services/SLIP_Public_Services/Boundaries/MapServer'

#retrieve all links
# Fresh state for every run, so earlier crawls do not suppress results
cache = {}  # Dictionary to cache retrieved links
visited = set()  # Set to track visited URLs

# Uses whichever process_links definition was executed most recently
all_links = process_links (url_base, cache, visited)

In [20]:
def retrieve_links (url, cache):
    """
    Retrieve all links from a webpage.
    Args:
        url (str): The URL of the webpage.
        cache (dict): A dictionary to cache previously retrieved links;
            the result of a successful request is stored under `url`.
    Returns:
        list: A list of links found on the webpage. Empty when the request
            does not return HTTP 200 (such failures are not cached).
    """

    if url in cache:
        return cache[url]

    # Send a GET request to the URL
    response = requests.get(url)
    # Check if the request was successful (HTTP status code 200)
    if response.status_code != 200:
        print(url, "\nRequest failed with status code:", response.status_code)
        return []

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Only anchors that carry an href are followed; one without a target
    # would resolve to the page's own URL
    links = [
        urljoin(url, a['href'])
        for li in soup.find_all('li')
        for a in li.find_all('a', href=True)
    ]

    # Store the result so repeated calls for the same page skip the request
    cache[url] = links
    return links


def process_links(url, cache, visited):
    """
    Collect every link reachable from a starting URL, depth first.

    Args:
        url (str): The starting URL to process.
        cache (dict): A dictionary to cache previously retrieved links.
        visited (set): A set to keep track of visited URLs; updated in place.

    Returns:
        list: Each newly discovered link, immediately followed by the
            links discovered beneath it. Empty if `url` was already visited.

    """

    if url in visited:
        return []
    visited.add(url)

    collected = []

    # Each stack entry iterates over the links of one page still being
    # walked; the top entry is the page currently descended into.
    stack = [iter(retrieve_links(url, cache))]
    while stack:
        for link in stack[-1]:
            if link in visited:
                continue
            # Record the link, then descend into it before continuing
            # with its siblings
            collected.append(link)
            visited.add(link)
            stack.append(iter(retrieve_links(link, cache)))
            break
        else:
            # Current page exhausted: resume its parent
            stack.pop()

    return collected



# def process_links(url, cache, visited):
#     """
#     Recursively process links from a starting URL.

#     Args:
#         url (str): The starting URL to process.
#         cache (dict): A dictionary to cache previously retrieved links.
#         visited (set): A set to keep track of visited URLs.

#     Returns:
#         list: A list of processed links.

#     """

#     if url in visited:
#         return []
#     visited.add(url)  # Mark the current URL as visited
#     links = retrieve_links(url, cache)
#     processed_links = []
  
#     for link in links:
#         if link not in visited:
#             # Process the link recursively
#             processed_links.append(link)
#             processed_links.extend(process_links(link, cache, visited))
#     return processed_links

In [19]:
# Starting URL for the crawl: a single map service
url_base = 'https://services.slip.wa.gov.au/public/rest/services/SLIP_Public_Services/Boundaries/MapServer'

#retrieve all links
# Start from empty state so results of a previous crawl are not reused
cache = {}  # Dictionary to cache retrieved links
visited = set()  # Set to track visited URLs

# Runs the crawl with the process_links definition currently in scope
all_links = process_links (url_base, cache, visited)