In [13]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json


# Extract with URL

In [3]:
def write_to_txt(content, output_file):
    with open(output_file, "w") as text_file:
        text_file.write(content)

In [78]:
def extract_text_with_url(url):
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the content of the request with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract and print all text content
        text = soup.get_text(separator=' ', strip=True)
        write_to_txt(text, "output.txt")
    else:
        print(f'Failed to retrieve the webpage. Status code: {response.status_code}')

In [5]:
extract_text_with_url('https://www.xbrl.org')

# Extract from website

In [99]:
internal_urls = set()

In [100]:

visited_urls = set()

In [101]:
def is_valid(url, allow_external=True):
    # parsed = urlparse(url)
    # return bool(parsed.netloc) and bool(parsed.scheme)

    parsed_url = urlparse(url)
    if allow_external:
        # Define allowed external domains
        allowed_domains = ['xbrl.org', 'specifications.xbrl.org']
        return any(parsed_url.netloc.endswith(domain) for domain in allowed_domains)
    else:
        return bool(parsed_url.netloc) and bool(parsed_url.scheme)

In [102]:
def get_all_website_links(url):
    try:
        urls = set()
        domain_name = urlparse(url).netloc
        session = requests.Session()
        response = session.get(url)
        if response.status_code != 200:
            return urls  # Return empty set if failed to fetch the page
        soup = BeautifulSoup(response.text, 'html.parser')
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if href == "" or href is None:
                continue
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if not is_valid(href):
                continue
            if href in internal_urls:
                continue
            if domain_name not in href:
                continue
            urls.add(href)
            internal_urls.add(href)
        return urls
    except Exception as e:
        print(f"Error processing error")
        return None  # Return None to skip this URL due to parsing errors
        

In [103]:
def extract_details(url):
    try:
        response = requests.get(url)
        if response.status_code != 200:
            print(f'Failed to retrieve the webpage. URL: {url}, Status code: {response.status_code}')
            return None  # Return None if the response is not successful
        soup = BeautifulSoup(response.text, 'html.parser')
        title = soup.find('title').text if soup.find('title') else ''
        text = soup.get_text(separator=' ', strip=True)
        return {"title": title, "url": url, "text": text}
    except Exception as e:
        print(f"Error processing {url}: {str(e)}")
        return None  # Return None to skip this URL due to parsing errors


In [104]:
def crawl(url, results, max_depth=5):
    if url in visited_urls or max_depth <= 0:  # Check if URL has already been visited or depth limit reached
        return
    visited_urls.add(url)  # Mark the URL as visited
    print(f"Visiting: {url}")
    details = extract_details(url)  # Extract details only if URL has not been visited before
    if details:
        details['id'] = len(results)  # Assign an ID based on the current length of the results list
        results.append(details)
    links = get_all_website_links(url)
    if links is not None:
        for link in links:
            crawl(link, results, max_depth - 1)  # Recurse into found links


In [105]:
results = []

In [58]:
start_url = "https://www.xbrl.org"
crawl(start_url, results, max_depth=5)

Visiting: https://www.xbrl.org
Visiting: https://www.xbrl.org/tag/eu/
Visiting: https://www.xbrl.org/home/tags/
Visiting: https://www.xbrl.org/events/data-amplified-2024/
Visiting: https://www.xbrl.org/news/esrs-taxonomy-outreach/
Visiting: https://www.xbrl.org/events/
Visiting: https://www.xbrl.org/news/ifrs-foundation-set-to-publish-second-digital-taxonomy-this-month/
Visiting: https://www.xbrl.org/news/enhancing-financial-transparency-with-digital-financial-reporting/
Visiting: https://www.xbrl.org/news/digital-signatures-at-data-amplified-2023/
Visiting: https://www.xbrl.org/events/29th-eurofiling-conference/
Visiting: https://www.xbrl.org/news/investors-take-note-iasb-outlines-acquisitions-update/
Visiting: https://www.xbrl.org/tag/iasb/
Visiting: https://www.xbrl.org/news/efrag-issues-a-draft-comment-letter-on-climate-related-uncertainties-in-financial-statements/
Visiting: https://www.xbrl.org/news/sec-launches-2024-cybersecurity-disclosure-taxonomy/
Visiting: https://www.xbrl.o

  k = self.parse_starttag(i)


Visiting: https://www.xbrl.org/news/what-does-good-disclosure-look-like/
Visiting: https://www.xbrl.org/tag/connectivity/
Visiting: https://www.xbrl.org/news/efrag-publishes-paper-on-enhancing-connectivity-and-reporting-boundaries-in-annual-reports/
Visiting: https://www.xbrl.org/tag/agenda/
Visiting: https://www.xbrl.org/news/international-standard-setters-unite-common-concerns-on-issbs-agenda-consultation/
Visiting: https://www.xbrl.org/tag/integrated/
Visiting: https://www.xbrl.org/news/issb-and-iasb-navigate-the-path-to-connectivity/
Visiting: https://www.xbrl.org/news/ifrs-foundation-monitoring-board-emphasises-oversight-of-standard-setting/
Visiting: https://www.xbrl.org/news/unlocking-synergy-efrag-symposium-explores-connectivity-between-financial-and-sustainability-reporting/
Visiting: https://www.xbrl.org/tag/connectivity/feed
Visiting: https://www.xbrl.org/tag/iasb/page/2/
Visiting: https://www.xbrl.org/tag/ias-1/
Visiting: https://www.xbrl.org/tag/iasb/page/2/feed
Failed to 

In [63]:
start_url_2 = 'https://specifications.xbrl.org/'
crawl(start_url_2, results, max_depth=5)

Visiting: https://specifications.xbrl.org/
Visiting: https://specifications.xbrl.org/transactional.html
Visiting: https://specifications.xbrl.org/spec-group-index-xbrl-gl.html
Visiting: https://specifications.xbrl.org/work-product-index-xbrl-gl-xbrl-gl-2007.html
Visiting: https://specifications.xbrl.org/release-history-xbrl-gl-2007-xbrl-gl-framework.html
Visiting: https://specifications.xbrl.org/work-product-index-xbrl-gl-xbrl-gl-2015.html
Visiting: https://specifications.xbrl.org/release-history-xbrl-gl-2015-xbrl-gl-srcd.html
Visiting: https://specifications.xbrl.org/release-history-xbrl-gl-2015-defaults.html
Visiting: https://specifications.xbrl.org/release-history-xbrl-gl-2015-attributes.html
Visiting: https://specifications.xbrl.org/release-history-xbrl-gl-2015-templates.html
Visiting: https://specifications.xbrl.org/release-history-xbrl-gl-2015-language-overview.html
Visiting: https://specifications.xbrl.org/release-history-xbrl-gl-2015-best-practice-annotated-instances.html
Visit

In [106]:
start_url_3 = 'https://www.xbrl.org/Specification/'
crawl(start_url_3, results, max_depth=5)

Visiting: https://www.xbrl.org/Specification/
Visiting: https://www.xbrl.org/Specification/versioning-concept-extended/
Visiting: https://www.xbrl.org/Specification/versioning-concept-extended/cr-2011-05-11/
Visiting: https://www.xbrl.org/Specification/versioning-concept-extended/cr-2011-05-11/versioning-concept-extended-cr-2011-05-11.html
Visiting: http://www.xbrl.org/Specification/genericLabels/REC-2009-06-22/genericLabels-REC-2009-06-22.html
Visiting: http://www.xbrl.org/Specification/versioning-concept-extended/CR-2011-05-11/versioning-concept-extended-CR-2011-05-11.html
Visiting: http://www.xbrl.org/Specification/genericReferences/REC-2009-06-22/genericReferences-REC-2009-06-22.html
Visiting: http://www.xbrl.org/Specification/versioning-concept-basic/CR-2010-07-31/versioning-concept-basic-CR-2010-07-31.html
Visiting: http://www.xbrl.org/legal
Visiting: http://www.xbrl.org/Specification/XBRL-RECOMMENDATION-2003-12-31+Corrected-Errata-2008-07-02.htm
Visiting: http://www.xbrl.org/Spe

  soup = BeautifulSoup(response.text, 'html.parser')
  soup = BeautifulSoup(response.text, 'html.parser')


Visiting: https://www.xbrl.org/Specification/formula/REC-2011-10-24/index-2011-10-24_files/panminus.gif
Visiting: https://www.xbrl.org/Specification/formula/REC-2011-10-24/index-2011-10-24_files/filelist.xml
Visiting: https://www.xbrl.org/Specification/formula/REC-2011-10-24/index-2011-10-24_files/toc2.gif
Visiting: https://www.xbrl.org/Specification/formula/REC-2011-10-24/index-2011-10-24_files/keys.js
Visiting: https://www.xbrl.org/Specification/formula/REC-2011-10-24/index-2011-10-24_files/fullpage.gif
Visiting: https://www.xbrl.org/Specification/formula/REC-2011-10-24/index-2011-10-24.htm
Visiting: https://www.xbrl.org/Specification/formula/REC-2011-10-24/conformance/
Visiting: https://www.xbrl.org/Specification/formula/REC-2011-10-24/conformance/formula-conformance-2022-07-21.zip
Visiting: https://www.xbrl.org/Specification/formula/REC-2011-10-24/conformance/formula-conf-rec-2011-10-24.zip
Visiting: https://www.xbrl.org/Specification/formula/REC-2011-10-24/conformance/conformance-

In [108]:
results_json = json.dumps(results, indent=2)
with open('xbrl_results_2_specifications.json', 'w') as file:
        file.write(results_json)

In [111]:
len(results)

3939

In [110]:
len(visited_urls)

4091

In [52]:
len(internal_urls)

985

# Filter Specifications

In [112]:
xbrl_spec_results_filtered = [entry for entry in results if (entry['url'].endswith('.html') or entry['url'].endswith('.htm'))]


In [113]:
len(xbrl_spec_results_filtered)

1045

In [114]:
xbrl_spec_results_filtered_json = json.dumps(xbrl_spec_results_filtered, indent=2)
with open('xbrl_results_2_specifications_filtered.json', 'w') as file:
        file.write(xbrl_spec_results_filtered_json)

In [None]:
with open("xbrl_results_2_specifications.json", 'r') as file:
    cdm_data = json.load(file)

# Re-index


In [115]:
with open("xbrl_results_2_specifications_filtered.json", 'r') as file:
    xbrl_spec_data = json.load(file)

In [116]:
for index, item in enumerate(xbrl_spec_data):
    item['id'] = index

In [117]:
with open('xbrl_results_2_spec_filtered.json', 'w') as file:
    json.dump(xbrl_spec_data, file, indent=2)