# Extract cube data from XBRL reports
  1. install the open source Arelle processor and XULE plugin
  2. save output variables to a XULE expression and compile it as a .zip
  3. run the .zip with Arelle to extract numeric facts from an XBRL report to a file

In [1]:
# @title Optional - filter eForm reports from most recent 650 RSS entries - leave blank for all
form_types = "1, 3Q_Electric" # @param {type:"string"}
num_entries = "5" # @param {type:"string"}
import requests
from bs4 import BeautifulSoup
import re

# List filings from the FERC eCollection RSS feed, optionally restricted to the
# first `num_entries` feed items and to the form types named in `form_types`.
# Note: the form-type filter is applied AFTER the feed is truncated to
# `num_entries`, so fewer than `num_entries` filings may be printed.
url = 'https://ecollection.ferc.gov/api/rssfeed'
# Without a timeout a stalled connection would block the cell indefinitely;
# raise_for_status() stops an HTTP error page from being parsed as an empty feed.
response = requests.get(url, timeout=60)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'xml')

items = soup.find_all('item')

# Determine the number of entries to process (blank or non-numeric -> all)
if num_entries.strip():
    try:
        num_to_process = int(num_entries)
    except ValueError:
        print("Invalid number of entries specified. Processing all entries.")
        num_to_process = len(items)
else:
    num_to_process = len(items)

# Parse the requested form types once instead of on every loop iteration.
# An empty list means "no filter".
wanted_form_types = [ft.strip() for ft in form_types.split(',')] if form_types.strip() else []


def _tag_text(parent, tag_name):
    """Return the text of the first `tag_name` child of `parent`, or '' if it is absent."""
    tag = parent.find(tag_name)
    return tag.text if tag is not None else ''


for item in items[:num_to_process]:
    title = _tag_text(item, 'title')
    pub_date = _tag_text(item, 'pubDate')

    # The description holds an HTML fragment with the download links
    description = _tag_text(item, 'description')
    description_soup = BeautifulSoup(description, 'html.parser')

    # Find the link ending with .html
    html_link_tag = description_soup.find('a', href=lambda href: href and href.endswith('.html'))
    html_link = html_link_tag['href'] if html_link_tag else 'No HTML link found'

    # Extract the Form Type from the HTML URL (text between "-Form_" and "-2")
    form_type_match = re.search(r'-Form_(.*?)-2', html_link)
    form_type = form_type_match.group(1) if form_type_match else 'No Form Type found'

    # Filter by Form Type
    if not wanted_form_types or form_type in wanted_form_types:
        print(f"Entity Name: {title}")
        print(f"HTML URL: {html_link}")
        print(f"Form Type: {form_type} Published: {pub_date}")
        print("-" * 100)

Entity Name: North Central Power Co., Inc.
HTML URL: https://eCollection.ferc.gov/api/DownloadDocument/366035/3?filename=273326-C011317-Form_3Q_Electric-2025-Q2_273326.html
Form Type: 3Q_Electric Published: Mon, 08 Sep 2025 10:39:40 -0400
----------------------------------------------------------------------------------------------------
Entity Name: Georgia Power Company
HTML URL: https://eCollection.ferc.gov/api/DownloadDocument/365215/3?filename=272936-C001553-Form_1-2024-Q4_272936.html
Form Type: 1 Published: Tue, 02 Sep 2025 15:24:12 -0400
----------------------------------------------------------------------------------------------------
Entity Name: DCR Transmission, L.L.C.
HTML URL: https://eCollection.ferc.gov/api/DownloadDocument/365096/3?filename=272880-C011902-Form_3Q_Electric-2025-Q2_272880.html
Form Type: 3Q_Electric Published: Sat, 30 Aug 2025 23:27:14 -0400
----------------------------------------------------------------------------------------------------
Entity Name: 

## 1. Run the cell to install the Arelle processor and XULE plugin to extract data.

In [1]:
# @title
import os, shutil, sys, site, platform
print('Please wait while Arelle, XULE and some helper packages are installed. \nA XULE version message appears below when the environment is ready.')

# In this example, Arelle and aniso are required to use XULE - get Arelle release details from GitHub (https://github.com/Arelle/arelle/releases).
# Use %pip -q install git+https://git@github.com/Arelle/arelle.git@master to use Arelle's development release
%pip -q install Arelle-release==2.37.6
%pip -q install aniso8601==9.0.1

# 1) locate Arelle's plugin directory (do not modify this location); remove temp and xuledir if they exist
plugindir = site.getsitepackages()[0] + '/arelle/plugin/'
edgardir = plugindir + 'EDGAR/'
xuledir = plugindir + 'xule/'
xodeldir = plugindir + 'xodel/'
serializerdir = plugindir + 'serializer/'
SimpleXBRLModeldir = plugindir + 'SimpleXBRLModel/'
edgartemp = plugindir + 'edgartemp/'
xtemp = plugindir + 'xtemp/'
if os.path.exists(xtemp):
  shutil.rmtree(edgardir)
  os.remove(plugindir + 'semanticHash.py')
  shutil.rmtree(xuledir)
  shutil.rmtree(xodeldir)
  shutil.rmtree(serializerdir)
  shutil.rmtree(SimpleXBRLModeldir)
  shutil.rmtree(xtemp)
else: ''
os.chdir(plugindir)

# 2) copy XULE and EDGAR plugins from GitHub to Arelle plugin directories
!git clone --quiet --depth=1 --branch 30050 --single-branch https://github.com/xbrlus/xule.git xtemp &> /dev/null
!git clone --quiet --depth=1 --branch 25.0.1 --single-branch https://github.com/Arelle/EDGAR.git edgartemp &> /dev/null
shutil.move(edgartemp, edgardir)
shutil.move(xtemp + 'plugin/semanticHash.py', plugindir)
shutil.move(xtemp + 'plugin/xule', xuledir)
shutil.move(xtemp + 'plugin/xodel', xodeldir)
shutil.move(xtemp + 'plugin/serializer', serializerdir)
shutil.move(xtemp + 'plugin/SimpleXBRLModel', SimpleXBRLModeldir)

# 3) confirm XULE (change -v to -h and re-run to see help contents for Arelle and XULE)
!arelleCmdLine --version
!arelleCmdLine --plugins 'EDGAR/transform' -v
!arelleCmdLine --plugins 'EDGAR/validate' -v
!arelleCmdLine --plugins 'xule' -v
print('\nArelle is installed and XULE is ready in ' + os.getcwd())

Please wait while Arelle, XULE and some helper packages are installed. 
A XULE version message appears below when the environment is ready.
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.9/44.9 MB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.8/103.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hArelle(r) 2.37.6 (64bit)
[info] Activation of plug-in SEC Inline Transforms successful, version 19.2. - EDGAR/transform 
[info] Activation of plug-in Validate EFM successful, version 1.25.0.1. - EDGAR/validate 
[info] Activation of plug-in XBRL rule processor (xule) successful, version Check version using Tools->Xule->Version on the GUI or --xule-version on the command line. - xule 
[info] Xule version: 3.0.30050 - 

Arelle is installed and XULE is ready


## 2. Run the cell to save variables to a file called 'extract-data.xule' and compile it to .zip so it can be used by the Arelle processor in the next step.

**Use 'Show code' in Colab to inspect XULE expression details**

\* `/content/` is the default storage location in Colab.

In [1]:
# @title
# for mybinder.org, use /home/jovyan/ as location variable
location = '/content/' # @param {type:"string"}
name = 'extracted' # @param {type:"string"}
format = "xlsx" # @param ['xlsx', 'json']
if format == "xlsx":
  type = 'spreadsheet'
elif format == "json":
  type = 'json'
# delete the prior version of XULE file if the name matches what was provided in 'with open' code below
curdir = os.getcwd()
if os.path.isfile(curdir + '/extract-data.xule'):
    os.remove(curdir + '/extract-data.xule')
    print('previous extract-data.xule deleted')
else: ''

# XULE file starts after the ''' on the next line and continues to the second '''
xule_file = '''
constant $filedir = '%s'
constant $filename = '%s'
constant $fileformat = '%s'
constant $filetype = '%s'
constant $extracted = $filedir + $filename + '.' + $fileformat

// Create

output-attribute file-location
output-attribute file-content
output-attribute file-append

output cube-data

// Identify all the cubes in the instance document

$all-cubes = taxonomy().cubes

for $cube in $all-cubes
    if $cube.cube-concept.name.local-name == 'AccumulatedProvisionForDepreciationTable'
        skip
    else
    $cube-facts = $cube.facts
    $dict = dict(
                list($cube.cube-concept.name.local-name,
                list(for $fact in $cube-facts
                    list($fact.entity.id, $fact.concept.label.text, $fact.concept.name, $fact.period.start, $fact.period.end, $fact.unit.string, $fact.dimensions.join(', ','='), $fact)
                )
                )
    )
$dataextract = if $filetype == 'spreadsheet'
    $dict.to-spreadsheet
    else $dict.to-json
$dataextract

file-content $rule-value
file-location $extracted
file-append true
'''
# save the XULE file and print a time stamp on the screen
!pip install tzlocal
with open('extract-data.xule', mode='w') as file:
    file.write(xule_file % (location, name, format, type))
import datetime
from tzlocal import get_localzone
local_tz = get_localzone()
current_time = datetime.datetime.now(local_tz)
formatted_time = current_time.strftime('%H:%M:%S %Z on %b %d, %Y')
print('extract-data.xule updated ' + formatted_time +'\ndata will be saved to ' + location + name + '.' + format + '\n\nXULE is compiling the file to .zip, so it can be executed in the next step.\n')

# variables for Arelle processing
FILE_NAME = 'extract-data.xule'
ZIP_NAME = 'extract-data.zip'
LOG_LOCATION = location + name + '-log.xml'

# compile XULE into .zip
!arelleCmdLine --plugins "xule" --xule-compile $FILE_NAME \
--xule-rule-set $ZIP_NAME --logFormat="[%(messageCode)s] %(message)s"

# copy XULE and .zip to location specified
shutil.copy(os.getcwd() + '/' + FILE_NAME, location + FILE_NAME)
shutil.copy(os.getcwd() + '/' + ZIP_NAME, location + ZIP_NAME)
print('\nThe XULE expression is compiled and ready.')

extract-data.xule updated 13:50:23 UTC on Sep 19, 2025
data will be saved to /content/extracted.xlsx

XULE is compiling the file to .zip, so it can be executed in the next step.

[info] Activation of plug-in XBRL rule processor (xule) successful, version Check version using Tools->Xule->Version on the GUI or --xule-version on the command line.
[info] Xule version: 3.0.30050
2025-09-19T13:50:26.974821: extract-data.xule parse start
2025-09-19T13:50:28.418720: extract-data.xule parse end. Took 0:00:01.443899
2025-09-19T13:50:28.418967: extract-data.xule ast start
2025-09-19T13:50:28.427902: extract-data.xule ast end. Took 0:00:00.008935
2025-09-19T13:50:28.427985: post parse start
2025-09-19T13:50:28.430246: post parse end. Took 0:00:00.002261
2025-09-19T13:50:28.432277: Parsing finished. Took 0:00:01.457973

The XULE expression is compiled and ready.


## 3. Update the `reportfile` variable with the location of an XBRL report - .html, .xbrl, .zip - then (re-)run this cell to extract data (and append) to file output.

Download an XBRL report from the FERC eForms list above, rename it to `sample.html`, and upload it to Colab's storage directory to use the default `reportfile`. 

Alternatively, set `reportfile` to this URL, an Inline XBRL report submitted to the SEC that contains several cube examples: <https://www.sec.gov/Archives/edgar/data/0001467858/000146785825000032/gm-20241231.htm>. There's also a [Jupyter Notebook example](https://github.com/xbrlus/xule/blob/jupyter/extract-fact-data.ipynb) with an output selector for switching between fact and cube extraction.

In [1]:
# @title
reportfile = location + "sample.html" # @param {type:"string"}
print('Data extraction is starting on ' + reportfile + '\nThis might take several minutes, depending on the size of the file.\n' )
# run .zip to create output
!arelleCmdLine --plugins "xule|EDGAR/transform|validate/EFM" \
--xule-rule-set $ZIP_NAME \
-v -f $reportfile \
--xule-time .000 --xule-debug --noCertificateCheck \
--logFormat="[%(messageCode)s] %(message)s" \
#--logFile $LOG_LOCATION
# uncomment the line above to save the processing log

print('\nData extracted to ' + location + name + '.' + format + ' from ' + reportfile + '\n\n' \
      'Update reportfile and run cell again to append data to the file,\n' \
      'or change prior cell variable to save to a different location, file name and/or type.')

Data extraction is starting on /content/sample.html
This might take several minutes, depending on the size of the file.

[info] Activation of plug-in XBRL rule processor (xule) successful, version Check version using Tools->Xule->Version on the GUI or --xule-version on the command line.
[info] Activation of plug-in SEC Inline Transforms successful, version 19.2.
[info] Activation of plug-in Validate EFM successful, version 1.25.0.1.
[info] Xule version: 3.0.30050
[lxml.SCHEMAV_ELEMENT_CONTENT] XML file syntax error Element '{http://www.w3.org/1999/xhtml}head': Missing child element(s). Expected is one of ( {http://www.w3.org/1999/xhtml}script, {http://www.w3.org/1999/xhtml}style, {http://www.w3.org/1999/xhtml}meta, {http://www.w3.org/1999/xhtml}link, {http://www.w3.org/1999/xhtml}object, {http://www.w3.org/1999/xhtml}title, {http://www.w3.org/1999/xhtml}base )., line 2, path 'head', xpath '/*/*[1]'
[lxml.SCHEMAV_CVC_COMPLEX_TYPE_3_2_1] XML file syntax error Element '{http://www.w3.org/