# Extract facts or cubes from XBRL reports
  1. install the open source Arelle processor and XULE plugin
  2. save a XULE expression that defines the output variables, and compile it to a .zip rule set
  3. run the .zip with Arelle to extract numeric facts from an XBRL report to a file

## 1. Run the cell to install the Arelle processor and XULE plugin to extract data.

In [None]:
# @title
import os, shutil, sys, site, platform
print('Please wait while Arelle, XULE and some helper packages are installed. \nA XULE version message appears below when the environment is ready.')

# Arelle and aniso8601 are installed here because XULE needs them in this example; both are pinned to
# exact versions. Arelle release details are on GitHub (https://github.com/Arelle/arelle/releases).
# To use Arelle's development release instead, replace the first install line below with:
#   %pip -q install git+https://git@github.com/Arelle/arelle.git@master
%pip -q install Arelle-release==2.37.6
%pip -q install aniso8601==9.0.1

# 1) locate Arelle's plugin directory (do not modify this location) and clear anything left by an earlier run
plugindir = site.getsitepackages()[0] + '/arelle/plugin/'
edgardir = plugindir + 'EDGAR/'
xuledir = plugindir + 'xule/'
xodeldir = plugindir + 'xodel/'
serializerdir = plugindir + 'serializer/'
SimpleXBRLModeldir = plugindir + 'SimpleXBRLModel/'
edgartemp = plugindir + 'edgartemp/'
xtemp = plugindir + 'xtemp/'

# Each target is removed only if it is actually present, so the cell can be re-run after a run
# that stopped part-way (for example a failed clone) without raising FileNotFoundError.
# The temporary clone directories are included because git refuses to clone into a non-empty directory.
for stale_dir in (edgardir, xuledir, xodeldir, serializerdir, SimpleXBRLModeldir, edgartemp, xtemp):
  if os.path.isdir(stale_dir):
    shutil.rmtree(stale_dir)
if os.path.isfile(plugindir + 'semanticHash.py'):
  os.remove(plugindir + 'semanticHash.py')

# The clone and move steps that follow use paths relative to the plugin directory.
os.chdir(plugindir)

# 2) copy XULE and EDGAR plugins from GitHub to Arelle plugin directories
!git clone --quiet --depth=1 --branch 30050 --single-branch https://github.com/xbrlus/xule.git xtemp &> /dev/null
!git clone --quiet --depth=1 --branch 25.0.1 --single-branch https://github.com/Arelle/EDGAR.git edgartemp &> /dev/null
shutil.move(edgartemp, edgardir)
shutil.move(xtemp + 'plugin/semanticHash.py', plugindir)
shutil.move(xtemp + 'plugin/xule', xuledir)
shutil.move(xtemp + 'plugin/xodel', xodeldir)
shutil.move(xtemp + 'plugin/serializer', serializerdir)
shutil.move(xtemp + 'plugin/SimpleXBRLModel', SimpleXBRLModeldir)

# 3) confirm XULE (change -v to -h and re-run to see help contents for Arelle and XULE)
!arelleCmdLine --version
!arelleCmdLine --plugins 'EDGAR/transform' -v
!arelleCmdLine --plugins 'EDGAR/validate' -v
!arelleCmdLine --plugins 'xule' -v
print('\nArelle is installed and XULE is ready in ' + os.getcwd())

## 2. Run the cell to select a XULE expression that extracts facts or cubes and defines output variables. 
The XULE expression is saved to a file called 'extract-data.xule' and the compiled .zip is used by the Arelle processor in the next step.  

**Use 'Show code' in Colab to inspect the XULE expressions and to see how to extract all facts (numeric and text) instead of numeric facts only.**

\* `/content/` is the Colab default location.

In [None]:
# @title
# for mybinder.org, use /home/jovyan/ as location variable
output = 'fact' # @param ['fact', 'cube']
location = '/content/' # @param {type:"string"}
name = 'extracted' # @param {type:"string"}
format = 'xlsx' # @param ['xlsx', 'json']

# Translate the chosen file extension into the $filetype value that the XULE templates
# compare against ('spreadsheet' selects .to-spreadsheet, anything else .to-json).
if format == 'xlsx':
  type = 'spreadsheet'
elif format == 'json':
  type = 'json'
else:
  # Without this branch `type` would stay bound to Python's builtin and the text
  # "<class 'type'>" would be written into the XULE file as the file type.
  raise ValueError("format must be 'xlsx' or 'json', got " + repr(format))

# delete the prior version of XULE file if the name matches what was provided in 'with open' code below
curdir = os.getcwd()
if os.path.isfile(curdir + '/extract-data.xule'):
    os.remove(curdir + '/extract-data.xule')
    print('previous extract-data.xule deleted')

# XULE template for fact extraction. The XULE source is everything between the opening ''' on the next
# line and the closing '''. It contains five %s placeholders, filled in this order when the file is
# written: output directory, file name, file format, file type, and a label prefix for the '... data' list.
# Lines starting with // are XULE comments and are part of the template text.
xule_fact = '''
constant $filedir = '%s'
constant $filename = '%s'
constant $fileformat = '%s'
constant $filetype = '%s'
constant $extracted = $filedir + $filename + '.' + $fileformat

output-attribute file-location
output-attribute file-content
output-attribute file-append

output report-data


// Identify all the facts in the instance document

$report-facts = list({covered @})

$dict = dict(
  list('%s data',

// This returns numeric facts from the report with attributes. To return all facts,
//  CHANGE 'filter $report-facts where $item.is-numeric' to '$report-facts' on the line below,
//  AND ADD '.to-xince' to '$fact,' in the list.
// FYI, spreadsheet cell limits might prevent complete HTML for some string facts.

    list(for $fact in filter $report-facts where $item.is-numeric
      list(

// Get the full list of attributes - https://xbrl.us/xule

             $fact.concept.local-name,
             $fact.concept.name.namespace-uri,
             $fact.concept.label.text,
             $fact,
             $fact.concept.balance,
             $fact.concept.data-type.name,
             $fact.unit.string,
             $fact.concept.period-type,
             $fact.period.start,
             $fact.period.end,
             $fact.dimensions.join(', ','='),
             $fact.entity.id
      )
    ).sort('asc')
  )
)
$dataextract = if $filetype == 'spreadsheet'
    $dict.to-spreadsheet
    else $dict.to-json
$dataextract

file-content $rule-value
file-location $extracted
file-append true
'''

# XULE template for cube extraction. The XULE source is everything between the opening ''' on the next
# line and the closing '''. It contains four %s placeholders, filled in this order when the file is
# written: output directory, file name, file format, file type.
# Lines starting with // are XULE comments and are part of the template text.
xule_cube = '''
constant $filedir = '%s'
constant $filename = '%s'
constant $fileformat = '%s'
constant $filetype = '%s'
constant $extracted = $filedir + $filename + '.' + $fileformat

// Create

output-attribute file-location
output-attribute file-content
output-attribute file-append

output cube-data

// Identify all the cubes in the instance document

$all-cubes = taxonomy().cubes

for $cube in $all-cubes
  $cube-facts = $cube.facts
  $dict = dict(
    list($cube.cube-concept.name.local-name,
      list(for $fact in $cube-facts
        list(
          $fact.concept.local-name,
          $fact.concept.name.namespace-uri,
          $fact.concept.label.text,
          $fact,
          $fact.concept.balance,
          $fact.concept.data-type.name,
          $fact.unit.string,
          $fact.concept.period-type,
          $fact.period.start,
          $fact.period.end,
          $fact.dimensions.join(', ','='),
          $fact.entity.id
        )
      )
    )
  )
$dataextract = if $filetype == 'spreadsheet'
    $dict.to-spreadsheet
    else $dict.to-json
$dataextract

file-content $rule-value
file-location $extracted
file-append true
'''

# save the selected XULE file and print a time stamp on the screen
!pip install tzlocal
with open('extract-data.xule', mode='w') as file:
    if output == 'fact':
        file.write(xule_fact % (location, name, format, type, name))
    elif output == 'cube':
        file.write(xule_cube % (location, name, format, type))
import datetime
from tzlocal import get_localzone
local_tz = get_localzone()
current_time = datetime.datetime.now(local_tz)
formatted_time = current_time.strftime('%H:%M:%S %Z on %b %d, %Y')
print('extract-data.xule updated ' + formatted_time +'\n' + output + ' data will be saved to ' + location + name + '.' + format + '\n\nXULE is compiling the file to .zip, so it can be executed in the next step.\n')

# variables for Arelle processing
FILE_NAME = 'extract-data.xule'
ZIP_NAME = 'extract-data.zip'
LOG_LOCATION = location + name + '-log.xml'

# compile XULE into .zip
!arelleCmdLine --plugins "xule" --xule-compile $FILE_NAME \
--xule-rule-set $ZIP_NAME --logFormat="[%(messageCode)s] %(message)s"

# copy XULE and .zip to location specified
shutil.copy(os.getcwd() + '/' + FILE_NAME, location + FILE_NAME)
shutil.copy(os.getcwd() + '/' + ZIP_NAME, location + ZIP_NAME)
print('\nThe XULE expression is compiled and ready.')

## 3. Update the `reportfile` variable with the location of an XBRL report - .html, .xbrl, .zip - then (re-)run this cell to extract (and append) data to file output.


In [None]:
# @title
reportfile = 'https://www.sec.gov/Archives/edgar/data/314227/000165495425004233/tomz_10k.htm' # @param {type:"string"}
print('Data extraction is starting on ' + reportfile + '\nThis might take several minutes, depending on the size of the file.\n' )
# run .zip to create output
!arelleCmdLine --plugins "xule|EDGAR/transform|validate/EFM" \
--xule-rule-set $ZIP_NAME \
-v -f $reportfile \
--xule-time .000 --xule-debug --noCertificateCheck \
--logFormat="[%(messageCode)s] %(message)s" \
#--logFile $LOG_LOCATION
# uncomment the line above to save the processing log

print('\nData extracted to ' + location + name + '.' + format + ' from ' + reportfile + '\n\n' \
      'Update reportfile and run cell again to append data to the file,\n' \
      'or change prior cell variable to save to a different location, file name and/or type.')