https://gist.github.com/anshoomehra/ead8925ea291e233a5aa2dcaa2dc61b2

In [1]:
# Import requests to retrive Web Urls example HTML. TXT 
import requests

# Import BeautifulSoup
from bs4 import BeautifulSoup

# import re module for REGEXes
import re

# import pandas
import pandas as pd

In [5]:
with open('sec-edgar-filings/AES/10-K/0000874761-23-000010/full-submission.txt', 'r') as file:
    raw_10k = file.read()


In [6]:
print(raw_10k[0:1300])

<SEC-DOCUMENT>0000874761-23-000010.txt : 20230301
<SEC-HEADER>0000874761-23-000010.hdr.sgml : 20230301
<ACCEPTANCE-DATETIME>20230301164429
ACCESSION NUMBER:		0000874761-23-000010
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		229
CONFORMED PERIOD OF REPORT:	20221231
FILED AS OF DATE:		20230301
DATE AS OF CHANGE:		20230301

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			AES CORP
		CENTRAL INDEX KEY:			0000874761
		STANDARD INDUSTRIAL CLASSIFICATION:	COGENERATION SERVICES & SMALL POWER PRODUCERS [4991]
		IRS NUMBER:				541163725
		STATE OF INCORPORATION:			DE
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-12291
		FILM NUMBER:		23694366

	BUSINESS ADDRESS:	
		STREET 1:		4300 WILSON BOULEVARD
		CITY:			ARLINGTON
		STATE:			VA
		ZIP:			22203
		BUSINESS PHONE:		7035221315

	MAIL ADDRESS:	
		STREET 1:		4300 WILSON BOULEVARD
		CITY:			ARLINGTON
		STATE:			VA
		ZIP:			22203

	FORMER COMPANY:	
		FORMER CONFORMED NAME:	AES C

In [7]:
# Regex to find <DOCUMENT> tags
doc_start_pattern = re.compile(r'<DOCUMENT>')
doc_end_pattern = re.compile(r'</DOCUMENT>')
# Regex to find <TYPE> tag prceeding any characters, terminating at new line
type_pattern = re.compile(r'<TYPE>[^\n]+')

In [8]:
# Create 3 lists with the span idices for each regex

### There are many <Document> Tags in this text file, each as specific exhibit like 10-K, EX-10.17 etc
### First filter will give us document tag start <end> and document tag end's <start> 
### We will use this to later grab content in between these tags
doc_start_is = [x.end() for x in doc_start_pattern.finditer(raw_10k)]
doc_end_is = [x.start() for x in doc_end_pattern.finditer(raw_10k)]

### Type filter is interesting, it looks for <TYPE> with Not flag as new line, ie terminare there, with + sign
### to look for any char afterwards until new line \n. This will give us <TYPE> followed Section Name like '10-K'
### Once we have have this, it returns String Array, below line will with find content after <TYPE> ie, '10-K' 
### as section names
doc_types = [x[len('<TYPE>'):] for x in type_pattern.findall(raw_10k)]

In [9]:
document = {}

# Create a loop to go through each section type and save only the 10-K section in the dictionary
for doc_type, doc_start, doc_end in zip(doc_types, doc_start_is, doc_end_is):
    if doc_type == '10-K':
        document[doc_type] = raw_10k[doc_start:doc_end]

In [10]:
# display excerpt the document
document['10-K'][0:500]

'\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>aes-20221231.htm\n<DESCRIPTION>10-K\n<TEXT>\n<XBRL>\n<?xml version="1.0" ?><!--XBRL Document Created with Wdesk from Workiva--><!--Copyright 2023 Workiva--><!--r:82161566-b5d4-463c-bc19-3799237b023b,g:8f97136d-1a4b-4b31-9d3f-870eb6982d60,d:84f31ef528bc41899c5480059e42eda9--><html xmlns:xbrldi="http://xbrl.org/2006/xbrldi" xmlns:aes="http://www.aes.com/20221231" xmlns:utr="http://www.xbrl.org/2009/utr" xmlns:ix="http://www.xbrl.org/2013/inlineXBRL" xmlns="http://www.'

In [37]:
# Write the regex
# regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|7A|7|8)\.{0,1})|(ITEM\s(1A|1B|7A|7|8))')
regex = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|1B|2|3|4|5|6|7|7A|8|9A|9B|9|[1][0-5])\.{0,1})|(ITEM(\s|&#160;|&nbsp;)(1A|1B|2|3|4|5|6|7|7A|8|9A|9B|9|[1][0-5])\.{0,1})')

# Use finditer to math the regex
matches = regex.finditer(document['10-K'])

# Write a for loop to print the matches
for match in matches:
    print(match)

<re.Match object; span=(811500, 811513), match='ITEM&#160;1A.'>
<re.Match object; span=(812356, 812369), match='ITEM&#160;1B.'>
<re.Match object; span=(813225, 813237), match='ITEM&#160;2.'>
<re.Match object; span=(814078, 814090), match='ITEM&#160;3.'>
<re.Match object; span=(814938, 814950), match='ITEM&#160;4.'>
<re.Match object; span=(816641, 816648), match='ITEM 5.'>
<re.Match object; span=(817622, 817634), match='ITEM&#160;6.'>
<re.Match object; span=(818489, 818501), match='ITEM&#160;7.'>
<re.Match object; span=(824757, 824768), match='ITEM&#160;7'>
<re.Match object; span=(825662, 825674), match='ITEM&#160;8.'>
<re.Match object; span=(857128, 857140), match='ITEM&#160;9.'>
<re.Match object; span=(858058, 858071), match='ITEM&#160;9A.'>
<re.Match object; span=(858928, 858941), match='ITEM&#160;9B.'>
<re.Match object; span=(859792, 859798), match='ITEM 9'>
<re.Match object; span=(861539, 861552), match='ITEM&#160;10.'>
<re.Match object; span=(862469, 862482), match='ITEM&#160;11.'

In [39]:
# Matches
matches = regex.finditer(document['10-K'])

# Create the dataframe
test_df = pd.DataFrame([(x.group(), x.start(), x.end()) for x in matches])

test_df.columns = ['item', 'start', 'end']
test_df['item'] = test_df.item.str.lower()

# Display the dataframe
test_df.head()

Unnamed: 0,item,start,end
0,item&#160;1a.,811500,811513
1,item&#160;1b.,812356,812369
2,item&#160;2.,813225,813237
3,item&#160;3.,814078,814090
4,item&#160;4.,814938,814950


In [41]:
# Get rid of unnesesary charcters from the dataframe
test_df.replace('&#160;',' ',regex=True,inplace=True)
test_df.replace('&nbsp;',' ',regex=True,inplace=True)
test_df.replace(' ','',regex=True,inplace=True)
test_df.replace('\.','',regex=True,inplace=True)
test_df.replace('>','',regex=True,inplace=True)

# display the dataframe
test_df.head()

Unnamed: 0,item,start,end
0,item1a,811500,811513
1,item1b,812356,812369
2,item2,813225,813237
3,item3,814078,814090
4,item4,814938,814950


In [53]:
# Drop duplicates
pos_dat = test_df.sort_values('start', ascending=True).drop_duplicates(subset=['item'], keep='last')

# Display the dataframe
pos_dat.head()

Unnamed: 0,item,start,end
20,item1a,1809543,1809556
21,item1b,1975382,1975390
22,item2,1975844,1975851
23,item3,1977374,1977386
24,item4,2006440,2006452


In [54]:
# Get Item 1a÷
# item_1a_raw = document['10-K'][pos_dat['start'].loc['item1a']:pos_d÷at['start'].loc['item1b']]

# # Get Item 7
# item_7_raw = document['10-K'][pos_dat['start'].loc['item7']:pos_dat['start'].loc['item7a']]

# # Get Item 7a
# item_7a_raw = document['10-K'][pos_dat['start'].loc['item7a']:pos_dat['start'].loc['item8']]

In [55]:
item1a_start = pos_dat[pos_dat['item'] == 'item1a']['start'].values[0]
item1b_start = pos_dat[pos_dat['item'] == 'item1b']['start'].values[0]
item_1a_raw = document['10-K'][item1a_start:item1b_start]


In [58]:
item_1a_raw[:300]

'ITEM&#160;1A. RISK FACTORS </span></div><div style="margin-top:6pt;text-indent:22.5pt"><span style="color:#000000;font-family:\'Arial\',sans-serif;font-size:10pt;font-weight:400;line-height:120%">You should consider carefully the following risks, along with the other information contained in or incorp'

In [59]:
item_1a_content = BeautifulSoup(item_1a_raw, 'lxml')

In [60]:
print(item_1a_content.prettify()[0:1000])

<html>
 <body>
  <p>
   ITEM 1A. RISK FACTORS
  </p>
  <div style="margin-top:6pt;text-indent:22.5pt">
   <span style="color:#000000;font-family:'Arial',sans-serif;font-size:10pt;font-weight:400;line-height:120%">
    You should consider carefully the following risks, along with the other information contained in or incorporated by reference in this Form 10-K. Additional risks and uncertainties also may adversely affect our business and operations. We routinely encounter and address risks, some of which may cause our future results to be materially different than we presently anticipate. The categories of risk we have identified in Item 1A.—
   </span>
   <span style="color:#000000;font-family:'Arial',sans-serif;font-size:10pt;font-style:italic;font-weight:400;line-height:120%">
    <a href="#i84f31ef528bc41899c5480059e42eda9_58" style="color:#000000;font-family:'Arial',sans-serif;font-size:10pt;font-style:italic;font-weight:400;line-height:120%;text-decoration:none">
     Risk Factors

In [61]:
print(item_1a_content.get_text("\n\n")[0:1500])

ITEM 1A. RISK FACTORS 

You should consider carefully the following risks, along with the other information contained in or incorporated by reference in this Form 10-K. Additional risks and uncertainties also may adversely affect our business and operations. We routinely encounter and address risks, some of which may cause our future results to be materially different than we presently anticipate. The categories of risk we have identified in Item 1A.—

Risk Factors

 include risks associated with our operations, governmental regulation and laws, our indebtedness and financial condition. These risk factors should be read in conjunction with Item 7

.—

Management's Discussion and Analysis of Financial Condition and Results of Operations

 

in this Form 10-K and the Consolidated Financial Statements and related notes included elsewhere in this Form 10-K. If any of the following events actually occur, our business, financial results and financial condition could be materially adversely a