In [1]:
import pandas as pd

In [9]:
import glob
results = []

# Specify the directory path where your text files are located
directory_path = "./tests/"

# Use glob to get a list of all files with the .txt extension in the specified directory
txt_files = glob.glob(directory_path + "*.txt")

# Iterate over the list of text files and perform actions
for file_path in txt_files:
    with open(file_path, 'r',encoding="utf-8") as file:
        # Do something with the contents of the file
        file_contents = file.readlines()
        file_contents = [item for item in file_contents if item.strip() != "\n"]
        results.extend(file_contents[:10])

In [10]:
df = pd.DataFrame(results)

In [11]:
df[0] = df[0].apply(lambda x: x.strip().replace("\n",""))

In [13]:
df.loc[df[0]!=""].to_excel('document_types2.xlsx')

In [23]:
import re
import requests
import os
import time
import zipfile

OUTDIR = './edgar_files'
BASE_URL = 'https://www.sec.gov/Archives/'
YEARS = range(2023, 1992, -1)
QS = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
VALID_FORMS = ['10-K', '10-Q', '8-K']
SLEEP_TIME = 0.15
header = {"User-Agent":"sean.guarnacciO@gmail.com"}

def fetch_master_files():
    """Get the master files"""
    
    for year in YEARS:
        year = str(year)
        outdir_year = os.path.join(OUTDIR, year)
        if not os.path.exists(outdir_year):
            os.makedirs(outdir_year)

        for q in QS:
            outdir_year_q = os.path.join(outdir_year, q)
            if not os.path.exists(outdir_year_q):
                os.makedirs(outdir_year_q)

            outdir_year_q_master = os.path.join(outdir_year_q, 'master.zip')
            if not os.path.exists(outdir_year_q_master):
                master_url = BASE_URL + 'edgar/full-index/' + year + '/' + q + '/master.zip'
                print('Downloading', master_url)
                time.sleep(SLEEP_TIME)
                response = requests.get(master_url,headers=header)
                with open(outdir_year_q_master, 'wb') as f:
                    f.write(response.content)


def crawl_master_files():
    """Get crawlable URLs from master files and download contracts"""

    for year in YEARS:
        print(year)
        year = str(year)
        outdir_year = os.path.join(OUTDIR, year)

        for q in QS:
            print(q)
            outdir_year_q = os.path.join(outdir_year, q)
            outdir_year_q_master = os.path.join(outdir_year_q, 'master.zip')
            try:
                z = zipfile.ZipFile(outdir_year_q_master)  # Fails for non-existant Qs, e.g. 2019 Q3
            except:
                continue

            with z.open('master.idx') as f:

                for line in f:
                    line = line.decode('utf8', errors='ignore')

                    if line[0].isdigit():  # CIK number
                        line = line.split('|')

                        if line[2] in VALID_FORMS:
                            filing_txt = line[4].strip().split('/')[-1]
                            filing_id = filing_txt.replace('-', '').replace('.txt', '')
                            filing_dir = os.path.join(outdir_year_q, filing_id)
                            if not os.path.exists(filing_dir):
                                os.makedirs(filing_dir)
                            MAX_REDIRECTS = 50  # You can adjust this number based on your needs

                            def get_with_max_redirects(url, headers=None):
                                session = requests.Session()
                                adapter = requests.adapters.HTTPAdapter(max_retries=MAX_REDIRECTS)
                                session.mount('http://', adapter)
                                session.mount('https://', adapter)

                                response = session.get(url, headers=headers)

                                return response

                            filing_index = os.path.join(filing_dir, filing_txt.replace('.txt', '') + '-index.html')
                            if not os.path.exists(filing_index):  # Check if we already have downloaded the filing index
                                index_url = os.path.join(BASE_URL, 'edgar/data', filing_id, filing_txt.replace('.txt', '') + '-index.html')
                                print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), year, q, 'Downloading index', index_url)
                                time.sleep(SLEEP_TIME)
                                index_html = get_with_max_redirects(index_url,headers=header)
                                with open(os.path.join(filing_dir, filing_txt.replace('.txt', '') + '-index.html'), 'w',encoding="utf-8") as f:
                                    f.write(index_html.text)

                            # Load the index_html
                            index_html = open(filing_index).read()
                            trs = re.findall('<tr[^>]*>(.*?)</tr>', index_html, re.S)

                            for row in trs:
                                if '<td' not in row:
                                    continue

                                tds = re.split('</?td[^>]*>', row)
                                if tds[7].startswith('EX-10'):
                                    file_name = re.search('"(.+)"', tds[5]).group(1)
                                    file_url = 'https://www.sec.gov' + file_name

                                    if file_url.endswith('htm') or file_url.endswith('html'):
                                        filing_file = os.path.join(filing_dir, file_name.split('/')[-1])

                                        if not os.path.exists(filing_file):
                                            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), year, q, 'Downloading contract', file_url)
                                            filing_html = get_with_max_redirects(file_url,headers=header)
                                            with open(filing_file, 'w') as f:
                                                f.write(filing_html.text)
                                                

if __name__ == '__main__':

    print('Fetching master files')
    fetch_master_files()
    print('Fetching contracts')
    crawl_master_files()

Fetching master files
Downloading https://www.sec.gov/Archives/edgar/full-index/2023/QTR1/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2023/QTR2/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2023/QTR3/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2023/QTR4/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2022/QTR1/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2022/QTR2/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2022/QTR3/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2022/QTR4/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2021/QTR1/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2021/QTR2/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2021/QTR3/master.zip
Downloading https://www.sec.gov/Archives/edgar/full-index/2021/QTR4/master.zip
Downloading https://www.sec.go

TooManyRedirects: Exceeded 30 redirects.

In [24]:
html = open("C:/Users/seang/OneDrive/New Folder/data/2018/QTR1/000000217818000009/a4q2017exhibit104.htm").read()

# Two major html layouts: <p> or <div> tags for paragraphs
if '<p' in html or '<P' in html:
    elem_regex = re.compile('<[Pp][^>]*>(.*?)</[Pp]>', re.S)
else:
    elem_regex = re.compile('<div[ >].*?</div>', re.S)

# Label highlighting is either <u> or <font> tag
u_tag, font_tag = False, False
if '<u>' in html or '<U>' in html:
    u_tag = True
elif '<font' in html and ('underline' in html or 'bold' in html):
    font_tag = True

ps = elem_regex.findall(html)

In [33]:
from bs4 import BeautifulSoup

def extract_text_from_html(html_file_path):
    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    text = soup.get_text(separator='\n', strip=True)

    return text

text = extract_text_from_html("C:/Users/seang/OneDrive/New Folder/data/2018/QTR1/000000217818000009/a4q2017exhibit104.htm")

In [36]:
import os

def find_html_files(directory):
    html_files = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.htm'):
                html_files.append(os.path.join(root, file))
    return html_files

# Replace 'your_directory' with the actual path to your root directory
directory_to_search = 'C:/Users/seang/OneDrive/New Folder/data/'
html_files_found = find_html_files(directory_to_search)

len(html_files_found)


58159

In [38]:
from tqdm import tqdm

In [39]:
results = []
for file in tqdm(html_files_found):
    text = extract_text_from_html(file)
    file_contents = text.split("\n")
    file_contents = [item for item in file_contents if item.strip() != "\n"]
    results.extend(file_contents[:10])

100%|██████████| 58159/58159 [1:16:29<00:00, 12.67it/s]  


In [37]:
import pandas as pd

In [43]:
def clean_text(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)

In [54]:
def clean_text(text):
    # Remove characters outside the range of printable ASCII characters
    cleaned_text = ''.join(char for char in text if 32 <= ord(char) < 127)
    return cleaned_text

In [55]:
cleaned_data = [clean_text(text) for text in results]
df = pd.DataFrame(cleaned_data)

In [56]:
df.to_excel('./doc_type_training.xlsx')

In [1]:
%pip install xgboost

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.comNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting xgboost
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
     --------------------------------------- 99.8/99.8 MB 28.5 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.3
