In [11]:
import json
import shutil
import time
import warnings
from datetime import datetime
from io import BytesIO
from pathlib import Path
from pprint import pprint
from zipfile import ZipFile, BadZipFile

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
import pandas_datareader.data as web
import requests
import seaborn as sns
from tqdm import tqdm

In [12]:
# Notebook-wide display settings.
# Every Python warning is silenced from here on, so deprecation notices
# raised by the libraries used below will not be shown.
warnings.filterwarnings(action='ignore')
# White-grid seaborn theme for all figures.
sns.set_style(style='whitegrid')

## SEC Edgar Data

- EDGAR access policy: https://www.sec.gov/os/accessing-edgar-data
- EDGAR FSN data: https://www.sec.gov/about/divisions-offices/division-economic-risk-analysis/data/financial-statement-and-notes-data-set
- EDGAR FS data: https://www.sec.gov/dera/data/financial-statement-data-sets

In [26]:
def download_FSN_from_sec(url, path, timeout=60):
    """Download one SEC notes archive and unpack it into ``path``.

    The archive is requested with an identifying ``User-Agent`` header and
    unpacked directly from memory. Members that already exist under ``path``
    are skipped, so a run can be repeated or resumed without re-extracting.

    Parameters
    ----------
    url : str
        Full URL of the ``.zip`` archive.
    path : pathlib.Path or str
        Directory that receives the extracted members.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled connection cannot block
        the caller indefinitely. Defaults to 60 seconds.

    Notes
    -----
    A non-200 response or a corrupt archive is reported with ``print`` and
    the function returns without raising. Exceptions raised by ``requests``
    itself (connection errors, timeouts) propagate to the caller.
    """
    path = Path(path)

    # Declare user agent in request headers
    headers = {
        'User-Agent': 'xikest12@gmail.com',
        'Accept-Encoding': 'gzip, deflate',
        'Host': 'www.sec.gov'
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to download file. Status code: {response.status_code}")
        # Error pages can be large; a short prefix is enough to diagnose.
        print("Response content:", response.content[:500])
        return

    root = path.resolve()
    try:
        # Decompress straight from the in-memory payload; no copy of the
        # archive itself is written to disk.
        with ZipFile(BytesIO(response.content)) as zip_file:
            for member in zip_file.infolist():
                if member.is_dir():
                    continue

                local_file = path / member.filename
                # Refuse members whose name would resolve outside ``path``.
                if root not in local_file.resolve().parents:
                    print(f'\nSkipping unsafe archive member: {member.filename}\n')
                    continue
                if local_file.exists():
                    continue

                local_file.parent.mkdir(parents=True, exist_ok=True)
                # Write under a temporary name and rename when complete, so an
                # interrupted run never leaves a truncated file that the
                # "already exists" check above would later skip.
                partial = local_file.with_name(local_file.name + '.part')
                with zip_file.open(member) as source, partial.open('wb') as output:
                    shutil.copyfileobj(source, output)
                partial.replace(local_file)
    except BadZipFile:
        print(f'\nBad zip file: {url}\n')
        return

    # Sleep to comply with request rate limit
    time.sleep(0.1)  # Adjust as needed


def download_SEC_reports(data_path='data', start_date='2009', end_date='2020-11-30', freq='Q'):
    """Download every notes archive between two dates.

    One archive is requested per period. Each is unpacked by
    ``download_FSN_from_sec`` into ``<data_path>/<period>/source``, where
    ``<period>`` is ``{year}_{quarter}q`` for quarterly data and
    ``{year}_{month:02}`` for monthly data.

    Parameters
    ----------
    data_path : str or pathlib.Path
        Root directory for the downloaded data.
    start_date, end_date : str
        Bounds handed to ``pandas.date_range``. A period is included only
        when its last day falls inside ``[start_date, end_date]``.
    freq : {'Q', 'M'}
        ``'Q'`` for quarterly archives, ``'M'`` for monthly archives.

    Raises
    ------
    ValueError
        If ``freq`` is neither ``'Q'`` nor ``'M'``.
    """
    SEC_URL = 'https://www.sec.gov/'
    FSN_PATH = 'files/dera/data/financial-statement-and-notes-data-sets/'

    # Accept plain strings (including the default) as well as Path objects;
    # the ``/`` joins below require a Path.
    data_path = Path(data_path)

    # Offset objects are used instead of the 'Q' / 'M' string aliases, which
    # newer pandas releases deprecate; they generate the same period-end dates.
    if freq == 'Q':
        quarter_ends = pd.date_range(start_date, end_date,
                                     freq=pd.offsets.QuarterEnd(startingMonth=12))
        periods = [(d.year, d.quarter) for d in quarter_ends]
    elif freq == 'M':
        month_ends = pd.date_range(start_date, end_date, freq=pd.offsets.MonthEnd())
        periods = [(d.year, d.month) for d in month_ends]
    else:
        raise ValueError("Invalid frequency. Please use 'Q' for quarters or 'M' for months.")

    for yr, time_period in tqdm(periods):
        if freq == 'Q':
            folder = f'{yr}_{time_period}q'
            filing = f'{yr}q{time_period}_notes.zip'
        else:
            folder = f'{yr}_{time_period:02}'
            filing = f'{yr}_{time_period:02}_notes.zip'

        # Set (and create) directory
        path = data_path / folder / 'source'
        path.mkdir(parents=True, exist_ok=True)

        url = SEC_URL + FSN_PATH + filing
        download_FSN_from_sec(url, path)

In [27]:
# Storage root and the dataset folder beneath it.
# NOTE(review): "E:" has no trailing separator, so on Windows the resulting
# path is relative to the current directory of drive E — confirm this is
# intended (use "E:/" for the drive root).
STORAGE_PATH = "E:"
data_path = Path(STORAGE_PATH) / 'edgar' / 'fsn'
# parents=True: the intermediate 'edgar' folder may not exist yet.
data_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Quarterly data: one archive per quarter whose last day lies between
# the start of 2009 and 2020-11-30.
download_SEC_reports(
    data_path,
    start_date='2009',
    end_date='2020-11-30',
    freq='Q',
)

 23%|██████████████████████████████▍                                                                                                   | 11/47 [03:16<20:14, 33.73s/it]

In [None]:
# Monthly data: one archive per month from October 2020 through
# November 2023.
download_SEC_reports(
    data_path,
    start_date='2020-10-1',
    end_date='2023-11-30',
    freq='M',
)

  0%|                                                                                                                                                                                                                                                                                       | 0/38 [00:00<?, ?it/s]

https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/2020_10_notes.zip


  3%|███████                                                                                                                                                                                                                                                                     | 1/38 [01:52<1:09:31, 112.75s/it]

https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/2020_11_notes.zip


  5%|██████████████                                                                                                                                                                                                                                                              | 2/38 [05:45<1:49:54, 183.18s/it]

https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/2020_12_notes.zip


  8%|█████████████████████▏                                                                                                                                                                                                                                                      | 3/38 [06:48<1:14:53, 128.39s/it]

https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/2021_01_notes.zip


 11%|████████████████████████████▌                                                                                                                                                                                                                                                  | 4/38 [07:20<51:13, 90.40s/it]

https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/2021_02_notes.zip


 13%|███████████████████████████████████▎                                                                                                                                                                                                                                        | 5/38 [11:57<1:26:41, 157.62s/it]

https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/2021_03_notes.zip


 16%|██████████████████████████████████████████▎                                                                                                                                                                                                                                 | 6/38 [16:53<1:49:13, 204.79s/it]

https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/2021_04_notes.zip


 18%|█████████████████████████████████████████████████▎                                                                                                                                                                                                                          | 7/38 [20:41<1:49:38, 212.21s/it]

https://www.sec.gov/files/dera/data/financial-statement-and-notes-data-sets/2021_05_notes.zip


In [None]:
# Bundle the downloaded data into a single archive and hand it to the browser.
# NOTE(review): this cell imports google.colab and uses /content/... paths,
# whereas data_path earlier in the notebook is built from "E:" — confirm
# which environment the notebook is meant to run in and align the paths.
from google.colab import files
# Recursively zip /content/edgar/fsn into /content/edgar/fsn.zip (shell command).
!zip -r /content/edgar/fsn.zip /content/edgar/fsn
# Pass the archive path to google.colab's files.download.
files.download('/content/edgar/fsn.zip')

---