<a href="https://colab.research.google.com/github/tobytoyin/sec-10q-msc-report/blob/main/1_Download_10_Q_Reports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, I use BeautifulSoup and `requests` to download all the SEC 10-Q filings for the selected companies from EDGAR.

In [None]:
import requests
import pandas as pd 
from bs4 import BeautifulSoup
import time
import numpy as np
import os 
import re

## load the list of companies selected for the study
selected_companies = pd.read_csv(
    '/content/drive/MyDrive/Aston/Term 3/data/selected_companies.csv'
)

# filing records saved by earlier runs, plus per-symbol non-null counts
# (used further down to decide which companies can be skipped)
extracted_records = pd.read_csv(
    '/content/drive/MyDrive/Aston/Term 3/data/edgar-filing-new.csv'
)
extracted_companies = extracted_records.groupby('sym').count()

In [None]:
# Scrape the first table of a public free-proxy listing and turn every
# complete row into an "https://<ip>:<port>" string.
resp = requests.get('https://free-proxy-list.net/')
df = pd.read_html(resp.text)[0].dropna()
proxies = [
    f'https://{ip}:{port}'
    for ip, port in zip(df['IP Address'].astype(str),
                        df['Port'].astype(int).astype(str))
]
# counter that polite_try() advances on every failed request
get_proxy = 0

In [None]:
def polite_try(url, try_n=10, sleep=5, **kwargs):
  """GET `url`, retrying politely until the server answers with HTTP 200.

  One initial attempt is made, followed by up to `try_n` retries, pausing
  `sleep` seconds between consecutive attempts. No pause happens after the
  last attempt, since nothing follows it.

  Args:
    url: address to request.
    try_n: maximum number of retries after the first attempt.
    sleep: seconds to wait before each retry.
    **kwargs: forwarded unchanged to `requests.get` (e.g. `params`,
      `headers`, `timeout`).

  Returns:
    The first response whose status code is 200, or None when every
    attempt failed.

  Side effects:
    Increments the module-level counter `get_proxy` once per failed
    attempt; that global must already exist when a failure occurs.
  """
  global get_proxy
  attempts = try_n + 1

  for attempt in range(1, attempts + 1):
    try:
      response = requests.get(url, **kwargs)
    except requests.RequestException as err:
      # a dropped connection or timeout is retried like a bad status code
      failure = repr(err)
    else:
      # check if the request is okay
      if response.status_code == 200:
        return response
      failure = response.status_code

    get_proxy += 1

    if attempt == attempts:
      # out of retries: report and stop without a pointless final sleep
      print(f'--- failure - giving up after {attempts} attempts')
      print(failure)
      break

    print(f'--- failure - retry ({attempt}/{try_n})')
    print(failure)

    # wait `sleep` seconds before the next try
    time.sleep(sleep)

  return None


### A class to download from Edgar
class Downloader:
  """Fetch the EDGAR filing listing of one company as an Atom feed.

  Attributes:
    sym: ticker symbol of the company.
    cik: the company's CIK, sent as the `CIK` query parameter.
    type_: form type to list, sent as the `type` query parameter
      (e.g. '10-Q').
  """

  def __init__(self, sym, cik, type_):
    self.sym = sym
    self.cik = cik
    self.type_ = type_

  @property
  def params(self):
    """Query parameters for the browse-edgar request.

    The date window (`datea`/`dateb`), page size (`count`) and output
    format (`atom`) are fixed; only CIK and form type vary per instance.
    """
    return {
      'action': 'getcompany',
      'CIK': self.cik,
      'type': self.type_,
      'datea': '20000101',
      'dateb': '20200101',
      'owner': 'include',
      'count': '100',
      'output': 'atom',
    }

  def request_data(self):
    """Request the filing listing and return the raw response body.

    Returns:
      The response content (bytes) of the listing request.

    Raises:
      RuntimeError: when polite_try() exhausted its retries and returned
        None, so there is no response to read.
    """
    res = polite_try('https://www.sec.gov/cgi-bin/browse-edgar', params=self.params)
    if res is None:
      # fail with a clear message instead of an AttributeError on None
      raise RuntimeError(
        f'could not retrieve the {self.type_} listing for '
        f'{self.sym} (CIK {self.cik})'
      )
    print(res.url)

    return res.content

# html = Downloader('AMD', '2488', '10-Q').request_data()

In [None]:
### Class to parse the table and select required information
class DownloadFiling:
  """Parse an EDGAR Atom listing and download every 10-Q it contains.

  Attributes:
    sym: ticker symbol, used for the sub-directory and file-name prefix.
    cik: the company's CIK, used to build the archive URL.
    response_html: raw listing markup handed to BeautifulSoup.
    save_dir: root directory; files are written to `<save_dir>/<sym>/`.
    records: the most recent table produced by get_records()/download(),
      or None before either has run.
  """

  # column order of the table returned by get_records() and download()
  RECORD_COLUMNS = [
    'sym', 'cik', 'type', 'acc_num', 'filing_date', 'filing_dir',
    'filing_url', 'downloaded', 'section_2_path', 'section_2_length',
    'section_1a_path', 'section_1a_length', 'report_date',
  ]

  def __init__(self, sym, cik, response_html,
               save_dir='/content/drive/MyDrive/Aston/Term 3/data/10-Q-original'):
    self.sym = sym
    self.cik = cik
    self.response_html = response_html
    self.save_dir = save_dir
    self.records = None

  def gen_filing_url(self, acc_num):
    """Build the direct link to a filing's .txt file.

    The archive path uses the accession number twice: once with the
    dashes removed (directory) and once unchanged (file name).
    """
    url = 'https://www.sec.gov/Archives/edgar/data/{cik}/{acc_num_long}/{acc_num}.txt'

    acc_num_long = acc_num.replace('-', '')
    return url.format(cik=self.cik, acc_num_long=acc_num_long, acc_num=acc_num)

  def get_records(self):
    """Extract one row per 10-Q entry of the listing.

    Entries whose filing type is not exactly '10-Q' (e.g. '10-Q/A') or
    that carry no filing type at all are skipped.

    Returns:
      A DataFrame with the columns in RECORD_COLUMNS and a default
      0..n-1 index. `downloaded` is left as NaN and the section/report
      columns hold their placeholder values. The frame is empty (but
      keeps its columns) when nothing matches. It is also stored on
      `self.records`.
    """
    # retrieve the index
    soup = BeautifulSoup(self.response_html, 'lxml')

    # collect plain dicts and build the frame once, rather than growing a
    # DataFrame row by row (DataFrame.append no longer exists in pandas 2)
    rows = []
    for entry in soup('entry'):
      filing_type = entry.find('filing-type')
      # only focus on the 10-Q not 10-Q/A
      if filing_type is None or filing_type.text != '10-Q':
        continue

      acc_num = entry.find('accession-number').text
      rows.append({
        'sym': self.sym,
        'cik': self.cik,
        'type': filing_type.text,
        'acc_num': acc_num,
        'filing_date': entry.find('filing-date').text,
        'filing_dir': entry.find('filing-href').text,
        'filing_url': self.gen_filing_url(acc_num),
        'section_2_path': None,
        'section_2_length': 0,
        'section_1a_path': None,
        'section_1a_length': 0,
        'report_date': None,
      })

    records = pd.DataFrame(rows, columns=self.RECORD_COLUMNS)
    self.records = records
    return records

  def download(self):
    """Retrieve the records and download each filing to disk.

    Files are written to `<save_dir>/<sym>/<sym>-<accession>.txt`; the
    directory (including missing parents) is created when needed.

    Returns:
      A copy of the get_records() table whose `downloaded` column holds
      the saved file name, or NaN for a filing that could not be
      fetched. The frame is also stored on `self.records`.
    """
    records = self.get_records()

    # make dir
    dl_dir = f'{self.save_dir}/{self.sym}'
    print(dl_dir)
    os.makedirs(dl_dir, exist_ok=True)

    # one entry per record: saved file name, or NaN when not downloaded
    saved_names = []
    for filing_url in records['filing_url']:
      save_name = np.nan

      # request the filing; polite_try gives None once it runs out of retries
      response = polite_try(filing_url)
      if response is None:
        print(f'--- not downloaded - {filing_url}')
      else:
        print(response.status_code)
        if response.status_code == 200:
          save_name = self.sym + '-' + filing_url.split('/')[-1]
          print(f'Downloaded - {dl_dir}/{save_name}')
          # write file
          with open(f'{dl_dir}/{save_name}', 'wb') as f:
            f.write(response.content)

      saved_names.append(save_name)

    # completion records
    dl_records = records.copy()
    dl_records['downloaded'] = saved_names
    self.records = dl_records

    return dl_records

In [None]:
# extracted_records = pd.DataFrame()
# `i` counts the companies actually processed; it drives the periodic save
i = 0
for idx, company in selected_companies.iterrows():
  symbol = company['Symbol']

  # skip a company once the saved records already hold at least 10 entries
  # for it (the count is taken from the first column of the grouped table)
  if (symbol in extracted_companies.index and
      extracted_companies.loc[symbol].iloc[0] >= 10):
    print(symbol, '---pass')
    continue

  # download list of links
  html = Downloader(symbol, company['CIK'], '10-Q').request_data()

  # download the files
  downloader = DownloadFiling(symbol, company['CIK'], html)
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
  extracted_records = pd.concat([extracted_records, downloader.download()])

  # checkpoint the records every fifth processed company
  if i % 5 == 0:
    extracted_records.to_csv('/content/drive/MyDrive/Aston/Term 3/data/edgar-filing-new.csv')

  i += 1

MMM ---pass
ABT ---pass
ABBV ---pass
ABMD ---pass
ACN ---pass
ATVI ---pass
ADBE ---pass
AMD ---pass
AAP ---pass
AES ---pass
A ---pass
APD ---pass
AKAM ---pass
ALK ---pass
ALB ---pass
ARE ---pass
ALXN ---pass
ALGN ---pass
ALLE ---pass
LNT ---pass
GOOGL ---pass
GOOG ---pass
MO ---pass
AMZN ---pass
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=1748790&type=10-Q&datea=20000101&dateb=20200101&owner=include&count=100&output=atom
/content/drive/MyDrive/Aston/Term 3/data/10-Q-original/AMCR
--- failure - retry (1/10)
403
200
Downloaded - /content/drive/MyDrive/Aston/Term 3/data/10-Q-original/AMCR/AMCR-0001748790-19-000007.txt
200
Downloaded - /content/drive/MyDrive/Aston/Term 3/data/10-Q-original/AMCR/AMCR-0001104659-19-028242.txt
AEE ---pass
AAL ---pass
AEP ---pass
AMT ---pass
AWK ---pass
ABC ---pass
AME ---pass
AMGN ---pass
APH ---pass
ADI ---pass
ANSS ---pass
ANTM ---pass
AOS ---pass
APA ---pass
AAPL ---pass
AMAT ---pass
APTV ---pass
ADM ---pass
ANET ---pass
T ---pass
ATO --

In [None]:
# final save, so records gathered since the last in-loop checkpoint are kept
extracted_records.to_csv(
    path_or_buf='/content/drive/MyDrive/Aston/Term 3/data/edgar-filing-new.csv'
)