In [2]:
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

from ratelimit import limits, sleep_and_retry

In [3]:
import Edgar_scrapper

In [4]:
import re

In [293]:
# Let displayed DataFrame cells show up to 200 characters before pandas truncates them.
pd.options.display.max_colwidth = 200

In [5]:
# Shared client from the project-local Edgar_scrapper module; the cells below
# fetch every SEC URL through its .get() method.
# NOTE(review): presumably this wrapper rate-limits requests to sec.gov — confirm
# in Edgar_scrapper.
edgar_access = Edgar_scrapper.EdgarAccess()

In [280]:
def get_fillings(fillings_ticker, fillings_cik, doc_type, start=0, count=60):
    """Fetch one page of a company's EDGAR filing index and list its entries.

    Queries the SEC ``browse-edgar`` Atom feed through the module-level
    ``edgar_access`` client and parses the response with BeautifulSoup.

    Args:
        fillings_ticker: Ticker symbol; copied unchanged into every result tuple.
        fillings_cik: Company CIK used in the query; also copied into every tuple.
        doc_type: Form type to filter on (the notebook passes ``'10-K'``).
        start: Value of the feed's ``start`` query parameter.
        count: Value of the feed's ``count`` query parameter.

    Returns:
        A list of ``(ticker, cik, filing_date, filing_href)`` tuples, one per
        ``<entry>`` element of the feed, in feed order.

    Raises:
        AttributeError: If an entry has no ``filing-date`` or ``filing-href``
            child (``find`` returns ``None`` in that case).
    """
    # Adjacent literals are joined at compile time. The previous backslash
    # continuation sat *inside* the string literal, which embedded the next
    # line's indentation in the URL right after the CIK value.
    fillings_url = (
        'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}'
        '&type={}&start={}&count={}&owner=exclude&output=atom'
    ).format(fillings_cik, doc_type, start, count)
    fillings_html = edgar_access.get(fillings_url)
    fillings_soup = BeautifulSoup(fillings_html, features="html.parser")

    return [
        (fillings_ticker,
         fillings_cik,
         entry.find('filing-date').getText(),
         entry.find('filing-href').getText())
        for entry in fillings_soup.find_all('entry')]

In [281]:
# Read tickers.csv from the working directory (presumably the ticker -> CIK table).
# NOTE(review): nothing in the visible cells reads ticker_ciks — the filing loop
# iterates sample_ticker instead; confirm whether later cells need it.
ticker_ciks = pd.read_csv('tickers.csv')

In [282]:
# One-row frame in the (ticker, cik) layout that the filing loop unpacks.
sample_ticker = pd.DataFrame(
    {
        'ticker': ['AMC'],
        'cik': [1411579],
    }
)

In [283]:
# Empty frame that fixes the column layout for the filing rows collected below.
sec_fillings_df = pd.DataFrame(
    columns=[
        'ticker',
        'cik',
        'date',
        'annual_report_url',
    ],
)

In [284]:
# Build one frame per company, then concatenate once. DataFrame.append was
# removed in pandas 2.0, and appending inside the loop re-copied the growing
# frame on every iteration. No temporary survives the cell, so there is no
# `del(temp)` left to fail when the ticker selection is empty.
# NOTE: as before, the rows are added to the existing sec_fillings_df, so
# re-running this cell without re-creating that frame first duplicates them.
filling_frames = [
    pd.DataFrame(data=get_fillings(ticker, cik, '10-K'),
                 columns=['ticker', 'cik', 'date', 'annual_report_url'])
    for ticker, cik in sample_ticker[:1].values
]
sec_fillings_df = pd.concat([sec_fillings_df, *filling_frames], ignore_index=True)

In [285]:
# Point each row at the filing's full-text submission instead of its index page:
# "...-index.htm" / "...-index.html"  ->  "....txt".
# A single pattern with the dot escaped and an optional trailing "l" replaces the
# former two-pass rewrite, whose unescaped "." matched any character and whose
# second pass only existed to trim the "l" left behind by "-index.html".
sec_fillings_df['annual_report_url'] = sec_fillings_df['annual_report_url'].replace(
    r'-index\.html?', '.txt', regex=True)

In [286]:
# Inspect the collected filings and their rewritten annual_report_url values.
sec_fillings_df

Unnamed: 0,ticker,cik,date,annual_report_url
0,AMC,1411579,2021-03-12,https://www.sec.gov/Archives/edgar/data/1411579/000141157921000006/0001411579-21-000006.txt
1,AMC,1411579,2020-02-28,https://www.sec.gov/Archives/edgar/data/1411579/000141157920000027/0001411579-20-000027.txt
2,AMC,1411579,2019-03-01,https://www.sec.gov/Archives/edgar/data/1411579/000141157919000013/0001411579-19-000013.txt
3,AMC,1411579,2018-03-01,https://www.sec.gov/Archives/edgar/data/1411579/000141157918000014/0001411579-18-000014.txt
4,AMC,1411579,2017-03-10,https://www.sec.gov/Archives/edgar/data/1411579/000141157917000021/0001411579-17-000021.txt
5,AMC,1411579,2016-03-08,https://www.sec.gov/Archives/edgar/data/1411579/000104746916010880/0001047469-16-010880.txt
6,AMC,1411579,2015-03-10,https://www.sec.gov/Archives/edgar/data/1411579/000104746915001918/0001047469-15-001918.txt
7,AMC,1411579,2014-03-04,https://www.sec.gov/Archives/edgar/data/1411579/000104746914001769/0001047469-14-001769.txt


In [450]:
# Add a placeholder column (one None per row) that the download loop fills in.
sec_fillings_df['filling_text'] = [None for _ in range(len(sec_fillings_df))]

In [680]:
# Download every filing's full-text submission and keep its 10-K document in
# the 'filling_text' column (created by the cell above).
for index, row in tqdm(sec_fillings_df.iterrows(), desc='Downloading Fillings',
                       unit='filling', total=len(sec_fillings_df)):
    report_txt = edgar_access.get(row['annual_report_url'])
    # NOTE(review): "html" names a parser *feature*, so BeautifulSoup uses
    # whichever HTML parser happens to be installed, while get_fillings pins
    # "html.parser". Left unchanged because switching parsers can alter the
    # resulting tree — confirm which parser is intended and pin it.
    report_soup = BeautifulSoup(report_txt, "html")
    # HTML parsers lower-case tag names, so the submission's <TYPE> markers
    # have to be looked up as 'type'.
    for document in report_soup.find_all('type'):
        # Line 2 of prettify() is the tag's first child, i.e. the form type
        # written right after <TYPE>; keep documents whose type starts with 10-K.
        if re.match(r'\s+10-K', document.prettify().splitlines()[1]):
            # .at writes into the frame itself by label. The former chained
            # `.iloc[index]['filling_text'] = ...` assigned to an intermediate
            # row object, which pandas does not guarantee to propagate, and
            # used the iterrows() label as a position.
            # If several documents match, the last one wins (as before).
            sec_fillings_df.at[index, 'filling_text'] = document

Downloading Fillings: 100%|██████████| 8/8 [01:24<00:00, 10.59s/filling]


In [632]:
# deep=True asks pandas to size the Python objects stored in object-dtype
# columns; the default only counts the references to them, which is how this
# cell came to report 0.00Mb.
# NOTE(review): the per-object size is still shallow, so the parsed documents
# in 'filling_text' are likely under-reported even with deep=True.
print("Memory consumption of sec_fillings_df is {:.2f}Mb".format(
    sec_fillings_df.memory_usage(deep=True).sum() / 1024**2))

Memory consumption of sec_fillings_df is 0.00Mb


In [692]:
# Inspect the frame again, now including the 'filling_text' column.
sec_fillings_df

Unnamed: 0,ticker,cik,date,annual_report_url,filling_text
0,AMC,1411579,2021-03-12,https://www.sec.gov/Archives/edgar/data/1411579/000141157921000006/0001411579-21-000006.txt,"[10-K\n, [1\n, [amc-20201231x10k.htm\n, <description>10-K\n<text>\n<xbrl>\n<?xml version='1.0' encoding='UTF-8'?>\n<!-- iXBRL document created with: Toppan Merrill Bridge iXBRL 9.6.7713.40453 -->\..."
1,AMC,1411579,2020-02-28,https://www.sec.gov/Archives/edgar/data/1411579/000141157920000027/0001411579-20-000027.txt,"[10-K\n, [\n, [\n, xml version='1.0' encoding='UTF-8', \n, iXBRL document created with: Toppan Merrill Bridge iXBRL 9.5.7293.39921 , \n, Based on: iXBRL 1.1 , \n, Created on: 2/27/2020 11:12:00..."
2,AMC,1411579,2019-03-01,https://www.sec.gov/Archives/edgar/data/1411579/000141157919000013/0001411579-19-000013.txt,"[10-K\n, [\n, HTML document created with Merrill Bridge 9.0.0.120, \n, Created on: 3/1/2019 6:13:18 AM, \n, [\n, <head>\n<title>\n\t\t\tamch_Current folio_10K\n\t\t</title>\n</head>, \n, <body><d..."
3,AMC,1411579,2018-03-01,https://www.sec.gov/Archives/edgar/data/1411579/000141157918000014/0001411579-18-000014.txt,"[10-K\n, [\n, HTML document created with Merrill Bridge 7.3.329.0, \n, Created on: 3/1/2018 8:02:20 AM, \n, [\n, <head>\n<title>\n\t\t\tamch_Current folio_10K\n\t\t</title>\n</head>, \n, <body><d..."
4,AMC,1411579,2017-03-10,https://www.sec.gov/Archives/edgar/data/1411579/000141157917000021/0001411579-17-000021.txt,"[10-K\n, [\n, HTML document created with Merrill Bridge 6.4.50.0, \n, Created on: 3/10/2017 3:52:09 PM, \n, [\n, <head>\n<title>\n\t\t\tamch_Current folio_10K\n\t\t</title>\n</head>, \n, <body><d..."
5,AMC,1411579,2016-03-08,https://www.sec.gov/Archives/edgar/data/1411579/000104746916010880/0001047469-16-010880.txt,
6,AMC,1411579,2015-03-10,https://www.sec.gov/Archives/edgar/data/1411579/000104746915001918/0001047469-15-001918.txt,
7,AMC,1411579,2014-03-04,https://www.sec.gov/Archives/edgar/data/1411579/000104746914001769/0001047469-14-001769.txt,


In [693]:
# Use the first row's parsed document as the working sample.
sample_report_txt = sec_fillings_df['filling_text'].iloc[0]

In [694]:
# Text of every <p> tag whose .string is set and whose text starts with a word
# character. `string=` is the current spelling of the filter that used to be
# passed as `text=` (a deprecated alias); the text is extracted once per tag
# instead of twice.
result = [
    paragraph
    for paragraph in (p_tag.getText()
                      for p_tag in sample_report_txt.find_all('p', string=True))
    if re.match(r'\w+', paragraph)
]

In [698]:
# Show up to the first 2000 extracted paragraph strings.
result[:2000]

['Table of Contents',
 'UNITED STATES',
 'SECURITIES AND EXCHANGE COMMISSION',
 'Washington, D.C. 20549',
 'ANNUAL REPORT PURSUANT TO SECTION\xa013 OR 15(d) OF THE SECURITIES EXCHANGE\xa0ACT OF\xa01934',
 'OR',
 'TRANSITION REPORT PURSUANT TO SECTION\xa013 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF\xa01934',
 'For the transition period from                  to                 ',
 'AMC ENTERTAINMENT HOLDINGS,\xa0INC.',
 'Registrant’s telephone number, including area code:',
 'Securities registered pursuant to Section 12(b) of the Act:',
 'Title of each class',
 'Trading Symbol',
 'Name of each exchange on which registered',
 'Class A common stock',
 'AMC',
 'New York Stock Exchange',
 'Indicate by check mark whether the registrant is a large accelerated filer, an accelerated filer, a non-accelerated filer, a smaller reporting company, or an emerging growth company. See definitions of “large accelerated filer,” “accelerated filer,” “smaller reporting company,” and emerging growth company