In [1]:
import requests
import urllib
from urllib.request import Request, urlopen, urlretrieve

from __future__ import print_function
import multiprocessing
import os
import datetime
import zipfile
import tempfile
import logging
import os.path
import sys
import io
import time

# Column delimiter used when splitting rows of the EDGAR master index.
SEP = "|"
# Base URL that every archive path in this notebook is appended to.
EDGAR_PREFIX = "https://www.sec.gov/Archives/"
# Per-download time budget in milliseconds; download_index sleeps for the
# remainder whenever a download finishes faster than this.
REQUEST_BUDGET_MS = 200
# True when the interpreter's major version is 3 or newer.
IS_PY3 = sys.version_info >= (3,)

In [2]:
# Folder that receives the filtered quarterly index (.tsv) files.
path = "download_master"
# makedirs(..., exist_ok=True) keeps this cell re-runnable: os.mkdir raises
# FileExistsError as soon as the folder is left over from an earlier run.
os.makedirs(path, exist_ok=True)

In [3]:
def _quarterly_idx_list(since_year=1995, until_year=2021):
    """
    Build the list of quarterly master-index archives to fetch from EDGAR.

    Parameters:
        since_year: first year to include (inclusive).
        until_year: last year to include (inclusive). Defaults to 2021, which
            reproduces the range that used to be hard-coded here.

    Returns:
        A list of (url, dest_name) tuples ordered oldest quarter first, with
        all four quarters listed for every year. `url` points at the
        quarter's master.zip under EDGAR_PREFIX and `dest_name` has the form
        "<year>-QTR<n>.tsv".

    Note: nothing here checks that a quarter has actually been published, so
    passing the current year as `until_year` also lists future quarters.
    """
    logging.debug("downloading files since %s", since_year)
    quarters = ("QTR1", "QTR2", "QTR3", "QTR4")

    return [
        (
            EDGAR_PREFIX + "edgar/full-index/%s/%s/master.zip" % (year, quarter),
            "%s-%s.tsv" % (year, quarter),
        )
        for year in range(since_year, until_year + 1)
        for quarter in quarters
    ]


def _append_line(line):
    """
    Pass an index row through unchanged when its form type is exactly "8-K".

    The row is split on SEP and the third field is inspected. Rows with any
    other form type yield None; rows with fewer than three fields raise
    IndexError.
    """
    form_type = line.split(SEP)[2]
    return line if form_type == "8-K" else None

def _is_8k(line):
    chunks = line.split("|")
    if chunks[2]=="8-K":
        return True
    return False

def _skip_header(f):
    for x in range(0, 11):
        f.readline()

In [4]:
def _url_get(url, user_agent):
    content = None
    if IS_PY3:
        # python 3
        import urllib.request
        hdr = { 'User-Agent' : user_agent }
        req = urllib.request.Request(url, headers=hdr)
        content =urllib.request.urlopen(req).read()
    else:
        # python 2
        import urllib2

        content = urllib2.urlopen(url).read()
    return content

def _download(file, dest, skip_file, user_agent):
    """
    Download one quarterly index archive from EDGAR and keep only its 8-K rows.

    Parameters:
        file: (url, dest_name) pair; `url` must point at a zip archive that
            contains a "master.idx" member.
        dest: directory the filtered index is written into.
        skip_file: when True, do nothing if the destination file already
            exists.
        user_agent: User-Agent header value passed on to _url_get.

    Raises:
        ValueError: if `url` does not end in "zip". (The previous
            `raise logging.error(...)` raised None, which fails with a
            TypeError.)

    The archive is spooled to a temporary file, the first 11 lines of
    master.idx are skipped, and every remaining row accepted by _is_8k is
    written to dest/dest_name, one per line, followed by a final newline.
    """
    url, dest_name = file[0], file[1]
    target = os.path.join(dest, dest_name)

    if skip_file and os.path.exists(target):
        logging.info("> Skipping %s" % (dest_name))
        return

    if not url.endswith("zip"):
        message = "python-edgar only supports zipped index files"
        logging.error(message)
        raise ValueError(message)

    with tempfile.TemporaryFile(mode="w+b") as tmp:
        tmp.write(_url_get(url, user_agent))
        # Close the archive as well as the member opened from it.
        with zipfile.ZipFile(tmp) as archive:
            with archive.open("master.idx") as z:
                _skip_header(z)
                raw = z.read()

    # latin-1 maps every byte to a character, so decoding cannot fail; the
    # text file below needs str/unicode on both Python 2 and Python 3.
    text = raw.decode("latin-1")
    kept = [line for line in text.splitlines() if _is_8k(line)]

    # Open the destination only after the archive was parsed, so a failed
    # download never leaves a truncated file that skip_file would later
    # mistake for a finished one.
    with io.open(target, "w", encoding="utf-8") as idxfile:
        idxfile.write("\n".join(kept) + "\n")

In [5]:
def _get_millis():
    return round(time.time() * 1000)

def download_index(dest, since_year, user_agent, skip_all_present_except_last=False):
    """
    Convenient method to download all quarterly index files at once.

    Parameters:
        dest: directory the index files are written into; created if missing.
        since_year: first year to fetch, forwarded to _quarterly_idx_list.
        user_agent: User-Agent header value used for every request.
        skip_all_present_except_last: when True, index files that already
            exist in `dest` are skipped, except for the most recent quarter,
            which is always downloaded again.

    Downloads run one after another. Each iteration is padded with a sleep so
    that it takes at least REQUEST_BUDGET_MS milliseconds.
    """
    if not os.path.exists(dest):
        os.makedirs(dest)

    tasks = _quarterly_idx_list(since_year)
    logging.info("%d index files to retrieve", len(tasks))
    # _quarterly_idx_list returns the quarters oldest first, so the most
    # recent one -- the one that must always be refreshed -- is the last entry.
    last_index = len(tasks) - 1
    for i, file in enumerate(tasks):
        skip_file = skip_all_present_except_last and i != last_index
        # naive: 200ms or 5QPS serialized
        start = _get_millis()
        _download(file, dest, skip_file, user_agent)
        elapsed = _get_millis() - start
        if elapsed < REQUEST_BUDGET_MS:
            sleep_for = REQUEST_BUDGET_MS - elapsed
            logging.info("sleeping for %dms because we are going too fast (previous request took %dms)", sleep_for, elapsed)
            # 1000.0 forces true division regardless of the Python version.
            time.sleep(sleep_for / 1000.0)

    logging.info("complete")

In [7]:
## Please check your user agent and make changes accordingly
download_index(
    dest="download_master",
    since_year=1995,
    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    skip_all_present_except_last=False,
)

In [8]:
# Folder that receives one cleaned text file per scraped 8-K filing.
path = "8K_filing"
# makedirs(..., exist_ok=True) keeps this cell re-runnable: os.mkdir raises
# FileExistsError as soon as the folder is left over from an earlier run.
os.makedirs(path, exist_ok=True)

In [11]:
from pathlib import Path
import random
import re
import pandas as pd
from bs4 import BeautifulSoup
import requests as rq
from bs4 import BeautifulSoup as bs
from nltk.corpus import stopwords
import urllib
from urllib.request import Request, urlopen
import csv
import unicodedata

# Folder containing the filtered quarterly index files read by the loop below.
directory = "download_master"
# Base URL prepended to the archive path found in each index row.
EDGAR_PREFIX = "https://www.sec.gov/Archives/"
# Column delimiter of the index rows.
SEP = "|"
# Markers handed to find_between to cut the 8-K document out of a filing.
start, end = "<TYPE>8-K", "</DOCUMENT>"
# Holds the text extracted from the filing currently being processed.
result = ""


def get_url(line):
    """Return the last SEP-delimited field of an index row (the whole row when SEP is absent)."""
    return line.rsplit(SEP, 1)[-1]

def get_cik(line):
    """Return the first SEP-delimited field of an index row (the whole row when SEP is absent)."""
    first_field = line.split(SEP, 1)[0]
    return first_field


def get_date(line):
    """
    Return the fourth SEP-delimited field of an index row.

    Raises IndexError when the row has fewer than four fields.
    """
    # Splitting stops after the fourth delimiter; the first four fields are
    # the same as with an unlimited split.
    leading_fields = line.split(SEP, 4)
    return leading_fields[3]

def get_number(line):
    """Return whatever follows the last "/" in `line` (all of `line` when it contains no "/")."""
    _, _, tail = line.rpartition("/")
    return tail

# Words dropped by review_to_words. This name rebinds the `stopwords` object
# imported from nltk.corpus earlier in this cell.
_month_names = [
    'january', 'february', 'march', 'april', 'may', 'june',
    'july', 'august', 'september', 'october', 'november', 'december',
]

stopwords = [
    'copyright', 'html', 'webfilings', 'zip code', 'pagebreak',
    'table', 'body', 'value', 'per', 'securities', 'exchange', 'comission',
    'telephone', 'number', 'code', 'page', 'xbrl', 'begin',
    'abn amro', 'abnormal', 'aoci', 'anne', 'anda',
    'bbls', 'bcfe', 'asic', 'asus', 'blvd', 'btus',
    # NOTE(review): this entry was written as two adjacent literals, "date"
    # and 'cceeff', with no comma between them, so Python has always joined
    # them into this single word; neither "date" nor "cceeff" is filtered.
    # Kept as-is because remove_head's lowercase patterns contain "date" and
    # only match while that word survives filtering. The same applies to the
    # misspelt 'comission' above, which leaves "commission" unfiltered.
    'datecceeff',
    'cdos', 'cede', 'cmsa', 'conway',
    'dana', 'wyeth', 'wyoming', 'xannual', 'xerox', 'xiii', 'xindicate',
    'xvii', 'xviii',
    'tdrs', 'form',
]
# Month names, first bare and then with a leading space.
stopwords += _month_names
stopwords += [' ' + month for month in _month_names]
# NOTE(review): the two underscore runs were also adjacent literals without a
# comma and therefore form one entry; the implicit concatenation is preserved.
stopwords.append(
    '__________________________________________ '
    '__________ '
)

def find_between(s, start, end):
    """
    Return the part of `s` that follows the first `start` marker.

    The result stops at the first `end` marker, or at the next occurrence of
    `start`, whichever comes first. When `end` is missing, everything up to
    that point is returned.

    Parameters:
        s: text to search (the loop below passes a whole filing).
        start: opening marker, e.g. '<TYPE>8-K'.
        end: closing marker, e.g. '</DOCUMENT>'.

    Returns:
        The enclosed text, or "" when `start` does not occur in `s`
        (previously an IndexError that aborted the whole scraping loop).
    """
    # At most two splits are needed: only the piece right after the first
    # marker is ever used.
    pieces = s.split(start, 2)
    if len(pieces) < 2:
        return ""
    return pieces[1].split(end, 1)[0]

def remove_accented_chars(text):
    """
    Reduce `text` to plain ASCII.

    The text is NFKD-normalised and then every character that cannot be
    encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

#function for removing all the scrub words
def scrub_words(text):
    """
    Blank out everything that is not a letter or underscore and collapse the gaps.

    Every non-word character (punctuation, any whitespace including newlines
    and \\xa0) and every digit is replaced by a space; runs of whitespace are
    then squeezed to a single space. Underscores and non-ASCII letters count
    as word characters and are kept.

    The patterns are raw strings: the earlier plain-string patterns relied on
    invalid escape sequences such as "\\w" and "\\s", which newer Python
    versions warn about. The separate "\\xa0" and newline substitutions were
    dropped because the first substitution below already turns those
    characters into spaces, so they could never change the result.

    Parameters:
        text: string to clean.

    Returns:
        The cleaned string; leading or trailing separators leave a single
        space at that end.
    """
    # Replace non-word characters and digits with a space
    text = re.sub(r"[\W\d]", " ", text)

    # Remove extra spaces from the text
    text = re.sub(r"\s+", " ", text)

    return text

def remove_head(text):
    """
    Strip known boilerplate phrases from `text`.

    Each phrase below is removed wherever it occurs, one phrase after the
    other in the order listed; text that contains none of them comes back
    unchanged.
    """
    boilerplate = (
        " html PUBLIC W C DTD HTML Transitional EN http www w org TR html loose dtd Document created using Wdesk Copyright Workiva Document ",
        " SECURITIES AND EXCHANGE COMMISSION Washington D C FORM K CURRENT REPORT Pursuant to Section or d of the Securities Exchange Act of Date of Report Date of earliest event reported ",
        " SECURITIES AND EXCHANGE COMMISSION Washington D C Form K CURRENT REPORT Pursuant to Section or d of the Securities Exchange Act of Date of Report Date of earliest event reported ",
        " SECURITIES AND EXCHANGE COMMISSION Washington D C ________________ FORM K CURRENT REPORT PURSUANT TO SECTION OR d OF THE SECURITIES EXCHANGE ACT OF Date of report date of earliest event reported ",
        "commission washington current report pursuant section date report date earliest event reported",
        " commission washington current report pursuant section date report date earliest event reported",
        "commission washington current report pursuant section date report",
    )
    for phrase in boilerplate:
        text = text.replace(phrase, "")
    return text


def review_to_words(raw_review):
    """
    Lower-case the text and keep only the words that are not in `stopwords`.

    Words of one to three characters are deleted first, then every character
    other than a-z / A-Z becomes a space. The surviving words are returned
    joined by single spaces.

    To drop only words shorter than 3 characters instead, use:
    new_string = ' '.join([w for w in old_string.split() if len(w)>2])
    """
    without_short = re.sub(r'\b\w{1,3}\b', '', raw_review)
    alpha_only = re.sub("[^a-zA-Z]", " ", without_short)

    kept = []
    for token in alpha_only.lower().split():
        if token in stopwords:
            # listed stop word: drop it
            continue
        kept.append(token)
    return " ".join(kept)


# Opens (creating it if needed) the CSV that records the CIK no., date of
# filing, name of the text file and sentiment score of every scraped filing.
# The file is opened in append mode so earlier rows survive a re-run; the
# header row is therefore written only when the file is new or still empty,
# otherwise every run would insert another header in the middle of the data.
csv_path = "csv_file.csv"
needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

temp_csv = open(csv_path, "a+", newline="")
csv_file = csv.writer(temp_csv)

header = ["CIK", "Date_of_filing", "Name_of_text_file", "Sentiment_Score"]
if needs_header:
    csv_file.writerow(header)

## Iterate over the .tsv index files in the directory and scrape a random
## sample of the filings listed in each one.
# Might take up to 15-20 mins for the execution
# Sorted so the files are visited in a reproducible order; the '*.tsv'
# pattern skips anything else that may live in the folder.
files = sorted(Path(directory).glob('*.tsv'))
count = 0
# Tags whose text nodes are left out when the document text is rebuilt.
blacklist = ['a', 'sequence', 'filename', 'description']

try:
    for file in files:
        # The index files are written as UTF-8 and only need to be read here.
        with open(file, "r", encoding="utf-8") as source:
            lines = [line.replace("\n", "") for line in source]
        # An index without any matching rows holds a single blank line.
        lines = [line for line in lines if line]

        ## Choose up to 10 random lines from each .tsv file; random.sample
        ## raises ValueError when asked for more items than are available.
        random_line = random.sample(lines, min(10, len(lines)))

        for line in random_line:
            url_to_8K = EDGAR_PREFIX + get_url(line)
            cik = get_cik(line)
            date_of_filing = get_date(line)
            unique_name = get_number(url_to_8K)

            ## Extracting the text from the url
            req = Request(url_to_8K, headers={'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})
            with urlopen(req) as response:
                web_byte = response.read()
            # Bytes that are not valid UTF-8 are replaced rather than
            # aborting the whole run with a UnicodeDecodeError.
            web_byte = web_byte.decode("utf-8", errors="replace")
            result = find_between(web_byte, start, end)
            result = re.sub("<.*?>", ' ', result)
            soup = bs(result, 'lxml')
            text = soup.find_all(text=True)

            ## Cleaning the text
            output = "".join(
                '{}'.format(t) for t in text if t.parent.name not in blacklist
            )
            output = re.sub("<.*?>", ' ', output)
            output = remove_accented_chars(output)
            output = scrub_words(output)
            output = review_to_words(output)
            output = remove_head(output)

            ## Saving details in the csv file (the sentiment score is a
            ## placeholder 0 at this stage)
            details = [cik, date_of_filing, unique_name, 0]
            csv_file.writerow(details)

            # "w" rather than "a+": writing the same filing twice must not
            # duplicate its text. The with-block closes the handle, which
            # previously leaked once per filing and also rebound the loop
            # variable `file`.
            with open(f"8K_filing/{unique_name}", "w") as filing_file:
                filing_file.write(output)

            count = count + 1
finally:
    # Flush and close the CSV even when a download or parse step fails.
    temp_csv.close()

print(count)
            

1080
