In [2]:
import requests
import re
import os
import time
import nltk
import string
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from bs4 import BeautifulSoup

In [3]:
### Make the "nltk_data" folder in the current working directory visible to NLTK's resource loader.
### The membership check keeps the cell idempotent: re-running it does not append the same path again.
local_nltk_path = os.path.join(os.getcwd(), "nltk_data")
if local_nltk_path not in nltk.data.path:
    nltk.data.path.append(local_nltk_path)

In [4]:
### English stop-word list from the NLTK corpus; used further down to drop common words from the MD&A tokens.
stop_words = stopwords.words("english")

In [71]:
def get_index(f_type,doc_text):    ### Regex function for finding the beginning and ending of 10-Q and 10-K filings
    """Return the ``(start, end)`` character offsets of the MD&A section in ``doc_text``.

    Parameters
    ----------
    f_type : str
        ``'10-K'``, ``'10-Q'`` or ``'EX-13.3'``.
    doc_text : str
        Plain text of the filing.

    Returns
    -------
    tuple of (int, int)
        * ``'EX-13.3'``: the whole document, ``(0, len(doc_text))``.
        * ``'10-K'``: start of the "Item 7. Management" heading and start of
          the following "Item 8. Financial" heading.
        * ``'10-Q'``: start of the "Item 2. Management" heading and start of
          the following "Item 3. Quantitative" heading; if that heading is
          absent, the following "Item 1. Legal" heading is used instead.

        An offset is ``-1`` when the corresponding heading is not found.

    Raises
    ------
    ValueError
        If ``f_type`` is not one of the supported filing types.
    """
    if f_type == 'EX-13.3':
        print('Ex')
        return 0, len(doc_text)

    if f_type == '10-K':
        start_pattern = r'I[tT][eE][mM] 7\.\s*M[aA][nN][aA][gG][eE][mM][eE][nN][tT]'
        end_patterns = [r'I[tT][eE][mM] 8\.\s*F[iI][nN][aA][nN][cC][iI][aA][lL]']
    elif f_type == '10-Q':
        start_pattern = r'I[tT][eE][mM] 2\.\s*M[aA][nN][aA][gG][eE][mM][eE][nN][tT]'
        end_patterns = [
            r'I[tT][eE][mM] 3\.\s*Q[uU][aA][nN][tT][iI][tT][aA][tT][iI][vV][eE]',
            r'I[tT][eE][mM] 1\.\s*L[eE][gG][aA][lL]',   ### For old 10-Qs, it is a different format
        ]
    else:
        ### Previously this fell through with no pattern bound and died with an UnboundLocalError.
        raise ValueError("Unsupported filing type: {!r}".format(f_type))

    start_mda = -1
    end_mda = -1

    start_match = re.search(start_pattern, doc_text)
    if start_match is not None:
        start_mda = start_match.start()

    ### The section cannot end before it begins, so only look for the closing
    ### heading from the opening heading onwards (or from 0 if it was not found).
    search_from = max(start_mda, 0)
    for pattern in end_patterns:
        end_match = re.compile(pattern).search(doc_text, search_from)
        if end_match is not None:
            end_mda = end_match.start()
            break

    print(start_mda,",",end_mda)

    return start_mda,end_mda

In [63]:
def get_mda(filing_type, filing_link, timeout=30):    ### Returns a clean text of MDA based on the filing type and its link
    """Download a filing and return its cleaned, lower-cased MD&A text.

    Parameters
    ----------
    filing_type : str
        Filing type passed on to ``get_index`` (``'10-K'``, ``'10-Q'`` or
        ``'EX-13.3'``).
    filing_link : str
        URL of the filing document.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up.

    Returns
    -------
    tuple of (str, int, int)
        The cleaned text and the ``(start, end)`` offsets reported by
        ``get_index``. When either offset is ``-1`` (section not located) the
        text is an empty string, so callers can detect the miss from the
        offsets instead of receiving an unrelated slice of the document.
    """
    final_doc = requests.get(filing_link, timeout=timeout)
    soup_ob = BeautifulSoup(final_doc.content,'html.parser')

    for table in soup_ob.find_all("table"):
        table.decompose()                     ### Removing all tables from the text

    ### Fetch just the textual data and flatten newlines, tabs and non-breaking spaces
    final_text = soup_ob.get_text().replace('\n',' ').replace('\t',' ').replace('\xa0',' ')

    start , end  = get_index(filing_type,final_text)

    if start < 0 or end < 0:
        ### -1 means "not found"; used as a slice bound it would silently select
        ### the wrong part of the document, so report an empty section instead.
        return '',start,end

    ### Skip the 8 characters of the "Item N. " prefix of the section heading.
    ### An EX-13.3 exhibit is taken whole (start is 0), so nothing is skipped there.
    heading_skip = 0 if filing_type == 'EX-13.3' else 8
    final_text = final_text[start + heading_skip:end]

    final_text = re.sub(r'\([^()]*\)', '', final_text.lower()) ### Removing everything between two brackets since it is mostly unnecessary

    if filing_type == '10-K':                            ### Removing Page numbers from filings
        final_text = re.sub(r'[0-9][0-9]\s*part ii item [0-9]', '', final_text)
    elif filing_type == '10-Q':
        final_text = re.sub(r'[0-9][0-9]\s*part i item [0-9]', '', final_text)

    final_text = final_text.replace(':', '.')   ### ':' Present at the end of sentences before presenting tables.
    final_text = final_text.replace(',', ' ')   ### TODO : Remove all punctuation, numbers and special characters

    return final_text,start,end

In [12]:
def is_float(element): ### Helper function
    """Return True if ``element`` can be converted with ``float()``, else False.

    Values that ``float()`` rejects with a ``TypeError`` (for example ``None``
    or a list) are reported as False instead of raising.
    """
    try:
        float(element)
    except (TypeError, ValueError):
        return False
    return True

#### Fetching the page that lists up to 100 of the company's most recent 10-K and 10-Q filings, looked up by CIK.

In [6]:
### EDGAR company-browse endpoint
endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

# define our parameters dictionary
param_dict = {'action':'getcompany',
              'CIK':'0000789019',            ### CIK details of the company
              'type':'10',                   ### 10 in the search bar resulting in all 10 related filings
              'dateb':'',
              'owner':'exclude',
              'start':'',
              'output':'',
              'count':'100'}                ### Return up to 100 (max limit) SEC filings

# request the url, and then parse the response.
# NOTE(review): SEC's EDGAR access guidance asks automated clients to send a
# descriptive User-Agent header - confirm whether one must be added here.
response = requests.get(url = endpoint, params = param_dict, timeout = 30)
response.raise_for_status()      ### Fail here on an HTTP error instead of parsing an error page
soup = BeautifulSoup(response.content, 'html.parser')

# Let the user know it was successful (only reached when the status check above passed).
print('Request Successful')
print(response.url)

Request Successful
https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000789019&type=10&dateb=&owner=exclude&start=&output=&count=100


#### Going through the rows of the results table and storing the required SEC filing links in a pandas DataFrame.

In [8]:
### Step 1: walk the EDGAR results table and collect one [type, link, date] record per 10-K / 10-Q filing.
### Step 2: download each filing, extract its MD&A section and store its sentences, words and offsets.

table = soup.find('table',summary = 'Results')                   ### Results Table
if table is None:
    ### Otherwise the failure shows up below as an opaque AttributeError on None.
    raise RuntimeError("The EDGAR response does not contain the 'Results' table")

filing_data = []

for result_row in table.find_all('tr')[1:]:                     ### For each row in table, skipping the header row

    row_list = result_row.find_all('td')        ### Find all columns in the row

    filing_type = row_list[0].get_text()    ### get the text in the first column(filing name/type)

    if filing_type not in ('10-Q', '10-K'): #get only 10Q and 10K. No amendments (10-Q/A)
        continue

    filing_date = row_list[3].get_text()    ### Get the filing date listed in the 4th column

    files_link = r'https://www.sec.gov/' + row_list[1].a.get('href')    ### Link of the filings filed on the date

    next_page = requests.get(files_link, timeout = 30)        ### Page listing all the documents of this filing
    soup2 = BeautifulSoup(next_page.content, 'html.parser')    ### New soup object to parse the next page

    table2 = soup2.find('table',summary = 'Document Format Files')     ### Table storing all the uploaded filings

    ### A missing table is treated like "no link found" so the full-text fallback below applies.
    document_rows = table2.find_all('tr')[1:] if table2 is not None else []

    filing_link = None                       ### Stays None until a usable document link is found
    for document_row in document_rows:       ### For all the rows in this table, skipping the header row
        row_list2 = document_row.find_all('td')     ### For all columns in the row

        doc_string = row_list2[2].get_text()     ### Name of the uploaded doc
        type_string = row_list2[3].get_text()   ### type of filing

        if type_string in ('10-Q', '10-K') and doc_string != '':
            ### Main filing document; keep scanning because an EX-13.3 row further down takes precedence.
            filing_link = r'https://www.sec.gov/' + row_list2[2].a.get('href')

        if type_string == 'EX-13.3':
            if doc_string != '':
                filing_link = r'https://www.sec.gov/' + row_list2[2].a.get('href')
                filing_type = 'EX-13.3'
                print(type_string)
            break                            ### Stop at the first EX-13.3 row, with or without a document

    if filing_link is None:   ### If no link is found in any of the tables, take the full text of the filing.
        filing_link = files_link.split('-index')[0] + '.txt'

    filing_link = re.sub(r'ix\?doc=/','',filing_link)   ### remove the iXBRL format for the latest filings
    filing_data.append([filing_type,filing_link,filing_date])
    print(filing_link)

data = pd.DataFrame(filing_data,columns = ['Filing_type','Filing_link','Filing_date'])   ### Create a data frame of the table

lemmatizer = WordNetLemmatizer()        ### One lemmatizer for all words instead of a new one per word
stop_word_set = set(stop_words)         ### Set membership test instead of scanning the list for every word

### Results are gathered in plain lists and attached as columns afterwards. Writing
### through data['col'][i] is chained assignment, which pandas warns about and which
### does not update the frame when Copy-on-Write is active.
sentences_column = []
words_column = []
mda_index_column = []

for row_type, row_link in zip(data['Filing_type'], data['Filing_link']):
    mda,start,end = get_mda(row_type,row_link)  ### Get the MDA section

    sent_mda = sent_tokenize(mda)                          ### list of all sentences in the MDA

    sent_mda = [x.strip() for x in sent_mda if len(x.strip()) > 15]   ### Keep sentences which are more than 15 characters long

    words_mda = word_tokenize(mda)                     ### Fetching just the words from MDA

    words_mda = [x for x in words_mda if not is_float(x)]   ### Drop numeric tokens
    words_mda = [lemmatizer.lemmatize(x) for x in words_mda if x not in stop_word_set and len(x) > 2]

    sentences_column.append(sent_mda)
    words_column.append(words_mda)
    mda_index_column.append((start,end))

### dtype=object keeps each list / tuple as a single cell value.
data['Sentences'] = pd.Series(sentences_column, index = data.index, dtype = object)
data['Words'] = pd.Series(words_column, index = data.index, dtype = object)
data['Mda_index'] = pd.Series(mda_index_column, index = data.index, dtype = object)

In [67]:
### Preview rows at positions 1 through 4 of the filings frame.
data.iloc[1:5]

Unnamed: 0,Filing_type,Filing_link,Filing_date,Sentences,Words,Mda_index
1,10-Q,https://www.sec.gov//Archives/edgar/data/78901...,2020-01-29,[management’s discussion and analysis of finan...,"[management, discussion, analysis, financial, ...","(75041, 113497)"
2,10-Q,https://www.sec.gov//Archives/edgar/data/78901...,2019-10-23,[management’s discussion and analysis of finan...,"[management, discussion, analysis, financial, ...","(65214, 99158)"
3,10-K,https://www.sec.gov//Archives/edgar/data/78901...,2019-08-01,[management’s discussion and analysis of finan...,"[management, discussion, analysis, financial, ...","(111753, 161210)"
4,10-Q,https://www.sec.gov//Archives/edgar/data/78901...,2019-04-24,[management’s discussion and analysis of finan...,"[management, discussion, analysis, financial, ...","(51304, 93244)"


In [79]:
import sys

### Size of the `data` object as reported by sys.getsizeof, converted from bytes to MB.
data_size = sys.getsizeof(data)
size_in_mb = data_size / 1024 / 1024

print(size_in_mb)

3.438176155090332


In [18]:
### VADER analyzer instance, intended for scoring the sentiment of individual sentences.
vader_sent = SentimentIntensityAnalyzer()

### Example usage (left disabled):
#for x in sent_mda:
#    print(vader_sent.polarity_scores(x))