In [3]:
import os
import pickle
import re
import sys

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Query parameters shared by the cells below.
# Name fragments searched for in the "Company" column of Key.csv.
company_list = ["Microsoft"]
# Filing years (as strings) compared against the EDGAR filing-date column.
years = ["2017"]

In [3]:
#DO NOT NEED TO RUN: CSV Already Created
#Found at https://pythonprogramming.net/sp500-company-list-python-programming-for-finance/
#I added more lists to obtain the Company name and CIK numbers, then combined into a DataFrame and Exported it out

# Scrape the S&P 500 constituents table from Wikipedia and export
# Ticker / Company / CIK to Key.csv (overwrites any existing file).
resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies', timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
resp.raise_for_status()
soup = bs4.BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise RuntimeError("Could not find the 'wikitable sortable' table on the Wikipedia page")

key_path = os.path.join('..','Financials', 'Ticker_and_CIK', 'Key.csv')

table_rows = table.find_all('tr')

# Resolve the CIK column from the header row so a re-ordered table does not
# silently export the wrong column; fall back to the original position (7).
header_labels = [cell.text.strip() for cell in table_rows[0].find_all(['th', 'td'])] if table_rows else []
cik_column = header_labels.index('CIK') if 'CIK' in header_labels else 7

stock_tickers = []
companies = []
central_index_keys = []

for sp_row in table_rows[1:]:
    # Parse the row's cells once instead of once per extracted column.
    cells = sp_row.find_all('td')
    if len(cells) <= cik_column:
        # Skip malformed / spacer rows that lack the expected columns.
        continue
    stock_tickers.append(cells[0].text.rstrip())
    companies.append(cells[1].text.rstrip())
    central_index_keys.append(cells[cik_column].text.rstrip())

sp500_df = pd.DataFrame(list(zip(stock_tickers, companies, central_index_keys)), columns = ["Ticker", "Company", "CIK"])
# Make sure the destination folder exists before writing.
os.makedirs(os.path.dirname(key_path), exist_ok=True)
sp500_df.to_csv(key_path, index = False)

In [9]:
# Look up each requested company in the saved S&P 500 key file.
key_path = os.path.join('..','Financials', 'Ticker_and_CIK', 'Key.csv')

key_df = pd.read_csv(key_path)

# Maps the matched company name -> [ticker, CIK].
key_dict = {}

for company_query in company_list:
    # Literal, case-insensitive substring match: the query is not interpreted
    # as a regex, names such as "AT&T" or "IBM" are not mangled by title-casing,
    # and rows with a missing company name simply do not match.
    company_df = key_df[key_df['Company'].str.contains(company_query, case=False, regex=False, na=False)]
    if company_df.empty:
        print(f"{company_query} could not be found")
        continue
    # Only the first matching row is used.
    key_dict[company_df['Company'].iloc[0]] = [company_df['Ticker'].iloc[0], company_df['CIK'].iloc[0]]

if not key_dict:
    print ("Whoops, none of your companies were on the S&P 500. Please try again!")
else:
    print (f"Success! We were able to find {len(key_dict)} of your queries in the S&P 500")
    print (f"We found: {list(key_dict.keys())} in the S&P 500")
    print (key_dict)

Success! We were able to find 1 of your queries in the S&P 500
We found: ['Microsoft Corp.'] in the S&P 500
{'Microsoft Corp.': ['MSFT', 789019]}


In [18]:
#Unmodified Base Code found at https://www.codeproject.com/Articles/1227765/Parsing-XBRL-with-Python
#I added the ability for the code to query through a list of companies for multiple years and multiple quarters
#and save the statements 


def _find_xbrl_instance_link(doc_html):
    """Return the URL of the XBRL instance file listed on a filing index page.

    Scans the 'Data Files' table for rows whose fourth cell contains 'INS' and
    returns the link from the third cell of the last such row, or None when the
    table or a matching row is missing.
    """
    soup_doc = BeautifulSoup(doc_html, 'html.parser')
    table_tag_doc = soup_doc.find('table', class_='tableFile', summary='Data Files')
    if table_tag_doc is None:
        return None
    xbrl_link = None
    for row_doc in table_tag_doc.find_all('tr'):
        cells_doc = row_doc.find_all('td')
        if len(cells_doc) > 3 and 'INS' in cells_doc[3].text and cells_doc[2].a is not None:
            xbrl_link = 'https://www.sec.gov' + cells_doc[2].a['href']
    return xbrl_link


def _extract_us_gaap_facts(xbrl_str):
    """Collect positive numeric us-gaap facts from an XBRL document.

    Returns a DataFrame with columns 0 (tag name), 1 (raw tag text) and
    2 (contextref attribute), one row per fact.
    """
    #Modified from the original; The original searched through to find specific line items
    #Modified to search through every 10-Q per given year, and extract any financial line item into its own 10-Q
    soup_xbrl = BeautifulSoup(xbrl_str, 'lxml')
    records = []
    for tag in soup_xbrl.find_all():
        if 'us-gaap:' not in tag.name:
            continue
        try:
            value = float(tag.text)
        except ValueError:
            # Non-numeric facts (text blocks etc.) are not exported.
            continue
        context_ref = tag.get('contextref')
        # NOTE(review): zero and negative values are dropped, as in the original
        # filter; confirm this is intended, since losses are reported as negatives.
        if value > 0 and context_ref is not None:
            records.append([tag.name, tag.text, context_ref])
    # Build the frame once; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=[0, 1, 2])


date_dict = {}

# Constant for every company; the analysis cell below also reads it, so it is
# defined even when key_dict is empty.
filing_type = '10-Q'
# dateb is the filing-list cut-off date: use the latest requested year so that
# no requested year is excluded (the original used only years[0]).
dateb = f'{max(years)}-12-31'
base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}"
# NOTE(review): SEC's fair-access guidance asks automated clients to identify
# themselves with a User-Agent header; confirm whether one must be sent here.

for company_name, (ticker, cik) in key_dict.items():

    # Obtain HTML for search page
    edgar_resp = requests.get(base_url.format(cik, filing_type, dateb), timeout=30)
    edgar_resp.raise_for_status()
    edgar_str = edgar_resp.text

    # Filing dates whose statements were actually saved for this ticker.
    dates = []

    # Find the document link
    soup = BeautifulSoup(edgar_str, 'html.parser')
    table_tag = soup.find('table', class_='tableFile2')
    if table_tag is None:
        print(f"No {filing_type} filing list found for {company_name}")
        date_dict[ticker] = dates
        continue
    rows = table_tag.find_all('tr')

    folder_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}')
    # Creates missing parent folders too and tolerates an existing folder.
    os.makedirs(folder_path, exist_ok=True)

    for year in years:
        for row in rows:
            cells = row.find_all('td')
            if len(cells) <= 3:
                continue
            filing_date = cells[3].text.strip()
            if not filing_date.startswith(year):
                continue
            link_tag = cells[1].a
            if link_tag is None:
                continue
            doc_link = 'https://www.sec.gov' + link_tag['href']

            # Obtain HTML for document page
            doc_resp = requests.get(doc_link, timeout=30)
            doc_resp.raise_for_status()

            # Find the XBRL link; skip the filing instead of requesting an empty URL.
            xbrl_link = _find_xbrl_instance_link(doc_resp.text)
            if xbrl_link is None:
                print(f"No XBRL instance document found for {ticker} filing dated {filing_date}; skipping")
                continue

            # Obtain XBRL text from document
            xbrl_resp = requests.get(xbrl_link, timeout=30)
            xbrl_resp.raise_for_status()

            df = _extract_us_gaap_facts(xbrl_resp.text)

            file_path = os.path.join(folder_path, f'{ticker}_{filing_date}.csv')
            df.to_csv(file_path, index = False)
            # Record the date only once its CSV exists, so later cells can rely on it.
            dates.append(filing_date)

    date_dict[ticker]=dates

date_dict

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




<us-gaap:foreigncurrencytransactiongainlossbeforetax contextref="C_0000789019_20160701_20160930" decimals="-6" id="F_000350" unitref="U_iso4217USD">-40000000</us-gaap:foreigncurrencytransactiongainlossbeforetax>
<us-gaap:othernonoperatingincomeexpense contextref="C_0000789019_20170701_20170930" decimals="-6" id="F_000351" unitref="U_iso4217USD">-39000000</us-gaap:othernonoperatingincomeexpense>
<us-gaap:othernonoperatingincomeexpense contextref="C_0000789019_20160701_20160930" decimals="-6" id="F_000352" unitref="U_iso4217USD">-15000000</us-gaap:othernonoperatingincomeexpense>
<us-gaap:marketablesecuritiesrealizedgainlossotherthantemporaryimpairmentsamount contextref="C_0000789019_20170701_20170930" decimals="-6" id="F_000353" unitref="U_iso4217USD">6000000</us-gaap:marketablesecuritiesrealizedgainlossotherthantemporaryimpairmentsamount>
<us-gaap:marketablesecuritiesrealizedgainlossotherthantemporaryimpairmentsamount contextref="C_0000789019_20160701_20160930" decimals="-6" id="F_0003

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



<us-gaap:scheduleofaccumulatedothercomprehensiveincomelosstabletextblock contextref="C_0000789019_20160701_20170331" id="F_000431">&lt;div&gt;

&lt;p style="margin-bottom:0pt;margin-top:9pt;text-align:justify;text-indent:0%;font-size:10pt;font-family:Arial;font-weight:normal;font-style:normal;text-transform:none;font-variant: normal;"&gt;The following table summarizes the changes in accumulated other comprehensive income by component:&lt;/p&gt;
&lt;p style="margin-bottom:0pt;margin-top:0pt;text-align:justify;text-indent:0%;font-size:9pt;"&gt;&amp;nbsp;&lt;/p&gt;
&lt;div&gt;
&lt;table border="0" cellspacing="0" cellpadding="0" align="center" style="border-collapse:collapse; width:100%;"&gt;
&lt;tr&gt;
&lt;td valign="bottom" style="width:62%;"&gt;
&lt;p style="margin-bottom:0pt;margin-top:0pt;text-align:justify;margin-left:0pt;;text-indent:0pt;;font-weight:bold;font-size:7.5pt;font-family:Arial;font-style:normal;text-transform:none;font-variant: normal;"&gt;(In millions)&lt;/p&gt;&lt;/td

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



<us-gaap:availableforsalesecuritiesaccumulatedgrossunrealizedlossbeforetax contextref="C_0000789019_us-gaapInvestmentTypeAxis_us-gaapForeignGovernmentDebtSecuritiesMember_20161231" decimals="-6" id="F_000504" unitref="U_iso4217USD">29000000</us-gaap:availableforsalesecuritiesaccumulatedgrossunrealizedlossbeforetax>
<us-gaap:availableforsalesecuritiesaccumulatedgrossunrealizedlossbeforetax contextref="C_0000789019_us-gaapInvestmentTypeAxis_us-gaapAssetBackedSecuritiesMember_20161231" decimals="-6" id="F_000505" unitref="U_iso4217USD">7000000</us-gaap:availableforsalesecuritiesaccumulatedgrossunrealizedlossbeforetax>
<us-gaap:availableforsalesecuritiesaccumulatedgrossunrealizedlossbeforetax contextref="C_0000789019_us-gaapInvestmentTypeAxis_us-gaapCorporateDebtSecuritiesMember_20161231" decimals="-6" id="F_000506" unitref="U_iso4217USD">26000000</us-gaap:availableforsalesecuritiesaccumulatedgrossunrealizedlossbeforetax>
<us-gaap:availableforsalesecuritiesaccumulatedgrossunrealizedlossbef

{'MSFT': ['2017-10-26', '2017-04-27', '2017-01-26']}

In [37]:
# Dates embedded in XBRL context ids, e.g. 'C_0000789019_20170331' (YYYYMMDD)
# or 'As_Of_12_31_2016' (M_D_YYYY). Digit look-arounds keep longer digit runs
# such as the 10-digit CIK from being read as a date.
_COMPACT_DATE_RE = re.compile(r'(?<!\d)((?:19|20)\d{2})(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])(?!\d)')
_UNDERSCORE_DATE_RE = re.compile(r'(?<!\d)(\d{1,2})_(\d{1,2})_((?:19|20)\d{2})(?!\d)')


def _context_end_date(context_ref):
    """Return the latest date found in a context id as an int YYYYMMDD, or None.

    For a duration context (start and end date) the latest date is the period end.
    """
    found = [int(year + month + day) for year, month, day in _COMPACT_DATE_RE.findall(context_ref)]
    for month, day, year in _UNDERSCORE_DATE_RE.findall(context_ref):
        if 1 <= int(month) <= 12 and 1 <= int(day) <= 31:
            found.append(int(year) * 10000 + int(month) * 100 + int(day))
    return max(found) if found else None


def _latest_reported_value(statement_df, concept):
    """Return the value (column '1') of `concept` for the most recent context date.

    `statement_df` is a saved statement CSV with columns '0' (tag name),
    '1' (value) and '2' (context id). Returns None when the concept is absent
    or none of its context ids contains a recognisable date. When several rows
    share the most recent date, the first one is used.
    """
    concept_rows = statement_df[statement_df['0'] == concept]
    end_dates = concept_rows['2'].astype(str).map(_context_end_date).dropna().astype('int64')
    if end_dates.empty:
        return None
    return concept_rows.loc[end_dates.idxmax(), '1']


for ticker, filing_dates in date_dict.items():
    stock_path = os.path.join('..','Stock_Data','SP500_Data','individual_stocks_5yr','individual_stocks_5yr',f'{ticker}_data.csv')

    stock_data_df = pd.read_csv(stock_path)

    print(filing_dates)

    for date in filing_dates:

        #Get the saved 10-Q; Conduct Analysis
        # filing_type is defined by the EDGAR download cell above.
        file_path = os.path.join('..','Financials',f'{filing_type}s',f'{ticker}', f'{ticker}_{date}.csv')
        df = pd.read_csv(file_path)

        curr_assets = _latest_reported_value(df, 'us-gaap:assetscurrent')
        curr_liab = _latest_reported_value(df, 'us-gaap:liabilitiescurrent')
        if curr_assets is None or curr_liab is None:
            print(f"Could not find current assets/liabilities for {ticker} on {date}")
            continue

        curr_ratio = round(curr_assets / curr_liab, 2)

        print(curr_ratio, date)

        #Find the pertinent Stock Data
        release_day_data = stock_data_df[stock_data_df['date'] == date]
        if release_day_data.empty:
            print(f"No stock data for {ticker} on {date}")
            continue
        release_day_index = int(release_day_data.index[0])
        if release_day_index == 0:
            # There is no earlier row; index -1 would wrap around to the last row.
            print(f"No prior trading day in the stock data for {ticker} on {date}")
            continue

        prior_day_data = stock_data_df.iloc[release_day_index-1]

        prior_day_close = prior_day_data['close']

['2017-10-26', '2017-04-27', '2017-01-26']
3.12 2017-10-26
2.81 2017-04-27
2.05 2017-01-26


In [17]:
import datefinder

# Sample strings in several layouts, presumably XBRL context ids from
# different filers, used to try out date extraction.
MMM_date = 'Duration_1_1_2016_To_6_30_2016_us-gaap_IncomeStatementLocationAxis_mmm_ResearchDevelopmentAndRelatedExpensesMember'
MSFT_2017 = 'C_0000789019_20170331'
MSFT_2016 = 'eol_PE8528----1510-Q0018_STD_0_20151231_0'
MMM = 'As_Of_12_31_2016'

# Gather everything datefinder extracts from the first sample.
matches = [found for found in datefinder.find_dates(MMM_date)]

if not matches:
    print ('No dates found')
else:
    # Each match is expected to be a datetime.datetime; only the first is used.
    date = matches[0]
    print (date)
    

No dates found


In [24]:
# Character-level diff experiment: for each (before, after) pair, report the
# deletions and additions that difflib.ndiff yields, with their positions in
# the diff sequence.

import difflib

cases = [
    ('C_0000789019_20170331', 'C_0000789019_20160630'),
    ('As_Of_12_31_2016', 'As_Of_6_30_2017'),
    ('afrykanerskojęzycznym', 'afrykanerskojęzyczny'),
    ('nieafrykanerskojęzyczni', 'afrykanerskojęzyczni'),
    ('nieafrynerskojęzyczni', 'afrykanerskojzyczni'),
    ('abcdefg', 'xac'),
]

# Message template per ndiff marker; entries with any other marker are not reported.
templates = {
    '-': u'Delete "{}" from position {}',
    '+': u'Add "{}" to position {}',
}

for before, after in cases:
    print('{} => {}'.format(before, after))
    for position, delta in enumerate(difflib.ndiff(before, after)):
        template = templates.get(delta[0])
        if template is not None:
            print(template.format(delta[-1], position))
    print()

C_0000789019_20170331 => C_0000789019_20160630
Delete "7" from position 16
Add "6" to position 17
Add "6" to position 19
Add "0" to position 21
Delete "3" from position 22
Delete "1" from position 23

As_Of_12_31_2016 => As_Of_6_30_2017
Add "6" to position 6
Delete "1" from position 7
Delete "2" from position 8
Delete "1" from position 11
Add "0" to position 12
Delete "6" from position 17
Add "7" to position 18

afrykanerskojęzycznym => afrykanerskojęzyczny
Delete "m" from position 20

nieafrykanerskojęzyczni => afrykanerskojęzyczni
Delete "n" from position 0
Delete "i" from position 1
Delete "e" from position 2

nieafrynerskojęzyczni => afrykanerskojzyczni
Delete "n" from position 0
Delete "i" from position 1
Delete "e" from position 2
Add "k" to position 7
Add "a" to position 8
Delete "ę" from position 16

abcdefg => xac
Add "x" to position 0
Delete "b" from position 2
Delete "d" from position 4
Delete "e" from position 5
Delete "f" from position 6
Delete "g" from position 7

