In [1]:
#!/usr/bin/env python
# Heavily influenced by the R version of Seward Lee; https://github.com/sewardlee337/finreportr/blob/master/R/company_info.R
#
# Acquire basic company information
#'
#' Extracts and displays basic information relating to a given company in a data frame.
#'
#' @export
#' @import dplyr
#' @param symbol A character vector specifying the stock symbol of the company of interest.
#' @examples
#' CompanyInfo("GOOG")
#' CompanyInfo("TSLA")

import requests, bs4
import re

symbol = 'GOOG'

# EDGAR company-browse page for the symbol.
url = 'https://www.sec.gov/cgi-bin/browse-edgar?CIK={0}&owner=exclude&action=getcompany&Find=Search'.format(symbol)

# Stop on a bad HTTP status instead of printing it and then trying to parse
# an error page.
res = requests.get(url)
res.raise_for_status()

total = bs4.BeautifulSoup(res.text, 'lxml')

elems = total.find_all("span", class_="companyName")

# Check for a match *before* indexing: with no companyName span the original
# code died with a bare IndexError and never reached its own error message.
if not elems:
    raise ValueError("invalid company symbol")

# Company name
# The span text holds the name followed by a 'CIK#' marker and the CIK digits
# (pattern observed on EDGAR result pages).
company_text = elems[0].getText()
name, cik_marker, cik_text = company_text.partition('CIK#')

##   Error message for function
if len(name) == 0 or not cik_marker:
    raise ValueError("invalid company symbol")

# CIK number: first run of digits after the 'CIK#' marker
cik_digits = re.findall(r'\d+', cik_text)
if not cik_digits:
    raise ValueError("invalid company symbol")
CIK = cik_digits[0]

##   Acquire SIC code

elems = total.findAll("p", {"class":"identInfo"})
print(type(elems))

print(CIK)

<class 'bs4.element.ResultSet'>
0001652044


In [7]:
type(elems[0])

bs4.element.Tag

In [8]:
dir(elems[0])

['HTML_FORMATTERS',
 'XML_FORMATTERS',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_attr_value_as_string',
 '_attribute_checker',
 '_find_all',
 '_find_one',
 '_formatter_for_name',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_select_debug',
 '_selector_combinators',
 '_should_pretty_print',
 '_tag_name_matches_and',
 'append',
 'attribselect_re',
 'attrs',
 'can_be_empty_element',
 'childGenerator',
 'children',
 'clear',
 'contents',
 'decode',
 'decode_contents',
 'decomp

In [14]:
elems[0].contents

[<acronym title="Standard Industrial Code">SIC</acronym>,
 ': ',
 <a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=7370&amp;owner=exclude&amp;count=40">7370</a>,
 ' - SERVICES-COMPUTER PROGRAMMING, DATA PROCESSING, ETC.',
 <br/>,
 'State location: ',
 <a href="/cgi-bin/browse-edgar?action=getcompany&amp;State=CA&amp;owner=exclude&amp;count=40">CA</a>,
 ' | State of Inc.: ',
 <strong>DE</strong>,
 ' | Fiscal Year End: 1231',
 <br/>,
 '(Assistant Director Office: 3)',
 <br/>,
 'Get ',
 <a href="/cgi-bin/own-disp?action=getissuer&amp;CIK=0001652044"><b>insider transactions</b></a>,
 ' for this ',
 <b>issuer</b>,
 '.\n',
 <br/>,
 'Get ',
 <a href="/cgi-bin/own-disp?action=getowner&amp;CIK=0001652044"><b>insider transactions</b></a>,
 ' for this ',
 <b>reporting owner</b>,
 '.\n']

In [18]:
# In the identInfo paragraph shown above, contents[2] is the <a> link that
# follows the "SIC" acronym and ': '; its text is the SIC code.
SIC = elems[0].contents[2].getText()

In [19]:
print(SIC)

7370


### Acquire mailing address

In [23]:
total

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="ENG">
<head>
<title>EDGAR Search Results</title>
<link href="/include/interactive.css" rel="stylesheet" type="text/css"/>
<link href="/cgi-bin/browse-edgar?action=getcompany&amp;CIK=0001652044&amp;type=&amp;dateb=&amp;owner=exclude&amp;count=40&amp;output=atom" rel="alternate" title="ATOM" type="application/atom+xml"/>
</head>
<body style="margin: 0">
<!-- SEC Web Analytics - For information please visit: http://www.sec.gov/privacy.htm#collectedinfo -->
<noscript><iframe height="0" src="//www.googletagmanager.com/ns.html?id=GTM-TD3BKV" style="display:none;visibility:hidden" width="0"></iframe></noscript>
<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
'//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.inse

In [130]:
elems = total.findAll("span", {"class":"mailerAddress"})

In [131]:
mail_street = elems[2].getText()

In [132]:
mail_street

'1600 AMPHITHEATRE PARKWAY'

In [80]:
# Split from the right into at most three pieces -> [city, state, zip].
# Splitting from the right keeps a multi-word city (e.g. 'MOUNTAIN VIEW')
# together in the first element.
all_address = elems[3].getText().strip().rsplit(' ', 2)

In [111]:
mail_city = all_address[0]

In [114]:
mail_city

'MOUNTAIN VIEW'

In [113]:
mail_state = all_address[1]

In [115]:
mail_state

'CA'

In [87]:
zip_code = all_address[2]

In [88]:
zip_code 

'94043'

### Acquire phone number

In [None]:
elems = total.findAll("span", {"class":"mailerAddress"})

In [91]:
phone = elems[-1].getText()

In [92]:
phone

'650-253-0000'

###   Acquire fiscal year end

In [98]:
elems = total.findAll("p", {"class":"identInfo"})[0]

In [105]:
# contents[9] is the text node ' | Fiscal Year End: 1231' (see elems.contents
# below); the last space-separated token is the fiscal year end value.
fiscal_year = elems.contents[9].rsplit(' ', 2)[-1]

In [109]:
elems

<p class="identInfo"><acronym title="Standard Industrial Code">SIC</acronym>: <a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=7370&amp;owner=exclude&amp;count=40">7370</a> - SERVICES-COMPUTER PROGRAMMING, DATA PROCESSING, ETC.<br/>State location: <a href="/cgi-bin/browse-edgar?action=getcompany&amp;State=CA&amp;owner=exclude&amp;count=40">CA</a> | State of Inc.: <strong>DE</strong> | Fiscal Year End: 1231<br/>(Assistant Director Office: 3)<br/>Get <a href="/cgi-bin/own-disp?action=getissuer&amp;CIK=0001652044"><b>insider transactions</b></a> for this <b>issuer</b>.
<br/>Get <a href="/cgi-bin/own-disp?action=getowner&amp;CIK=0001652044"><b>insider transactions</b></a> for this <b>reporting owner</b>.
</p>

### Acquire state of incorporation

In [122]:
elems.contents

[<acronym title="Standard Industrial Code">SIC</acronym>,
 ': ',
 <a href="/cgi-bin/browse-edgar?action=getcompany&amp;SIC=7370&amp;owner=exclude&amp;count=40">7370</a>,
 ' - SERVICES-COMPUTER PROGRAMMING, DATA PROCESSING, ETC.',
 <br/>,
 'State location: ',
 <a href="/cgi-bin/browse-edgar?action=getcompany&amp;State=CA&amp;owner=exclude&amp;count=40">CA</a>,
 ' | State of Inc.: ',
 <strong>DE</strong>,
 ' | Fiscal Year End: 1231',
 <br/>,
 '(Assistant Director Office: 3)',
 <br/>,
 'Get ',
 <a href="/cgi-bin/own-disp?action=getissuer&amp;CIK=0001652044"><b>insider transactions</b></a>,
 ' for this ',
 <b>issuer</b>,
 '.\n',
 <br/>,
 'Get ',
 <a href="/cgi-bin/own-disp?action=getowner&amp;CIK=0001652044"><b>insider transactions</b></a>,
 ' for this ',
 <b>reporting owner</b>,
 '.\n']

In [126]:
inc_state = elems.contents[8].getText() # state of incorporation

In [127]:
inc_state

'DE'

### Wrap together

In [133]:
# Collect every scraped field into a single record keyed by output column
# name. Note that 'loac_state' reuses the mailing-address state.
company_info = {
    'symbol': symbol,
    'name': name,
    'CIK': CIK,
    'SIC': SIC,
    'mail_street': mail_street,
    'mail_state': mail_state,
    'mail_city': mail_city,
    'zip_code': zip_code,
    'phone': phone,
    'fiscal_year': fiscal_year,
    'inc_state': inc_state,
    'loac_state': mail_state,
}

In [139]:
company_info

{'CIK': '0001652044',
 'SIC': '7370',
 'fiscal_year': '1231',
 'inc_state': 'DE',
 'loac_state': 'CA',
 'mail_city': 'MOUNTAIN VIEW',
 'mail_state': 'CA',
 'mail_street': '1600 AMPHITHEATRE PARKWAY',
 'name': 'Alphabet Inc. ',
 'phone': '650-253-0000',
 'symbol': 'GOOG',
 'zip_code': '94043'}

In [135]:
import pandas as pd

In [140]:
df = pd.DataFrame([company_info], columns=company_info.keys())

In [141]:
df

Unnamed: 0,symbol,name,CIK,SIC,mail_street,mail_state,mail_city,zip_code,phone,fiscal_year,inc_state,loac_state
0,GOOG,Alphabet Inc.,1652044,7370,1600 AMPHITHEATRE PARKWAY,CA,MOUNTAIN VIEW,94043,650-253-0000,1231,DE,CA


In [146]:
def CompanyInfo(symbol):
    """Fetch basic company information for a stock symbol from SEC EDGAR.

    Requests the EDGAR company-browse page for ``symbol`` and scrapes the
    company name, CIK, SIC code, mailing address, phone number, fiscal year
    end and state of incorporation from it.

    Parameters
    ----------
    symbol : str
        Stock symbol (or CIK) of the company of interest, e.g. ``"GOOG"``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``symbol``, ``name``, ``CIK``,
        ``SIC``, ``mail_street``, ``mail_state``, ``mail_city``, ``zip_code``,
        ``phone``, ``fiscal_year``, ``inc_state`` and ``loac_state``.
        All scraped values are strings.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an error status.
    ValueError
        If the page contains no company matching ``symbol``.

    Examples
    --------
    >>> CompanyInfo("GOOG")   # doctest: +SKIP
    >>> CompanyInfo("TSLA")   # doctest: +SKIP
    """
    # Passing the query via `params` lets requests URL-encode the symbol;
    # the timeout keeps a stalled connection from hanging the kernel.
    res = requests.get(
        'https://www.sec.gov/cgi-bin/browse-edgar',
        params={
            'CIK': symbol,
            'owner': 'exclude',
            'action': 'getcompany',
            'Find': 'Search',
        },
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of parsing an error page.
    res.raise_for_status()

    total = bs4.BeautifulSoup(res.text, 'lxml')

    # --- Company name and CIK -------------------------------------------
    name_spans = total.find_all("span", class_="companyName")
    # Validate before indexing so an unknown symbol is reported as such.
    if not name_spans:
        raise ValueError("invalid company symbol")

    # The span text holds the name, then a 'CIK#' marker, then the CIK digits
    # (pattern observed on EDGAR result pages).
    name, cik_marker, cik_text = name_spans[0].getText().partition('CIK#')
    name = name.strip()  # drop the whitespace left in front of 'CIK#'
    cik_digits = re.findall(r'\d+', cik_text)
    if not name or not cik_marker or not cik_digits:
        raise ValueError("invalid company symbol")
    CIK = cik_digits[0]

    # --- identInfo paragraph: SIC, state of incorporation, fiscal year ---
    # The child positions below follow the layout seen for 'GOOG':
    #   [2] <a>SIC code</a>, [8] <strong>state of inc.</strong>,
    #   [9] ' | Fiscal Year End: MMDD'
    ident = total.find_all("p", class_="identInfo")[0]
    # SIC was previously never assigned here, so the function only worked
    # when a global `SIC` was left over from an earlier notebook cell.
    SIC = ident.contents[2].getText()
    inc_state = ident.contents[8].getText()
    fiscal_year = ident.contents[9].rsplit(' ', 2)[-1]

    # --- mailerAddress spans: street, city/state/zip, phone --------------
    address_spans = total.find_all("span", class_="mailerAddress")
    # The mailing street is a standalone span.
    mail_street = address_spans[2].getText()

    # "CITY STATE ZIP": split from the right so multi-word cities stay whole.
    all_address = address_spans[3].getText().strip().rsplit(' ', 2)
    mail_city = all_address[0]
    mail_state = all_address[1]
    zip_code = all_address[2]

    # The phone number is the last mailerAddress span.
    phone = address_spans[-1].getText()

    # NOTE(review): 'loac_state' (sic; key kept for compatibility) is filled
    # from the mailing state. The identInfo "State location" link may be the
    # intended source -- confirm.
    company_info = {
        'symbol': symbol,
        'name': name,
        'CIK': CIK,
        'SIC': SIC,
        'mail_street': mail_street,
        'mail_state': mail_state,
        'mail_city': mail_city,
        'zip_code': zip_code,
        'phone': phone,
        'fiscal_year': fiscal_year,
        'inc_state': inc_state,
        'loac_state': mail_state,
    }

    return pd.DataFrame([company_info], columns=company_info.keys())

In [147]:
CompanyInfo('GOOG')

Unnamed: 0,symbol,name,CIK,SIC,mail_street,mail_state,mail_city,zip_code,phone,fiscal_year,inc_state,loac_state
0,GOOG,Alphabet Inc.,1652044,7370,1600 AMPHITHEATRE PARKWAY,CA,MOUNTAIN VIEW,94043,650-253-0000,1231,DE,CA
