In [1]:
#importing the required libraries
import requests
import re
import csv
import lxml
from bs4 import BeautifulSoup
import pandas as pd
import xml.etree.ElementTree as et
import numpy as np
import yfinance as yf

In [2]:
#Notebook-wide pandas display settings, applied in a single call:
#  - no cap on the number of rows shown
#  - column headers left-justified
#  - up to 1000 characters shown per cell before truncation
pd.set_option(
    'display.max_rows', None,
    'colheader_justify', 'left',
    'display.max_colwidth', 1000,
)

In [3]:
#Notebook configuration

#Root of the SEC website; relative links scraped from EDGAR pages are appended to it
sec_url = 'https://www.sec.gov'

#Zero-padded, ten-character CIK of the filer to look up
requested_cik = '0001067983'

#Form type of interest (scrape_data does not read this module-level value)
doctype = '13F-HR'

#How many filings to scrape
Num_Hist_Qtrs = 9

In [25]:
#Creating the functions

def get_request(url, timeout=30, headers=None):
    """Issue an HTTP GET for ``url`` and return the response.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.
    timeout : float or tuple, optional
        Seconds to wait on the server, forwarded to ``requests.get``.
        Defaults to 30 so that a stalled connection cannot hang the
        notebook indefinitely (``requests`` applies no timeout on its own).
    headers : dict, optional
        Extra HTTP headers to send. ``None`` sends the ``requests`` defaults.
        NOTE(review): SEC EDGAR is understood to expect automated clients to
        declare a ``User-Agent`` - confirm and pass one here if requests
        start being rejected.

    Returns
    -------
    requests.Response
        The response for a successful (non 4xx/5xx) request.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx or 5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail here with the real HTTP status rather than letting an error page
    # be parsed downstream, where it would surface as an unrelated IndexError.
    response.raise_for_status()
    return response

def create_url(cik,doctype):
    """Return the EDGAR company-browse URL for a filer and form type.

    ``cik`` and ``doctype`` are interpolated as-is (no URL escaping) into the
    ``CIK`` and ``type`` query parameters respectively.
    """
    endpoint = 'https://www.sec.gov/cgi-bin/browse-edgar'
    query = f'CIK={cik}&owner=exclude&action=getcompany&type={doctype}'
    return f'{endpoint}?{query}'

def scrape_data(requested_cik,Qtr,doctype='13F-HR',xml_file_position=3):
    """Fetch one filing's index page and one of its XML documents from EDGAR.

    Parameters
    ----------
    requested_cik : str
        CIK of the filer, passed to ``create_url``.
    Qtr : int
        Position of the wanted filing in the list of "documentsbutton" links
        on the EDGAR company-browse page (0 is the first one listed;
        presumably the most recent filing - TODO confirm).
    doctype : str, optional
        Form type to browse. Defaults to ``'13F-HR'``.
    xml_file_position : int, optional
        Position, among the links on the filing index page whose ``href``
        contains ``xml``, of the document to download. 3 for 13F-HR,
        4 for 10-K.

    Returns
    -------
    tuple
        ``(soup_two, soup_xml)``: the parsed filing index page and the parsed
        XML document, both as ``BeautifulSoup`` objects.

    Raises
    ------
    IndexError
        If the browse page lists no filing at position ``Qtr``, or the filing
        index page has no xml link at position ``xml_file_position``.
    """
    # Company-browse page: every "documentsbutton" anchor points at the index
    # page of one filing of the requested form type.
    response = get_request(create_url(requested_cik,doctype))
    soup = BeautifulSoup(response.text, "html.parser")
    tags = soup.find_all('a', id="documentsbutton")
    try:
        filing_href = tags[Qtr]['href']
    except IndexError:
        raise IndexError(
            'No filing at position {} for CIK {} and form type {}: only {} filing link(s) found'
            .format(Qtr, requested_cik, doctype, len(tags))
        ) from None

    # Filing index page: collect every link whose href mentions "xml" and pick
    # the one at the configured position.
    response_two = get_request(sec_url + filing_href)
    soup_two = BeautifulSoup(response_two.text, "html.parser")
    tags_two = soup_two.find_all('a', attrs={'href': re.compile('xml')})
    try:
        xml_url = tags_two[xml_file_position].get('href')
    except IndexError:
        raise IndexError(
            'No xml link at position {} on filing page {}: only {} xml link(s) found'
            .format(xml_file_position, filing_href, len(tags_two))
        ) from None

    response_xml = get_request(sec_url + xml_url)
    soup_xml = BeautifulSoup(response_xml.content, "lxml")
    return soup_two,soup_xml

def table13F(requested_cik,Num_Hist_Qtrs):
    """Pivot the share counts reported in a filer's most recently listed filings.

    Parameters
    ----------
    requested_cik : str
        CIK of the filer, forwarded to ``scrape_data``.
    Num_Hist_Qtrs : int
        Number of filings to scrape (positions ``0 .. Num_Hist_Qtrs - 1``).

    Returns
    -------
    pandas.DataFrame
        Pivot table indexed by ``(NAME_OF_ISSUER, CUSIP)`` with one column per
        ``PeriodofReport`` (under the ``sum`` / ``SHARES`` column levels).
        Values are share counts divided by 1000, summed over all rows of a
        filing that share the same issuer and CUSIP; combinations absent from
        a filing are filled with 0.

    Raises
    ------
    ValueError
        If ``Num_Hist_Qtrs`` is smaller than 1 (there would be nothing to
        concatenate).
    """
    if Num_Hist_Qtrs < 1:
        raise ValueError('Num_Hist_Qtrs must be at least 1, got {}'.format(Num_Hist_Qtrs))

    appended_table = []
    for Qtr in range(Num_Hist_Qtrs):

        soup_two,soup_xml = scrape_data(requested_cik,Qtr)

        # The "formContent" div of the filing index page holds infoHead/info
        # divs; the one at position 7 is read as the period of report.
        table_tag = soup_two.find('div', class_=['formContent'])
        table_tag2 = table_tag.find_all('div', class_=['infoHead','info'])
        PeriodofReport = table_tag2[7].text

        # One <infotable> element per reported position. Fields are read by
        # their position among the element's descendants: 0 = issuer name,
        # 2 = CUSIP, 5 = share count (3 would be the dollar value).
        positions = []
        for row in soup_xml.find_all('infotable'):
            position = row.find_all()
            positions.append({
                "NAME_OF_ISSUER": position[0].text,
                "SHARES": float(position[5].text.replace(',', ''))/1000,
                "PeriodofReport": PeriodofReport,
                "CUSIP": position[2].text,
            })
        appended_table.append(pd.DataFrame(positions))

    appended_table = pd.concat(appended_table)
    # aggfunc is given as the string "sum": passing np.sum is deprecated in
    # recent pandas, and the string keeps the same "sum" column label.
    appended_table = pd.pivot_table(appended_table,index=["NAME_OF_ISSUER","CUSIP"],values=["SHARES"],
               columns=["PeriodofReport"],aggfunc=["sum"],fill_value=0)
    return appended_table

In [26]:
#Build the holdings pivot for the configured filer and number of filings
appended_table = table13F(requested_cik=requested_cik, Num_Hist_Qtrs=Num_Hist_Qtrs)

#Display the pivot with every value shown as a comma-grouped whole number plus a "k" suffix
appended_table.style.format('{0:,.0f}k')

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,SHARES,SHARES,SHARES,SHARES,SHARES,SHARES,SHARES,SHARES,SHARES
Unnamed: 0_level_2,PeriodofReport,2018-09-30,2018-12-31,2019-03-31,2019-06-30,2019-09-30,2019-12-31,2020-03-31,2020-06-30,2020-09-30
NAME_OF_ISSUER,CUSIP,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
ABBVIE INC,00287Y109,0k,0k,0k,0k,0k,0k,0k,0k,"21,264k"
AMAZON COM INC,023135106,0k,0k,483k,537k,537k,537k,533k,533k,533k
AMERICAN AIRLS GROUP INC,02376R102,"43,700k","43,700k","43,700k","43,700k","43,700k","42,500k","41,909k",0k,0k
AMERICAN EXPRESS CO,025816109,"151,611k","151,611k","151,611k","151,611k","151,611k","151,611k","151,611k","151,611k","151,611k"
APPLE INC,037833100,"252,479k","249,589k","249,589k","249,589k","248,839k","245,156k","245,156k","245,156k","944,296k"
AXALTA COATING SYS LTD,G0750C108,"24,264k","24,264k","24,264k","24,264k","24,264k","24,264k","24,070k","24,070k","23,420k"
BANK AMER CORP,060505104,"877,249k","896,168k","896,168k","927,249k","927,249k","925,009k","925,009k","925,009k","1,010,101k"
BANK OF NEW YORK MELLON CORP,064058100,"77,849k","80,937k","80,937k","80,937k","80,937k","79,765k","79,765k","72,357k","72,357k"
BARRICK GOLD CORPORATION,067901108,0k,0k,0k,0k,0k,0k,0k,"20,919k","12,000k"
BIOGEN INC,09062X103,0k,0k,0k,0k,0k,648k,643k,643k,643k
