In [1]:
import datetime
import os
import re
import unicodedata

import dateutil.relativedelta
import numpy as np
import pandas as pd
import pandas_market_calendars as mcal
import quandl
import requests
from bs4 import BeautifulSoup, SoupStrainer, BeautifulStoneSoup

import config

### Table of Contents
1. Define Functions <br>
    a. Get list of SEC docs for CIK<br>
    b. Extract text, date from each link<br>
    c. Get price given ticker, date from Quandl<br>
    d. Get movement given ticker, date<br>
    e. Get index movement<br>
    f. Check if date is a weekday, and if necessary, adjust to Friday before<br>
    g. Calculate dates for month before, quarter before, year before for event movement calculations
2. Download 8Ks
3. Download Stock Movements
### Process
1. For each ticker in S&P 500:<br>
    a. Get list of links to all 8Ks for a CIK<br>
    b. Extract and clean up corpus<br>
    c. Extract date and time of release<br>
2. Stock Movements<br>
    a. Calculate 1 Day movement (before and after release)<br>
    b. Calculate 1 Month, 1 Quarter, 1 Year moving averages<br>
    c. Get VIX at publication release<br>
3. NLP

### Get Corpus Info

In [15]:
# Returns Dataframe of document links for a given CIK
def get_sec_docs(cik,ticker,dateb="20180401",timeout=30):
    """Build a DataFrame of 8-K filing text links for one company.

    Queries the EDGAR company browser (XML output) for 8-K filings of `cik`,
    keeps every `filinghref` entry that ends in ".htm", and rewrites each
    "...-index.htm" index page link into the matching ".txt" document link.

    Parameters
    ----------
    cik : CIK identifier sent to EDGAR as the "CIK" query parameter.
    ticker : Ticker symbol; copied into every row, not used in the request.
    dateb : Value of the "dateb" query parameter (default "20180401", the
        value this notebook has always used).
    timeout : Seconds to wait for EDGAR before `requests` raises, so a single
        stalled connection cannot hang a crawl over many companies.

    Returns
    -------
    pandas.DataFrame with columns "cik", "ticker", "txt_link", "doc_name",
    one row per filing link (empty if no links were found).
    """
    base_url = "https://www.sec.gov/cgi-bin/browse-edgar"
    payload = {
        "action" : "getcompany",
        "CIK" : cik,
        "type" : "8-K",
        "output":"xml",
        "dateb" : dateb,
    }
    sec_response = requests.get(url=base_url,params=payload,timeout=timeout)
    soup = BeautifulSoup(sec_response.text,'lxml')

    doc_list = []
    doc_name_list = []
    for tag in soup.findAll('filinghref'):
        link = tag.string
        # Skip tags without text and links that are not ".htm" index pages
        if not link or link.split(".")[-1] != 'htm':
            continue
        # "...-index.htm" -> "...-index.html" -> "....txt"
        txt_doc = (link + "l").replace("-index.html",".txt")
        doc_list.append(txt_doc)
        doc_name_list.append(txt_doc.split("/")[-1])

    # Create dataframe of CIK, ticker, txt link, and doc name
    return pd.DataFrame(
        {
        "cik" : [cik]*len(doc_list),
        "ticker" : [ticker]*len(doc_list),
        "txt_link" : doc_list,
        "doc_name": doc_name_list
        }
    )

# Extracts text and submission datetime from document link
def extract_text(link):
    """Download one filing and return its cleaned text and acceptance time.

    Parameters
    ----------
    link : URL of the filing's ".txt" document.

    Returns
    -------
    (text, submission_dt)
        text : str, the text of every top-level <html> section with tables
            removed, NFKD-normalized, tabs/newlines replaced by spaces, and
            sections separated by a single space ("" if there are none).
        submission_dt : datetime.datetime parsed from the first 14 characters
            (YYYYMMDDHHMMSS) of the <acceptance-datetime> tag.

    Raises
    ------
    AttributeError if the document has no <acceptance-datetime> tag;
    ValueError if its content is not a valid timestamp.
    """
    r = requests.get(link,timeout=30)
    #Parse 8-K document
    filing = BeautifulSoup(r.content,"lxml",from_encoding="ascii")
    #Extract datetime
    submission_dt = filing.find("acceptance-datetime").string[:14]
    submission_dt = datetime.datetime.strptime(submission_dt,"%Y%m%d%H%M%S")

    #Extract HTML sections
    sections = []
    for section in filing.findAll("html"):
        # A nested <html> is already covered by its ancestor's text
        if section.find_parent("html") is not None:
            continue
        #Remove tables
        for table in section("table"):
            table.decompose()
        #Convert to unicode
        text = unicodedata.normalize("NFKD",section.text)
        # Only whitespace is flattened. The previous literal "/s" replacement
        # was dropped because it cut those characters out of URLs and words.
        sections.append(text.replace("\t"," ").replace("\n"," "))

    # Keep every section (previously only the last one survived the loop)
    filing_text = " ".join(sections)

    return filing_text, submission_dt

### Get Stock & Index Movements

In [3]:
# Daily index history exported from Yahoo Finance, indexed by the "Date" column
gspc_df = pd.read_csv(
    "Data/gspc.csv",
    index_col="Date",
    parse_dates=['Date'],
)  # S&P 500 (GSPC)
vix_df = pd.read_csv(
    "Data/vix.csv",
    index_col="Date",
    parse_dates=['Date'],
)  # VIX

# Quandl credentials are read from the local config module
quandl.ApiConfig.api_key = config.api_key # YOUR API KEY HERE

# NYSE calendar; its holiday list is consulted by holiday_check / weekday_check
nyse = mcal.get_calendar('NYSE')
nyse_holidays = nyse.holidays().holidays

In [5]:
# Takes ticker, period name and release datetime; returns the stock's
# index-normalized percentage move over the period preceding the release
def get_historical_movements(ticker,period,release_date):
    """Return the stock's percentage move over `period`, net of the index move.

    Two averaging windows are compared:
      * start window: [b_start, e_start], ending one `period` before release
      * end window:   [b_end, e_end], ending the day before release
    Window length is 0 extra days for "week", 5 for "month", 10 for "quarter"
    and 20 for "year". Every boundary is passed through `weekday_check`.

    Parameters
    ----------
    ticker : Ticker symbol passed to `get_quandl_data`.
    period : One of "week", "month", "quarter", "year".
    release_date : datetime of the filing release.

    Returns
    -------
    Stock percentage change minus index percentage change, where each change
    is computed by `calculate_pct_change` on the two window averages.

    Raises
    ------
    KeyError if `period` is not one of the supported names.
    """
    # period -> (offset from release to the end of the start window,
    #            number of extra days each averaging window reaches back)
    period_spec = {
        "week": (datetime.timedelta(weeks=-1), 0),
        "month": (dateutil.relativedelta.relativedelta(months=-1), 5),
        "quarter": (dateutil.relativedelta.relativedelta(months=-3), 10),
        "year": (dateutil.relativedelta.relativedelta(years=-1), 20),
    }
    if period not in period_spec:
        raise KeyError(period)
    lookback, window_days = period_spec[period]
    window = dateutil.relativedelta.relativedelta(days=-window_days)

    e_start = release_date + lookback
    b_start = e_start + window
    e_end = release_date + dateutil.relativedelta.relativedelta(days=-1)
    b_end = e_end + window

    e_start = weekday_check(e_start)
    b_start = weekday_check(b_start)
    e_end = weekday_check(e_end)
    b_end = weekday_check(b_end)

    start_price = get_quandl_data(ticker=ticker,start_date = b_start, end_date = e_start)
    end_price = get_quandl_data(ticker=ticker,start_date = b_end, end_date = e_end)
    stock_change = calculate_pct_change(end_price,start_price)

    start_index = get_index_price(start_date = b_start, end_date = e_start)
    # Same [b_end, e_end] window as the stock. The previous e_start..e_end
    # window averaged the index over the entire look-back period.
    end_index = get_index_price(start_date = b_end, end_date = e_end)
    index_change =  calculate_pct_change(end_index,start_index)

    normalized = stock_change - index_change
    return normalized

def get_quandl_data(ticker,start_date,end_date=None,market_open=False):
    """Fetch one WIKI price column from Quandl and average it over a date range.

    Parameters
    ----------
    ticker : Ticker symbol; the dataset code is "WIKI/<ticker>".
    start_date : First day of the range (any object with `strftime`).
    end_date : Last day of the range. Defaults to `start_date`, giving a
        single-day lookup, which is how `get_change` calls this function.
    market_open : If true, request column 8, otherwise column 11.
        NOTE(review): presumably adjusted open / adjusted close in the WIKI
        schema; confirm against the dataset documentation.

    Returns
    -------
    The result of `.mean()` on whatever `quandl.get` returns for the range.
    """
    if end_date is None:
        end_date = start_date

    column = "8" if market_open else "11"
    quandl_param = "WIKI/" + ticker + "." + column

    # Quandl takes plain YYYY-MM-DD strings; any time-of-day is dropped here
    end_date = end_date.strftime("%Y-%m-%d")
    start_date = start_date.strftime("%Y-%m-%d")
    price = quandl.get(quandl_param,start_date=start_date,end_date=end_date)
    return price.mean()

# Takes ticker, 8K release date, checks time of release and then calculate before and after price change
def _next_trading_day(day):
    """Return the first day after `day` that `weekday_check` leaves unchanged."""
    candidate = day + datetime.timedelta(days=1)
    # weekday_check only rolls backward, so step forward until it is a no-op
    while weekday_check(candidate) != candidate:
        candidate = candidate + datetime.timedelta(days=1)
    return candidate

def get_change(ticker,release_date):
    """Return the stock's percentage move around a release, net of the index.

    The comparison points depend on when the filing was released:
      * after 16:00  -> release-day close vs. next trading day's open
      * before 09:30 -> previous trading day's close vs. release-day open
      * otherwise    -> release-day open vs. release-day close

    Parameters
    ----------
    ticker : Ticker symbol passed to `get_quandl_data`.
    release_date : datetime of the filing release (time-of-day is significant).

    Returns
    -------
    Stock percentage change minus index percentage change, each computed by
    `calculate_pct_change`.

    Note: the VIX lookup that used to be computed here was never returned or
    used, so it has been removed.
    """
    market_close = release_date.replace(hour=16,minute=0,second=0)
    market_open = release_date.replace(hour=9,minute=30,second=0)
    # Daily price/index data is keyed by day, so strip the time-of-day before
    # using the release date as a lookup key.
    release_day = release_date.replace(hour=0,minute=0,second=0,microsecond=0)

    if release_date > market_close:
        # Released after the close
        start_date = release_day
        end_date = _next_trading_day(release_day)
        before_at_open, after_at_open = False, True
    elif release_date < market_open:
        # Released before the open
        start_date = weekday_check(release_day + datetime.timedelta(days=-1))
        end_date = release_day
        before_at_open, after_at_open = False, True
    else:
        # Released during market hours
        start_date = release_day
        end_date = release_day
        before_at_open, after_at_open = True, False

    # Single-day lookups: the same date is passed as both range boundaries
    price_before_release = get_quandl_data(ticker,start_date,start_date,market_open=before_at_open)
    price_after_release = get_quandl_data(ticker,end_date,end_date,market_open=after_at_open)

    index_before_release = get_index_price(start_date,start_date,market_open=before_at_open)
    index_after_release = get_index_price(end_date,end_date,market_open=after_at_open)

    price_pct_change = calculate_pct_change(price_after_release,price_before_release)
    index_pct_change = calculate_pct_change(index_after_release,index_before_release)
    normalized_change = price_pct_change - index_pct_change

    return normalized_change

def get_index_price(start_date,end_date,market_open=False):
    """Average the S&P 500 index over [start_date, end_date], inclusive.

    Reads the "Open" column of `gspc_df` when `market_open` is True and the
    "Adj Close" column otherwise, and returns the mean of the selected rows.
    """
    column = "Open" if market_open == True else "Adj Close"

    window_start = np.datetime64(start_date)
    window_end = np.datetime64(end_date)
    in_window = (gspc_df.index >= window_start) & (gspc_df.index <= window_end)

    return gspc_df.loc[in_window, column].mean()

def calculate_pct_change(end_value,start_value):
    """Return the change from `start_value` to `end_value` as a percentage.

    The fractional change is rounded to 4 decimal places and then scaled by 100.
    """
    fractional_change = (end_value - start_value) / start_value
    return round(fractional_change,4) * 100

def weekday_check(date):
    """Roll `date` back to the most recent day that is neither a weekend nor a holiday.

    Weekends are detected with `isoweekday()`; holidays are detected by
    `holiday_check` moving the date. The checks repeat until neither applies,
    so runs such as consecutive closures followed by a weekend are handled.
    A fixed sequence of single checks could previously return a Sunday.

    Returns `date` unchanged when it already is such a day.
    """
    while True:
        weekday = date.isoweekday()
        if weekday >= 6:
            # Saturday (6) -> back 1 day, Sunday (7) -> back 2 days: preceding Friday
            date = date + datetime.timedelta(days=5 - weekday)
            continue

        rolled_back = holiday_check(date)
        if rolled_back == date:
            return date
        # It was a holiday; re-check the new date from the top
        date = rolled_back

def holiday_check(date):
    """Return the previous calendar day if `date` falls on an NYSE holiday.

    Only the calendar day of `date` is compared against `nyse_holidays`.
    Without that, a datetime carrying a time-of-day (as filing timestamps do)
    would not compare equal to a day-level holiday entry, and the check would
    silently never fire.
    NOTE(review): assumes `nyse_holidays` holds day-precision numpy datetime64
    values; confirm against pandas_market_calendars.

    The returned value keeps the original time-of-day of `date`.
    """
    day = np.datetime64(date.strftime("%Y-%m-%d"))
    if day in nyse_holidays:
        date = date + datetime.timedelta(days=-1)
    return date


### Features
1. Publication Date
2. Publication Category
3. Recent Movements (Normalized with S&P 500 Movements)<br>
    a. 1 Day<br>
    b. 1 Month (5 Day MA)<br>
    c. 1 Quarter (10 Day MA)<br>
    d. 1 Year (20 Day MA)<br>
4. VIX at publication release
5. Publication Corpus<br>
    a. Unigram<br>
    b. NMF

In [6]:
# Get Current S&P 500 List, Stock Ticker, and CIK
# The first table on the Wikipedia page is read with its first column as the index.
wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
cik_df = pd.read_html(wiki_url,header=[0],index_col=0)[0]
# Preview only; the full table is long and bloats the saved notebook
cik_df.head(10)

Unnamed: 0_level_0,Security,SEC filings,GICS Sector,GICS Sub Industry,Address of Headquarters,Date first added[3][4],CIK
Ticker symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",,66740
ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800
ABBV,AbbVie Inc.,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152
ACN,Accenture plc,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373
ATVI,Activision Blizzard,reports,Information Technology,Home Entertainment Software,"Santa Monica, California",2015-08-31,718877
AYI,Acuity Brands Inc,reports,Industrials,Electrical Components & Equipment,"Atlanta, Georgia",2016-05-03,1144215
ADBE,Adobe Systems Inc,reports,Information Technology,Application Software,"San Jose, California",1997-05-05,796343
AMD,Advanced Micro Devices Inc,reports,Information Technology,Semiconductors,"Sunnyvale, California",2017-03-20,2488
AAP,Advance Auto Parts,reports,Consumer Discretionary,Automotive Retail,"Roanoke, Virginia",2015-07-09,1158449
AES,AES Corp,reports,Utilities,Independent Power Producers & Energy Traders,"Arlington, Virginia",1998-10-02,874761


In [7]:
# Map each index label of cik_df (the ticker) to its CIK, then fetch the
# 8-K link table for every company and stack the results row-wise.
company_list = cik_df['CIK'].to_dict()
df_list = [
    get_sec_docs(cik,ticker)
    for ticker, cik in company_list.items()
]
crawled_df = pd.concat(df_list,axis=0)

FileNotFoundError: [Errno 2] No such file or directory: 'Pickles/sec_links_df.pkl'

In [10]:
# Cache the crawled link table; create the target folder first so the write
# does not fail on a fresh checkout where "Pickles/" does not exist yet.
os.makedirs("Pickles",exist_ok=True)
crawled_df.to_pickle("Pickles/doc_links_df.pkl")

In [None]:
# Download every filing once; each call yields a (text, release datetime) pair,
# which is then unzipped into two new columns.
extracted_pairs = crawled_df['txt_link'].apply(extract_text)
crawled_df['text'], crawled_df['release_date'] = zip(*extracted_pairs)
crawled_df.head()

In [None]:
# Cache the downloaded texts; create the target folder first so the write
# does not fail when "Pickles/" does not exist yet.
os.makedirs("Pickles",exist_ok=True)
crawled_df.to_pickle("Pickles/doc_texts.pkl")

In [1]:
# Preview only; the frame holds full filing texts, so never display it whole
crawled_df.head()

NameError: name 'crawled_df' is not defined