In [114]:
from bs4 import BeautifulSoup, SoupStrainer, BeautifulStoneSoup
import datetime
import unicodedata
import requests
import pandas as pd
import quandl
import config
import dateutil.relativedelta
import re
import numpy as np

### Table of Contents
1. Define Functions <br>
    a. Get list of SEC docs for CIK<br>
    b. Extract text, date from each link<br>
    c. Get price given ticker, date from Quandl<br>
    d. Get movement given ticker, date<br>
    e. Get index movement<br>
    f. Check if date is a weekday, and if necessary, adjust to Friday before<br>
    g. Calculate dates for month before, quarter before, year before for event movement calculations
2. Download 8Ks
3. Download Stock Movements
### Process
1. For each ticker in S&P 500:<br>
    a. Get list of links to all 8Ks for a CIK<br>
    b. Extract and clean up corpus<br>
    c. Extract date and time of release<br>
2. Stock Movements<br>
    a. Calculate 1 Day movement(before and after release)<br>
    b. Calculate 1 Month, 1 Quarter, 1 Year moving averages<br>
    c. Get VIX at publication release<br>
3. NLP

### Get Corpus Info

In [4]:
# Returns Dataframe of document links for a given CIK
def get_sec_docs(cik="0001065088"):
    """Build a table of 8-K text-document links for one company.

    Queries the EDGAR company browser for the given CIK (XML output, filing
    type 8-K), keeps every ``filinghref`` whose last dot-separated segment is
    ``htm``, and rewrites it from the ``-index.htm`` page to the matching
    ``.txt`` submission file.

    Parameters
    ----------
    cik : str
        Central Index Key sent to EDGAR as the ``CIK`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per kept filing with columns ``cik``, ``txt_link`` and
        ``doc_name`` (the last path segment of ``txt_link``).
    """
    # Other EDGAR query options that were left disabled: "dateb", "count", "owner"
    query = {
        "action" : "getcompany",
        "CIK" : cik,
        "type" : "8-K",
        "output":"xml"
    }
    response = requests.get(url="https://www.sec.gov/cgi-bin/browse-edgar",params=query)
    feed = BeautifulSoup(response.text,'lxml')

    txt_links = []
    for tag in feed.findAll('filinghref'):
        href = tag.string
        # Only index pages ending in "htm" are converted; everything else is skipped
        if href.rsplit(".",1)[-1] != 'htm':
            continue
        # "...-index.htm" -> "...-index.html" -> "....txt"
        txt_links.append((href + "l").replace("-index.html",".txt"))

    return pd.DataFrame(
        {
        "cik" : [cik]*len(txt_links),
        "txt_link" : txt_links,
        "doc_name": [url.split("/")[-1] for url in txt_links]
        }
    )

# Extracts text and submission datetime from document link
def extract_text(link):
    """Download a filing and return its cleaned text plus acceptance time.

    Parameters
    ----------
    link : str
        URL of the filing document to fetch.

    Returns
    -------
    tuple
        ``(corpus, submission_dt)`` where ``corpus`` is the text of every
        ``<html>`` section of the filing (tables removed, whitespace
        flattened) joined by single spaces, and ``submission_dt`` is the
        ``datetime.datetime`` parsed from the first 14 characters of the
        ``acceptance-datetime`` tag (``%Y%m%d%H%M%S``).

    Raises
    ------
    AttributeError
        If the document has no ``acceptance-datetime`` tag.
    """
    r = requests.get(link)
    # Parse 8-K document
    filing = BeautifulSoup(r.content,"html.parser",from_encoding="ascii")
    # Extract datetime
    submission_dt = filing.find("acceptance-datetime").string[:14]
    submission_dt = datetime.datetime.strptime(submission_dt,"%Y%m%d%H%M%S")

    # Collect the text of EVERY html section. Previously each pass overwrote
    # the last one, so only the final section survived, and a filing without
    # any html section raised NameError.
    section_texts = []
    for section in filing.findAll("html"):
        # Remove tables
        for table in section("table"):
            table.decompose()
        # Normalise to compatibility-decomposed unicode
        text = unicodedata.normalize("NFKD",section.text)
        # NOTE(review): "/s" is replaced literally (not the regex class \s);
        # presumably aimed at "/s/" signature markers - confirm intent.
        text = text.replace("\t"," ").replace("\n"," ").replace("/s"," ")
        section_texts.append(text)

    # A space separator keeps words from adjacent sections from fusing
    corpus = " ".join(section_texts)

    return corpus, submission_dt

### Get Stock & Index Movements

In [36]:
# S&P 500 (GSPC) history exported from Yahoo Finance, indexed by the parsed "Date" column
gspc_df = pd.read_csv(
    "Data/gspc.csv",
    index_col="Date",
    parse_dates=['Date'],
)

# Authenticate with Quandl; the key is read from the local config module (put YOUR API key there)
quandl.ApiConfig.api_key = config.api_key

In [132]:
#Takes datetime object and ticker string, returns price (opening or closing)
def get_quandl_data(ticker,end_date,market_open=False):
    """Return a single day's price for ``ticker`` from the Quandl WIKI dataset.

    Parameters
    ----------
    ticker : str
        Ticker symbol appended to the ``WIKI/`` dataset code.
    end_date : datetime.datetime or datetime.date
        Day to fetch; only the calendar date is used.
    market_open : bool, default False
        When true, column 8 of the dataset is requested, otherwise column 11
        (presumably the adjusted open and adjusted close - confirm against
        the WIKI dataset schema).

    Returns
    -------
    The first value of the returned frame (the price for that day).

    Raises
    ------
    IndexError
        If Quandl returns no rows for that day (e.g. a non-trading day).
    """
    column = ".8" if market_open else ".11"
    quandl_param = "WIKI/" + ticker + column

    # Format the date once and use it for both ends of the range. The old code
    # re-formatted the already-formatted string and raised TypeError.
    day = end_date.strftime("%Y-%m-%d")
    price = quandl.get(quandl_param,start_date=day,end_date=day).values[0,0]
    return price

def get_start_date(period,release_date):
    """Return the date one ``period`` before ``release_date``.

    Parameters
    ----------
    period : str
        One of ``"week"``, ``"month"``, ``"quarter"`` (three months) or
        ``"year"``.
    release_date : datetime.datetime
        Reference date to step back from.

    Returns
    -------
    The shifted date after it has been passed through ``weekday_check``.

    Raises
    ------
    KeyError
        If ``period`` is not one of the supported names.
    """
    # Work out how far to step back, then apply the offset in one place
    if period == "week":
        offset = datetime.timedelta(weeks=-1)
    elif period == "month":
        offset = dateutil.relativedelta.relativedelta(months=-1)
    elif period == "quarter":
        offset = dateutil.relativedelta.relativedelta(months=-3)
    elif period == "year":
        offset = dateutil.relativedelta.relativedelta(years=-1)
    else:
        raise KeyError

    # Check if the shifted date falls on a weekend
    return weekday_check(release_date + offset)

def get_recent_movements(ticker,release_date):
    """Fetch the year of closing prices that precedes a release.

    The window runs from one year before ``release_date`` (as computed by
    ``get_start_date("year", ...)``) up to the day before the release.

    Parameters
    ----------
    ticker : str
        Ticker symbol appended to the ``WIKI/`` dataset code.
    release_date : datetime.datetime
        Release date of the filing.

    Returns
    -------
    The object returned by ``quandl.get`` for column 11 of the dataset over
    that window.
    """
    # Get year of data: [one year before release, day before release]
    end_date = release_date + datetime.timedelta(days=-1)
    end_date = end_date.strftime("%Y-%m-%d")
    # get_start_date takes (period, release_date); the arguments used to be
    # swapped, which always raised KeyError.
    year_before = get_start_date("year",release_date)
    year_before = year_before.strftime("%Y-%m-%d")
    quandl_param = "WIKI/" + ticker + ".11"
    # The window starts at year_before and ends at end_date (was reversed and
    # referenced an undefined start_date).
    year_df = quandl.get(quandl_param,start_date=year_before,end_date=end_date)
    # TODO: from year, get quarterly, monthly, weekly prices
    return year_df
    
# Takes ticker, 8K release date, checks time of release and then calculate before and after price change
def get_change(ticker,release_date):
    """Return the stock's move around a release, net of the index move.

    The comparison window depends on when the filing was released relative
    to regular market hours (09:30-16:00 on the release date):

    * after the close  - release-day close vs. next weekday's open
    * before the open  - previous weekday's close vs. release-day open
    * during the day   - release-day open vs. release-day close

    Parameters
    ----------
    ticker : str
        Ticker symbol passed to ``get_quandl_data``.
    release_date : datetime.datetime
        Release timestamp of the filing.

    Returns
    -------
    Percentage change of the stock minus percentage change of the index over
    the same window, both computed by ``calculate_pct_change``.

    Notes
    -----
    Only weekends are skipped; market holidays are not handled.
    """
    market_close = release_date.replace(hour=16,minute=0,second=0,microsecond=0)
    market_open = release_date.replace(hour=9,minute=30,second=0,microsecond=0)
    one_day = datetime.timedelta(days=1)

    if release_date > market_close:
        # Released after hours: release-day close -> next weekday's open.
        start_date = release_date
        end_date = release_date + one_day
        # Roll FORWARD over the weekend. Rolling back (weekday_check) would
        # land on the release day itself, i.e. an open before the release.
        while end_date.isoweekday() > 5:
            end_date += one_day
        before_at_open, after_at_open = False, True
    elif release_date < market_open:
        # Released before the open: previous weekday's close -> release-day open.
        start_date = weekday_check(release_date - one_day)
        end_date = release_date
        before_at_open, after_at_open = False, True
    else:
        # Released during market hours: release-day open -> release-day close.
        start_date = end_date = release_date
        before_at_open, after_at_open = True, False

    price_before_release = get_quandl_data(ticker,start_date,market_open=before_at_open)
    price_after_release = get_quandl_data(ticker,end_date,market_open=after_at_open)

    # The index is sampled at the same open/close points as the stock. The
    # intraday case used to read the close twice, giving a zero index move.
    index_before_release = get_index_price(start_date,market_open=before_at_open)
    index_after_release = get_index_price(end_date,market_open=after_at_open)

    price_pct_change = calculate_pct_change(price_after_release,price_before_release)
    index_pct_change = calculate_pct_change(index_after_release,index_before_release)

    return price_pct_change - index_pct_change

def get_index_price(input_date,market_open):
    """Return the index level from ``gspc_df`` for one calendar day.

    Parameters
    ----------
    input_date : datetime.datetime
        Timestamp whose calendar date selects the row.
    market_open : bool
        When true the ``"Open"`` column is read, otherwise ``"Adj Close"``.

    Returns
    -------
    A scalar price, or ``numpy.nan`` when ``gspc_df`` has no row for that
    date.
    """
    column = "Open" if market_open else "Adj Close"
    matches = gspc_df.loc[gspc_df.index==np.datetime64(input_date.date()),column]
    # Return a scalar, not a one-row Series: subtracting Series that carry
    # different date labels aligns on the index and produces NaN, and the
    # sibling get_quandl_data already returns a scalar.
    if matches.empty:
        return np.nan
    return matches.iloc[0]

def calculate_pct_change(end_value,start_value):
    """Return the change from ``start_value`` to ``end_value`` in percent.

    Parameters
    ----------
    end_value, start_value
        Numeric values (anything supporting subtraction, division and
        ``round``).

    Returns
    -------
    ``(end_value - start_value) / start_value`` expressed in percent and
    rounded to two decimals, i.e. the same precision as rounding the ratio
    to four decimals.

    Raises
    ------
    ZeroDivisionError
        If ``start_value`` is a plain numeric zero.
    """
    pct_change = (end_value - start_value) / start_value
    # Scale first, round last: rounding the ratio and then multiplying by 100
    # reintroduces float noise (round(0.07, 4) * 100 == 7.000000000000001).
    pct_change = round(pct_change * 100,2)
    return pct_change

def weekday_check(date):
    """Move a weekend date back to the preceding Friday.

    Saturday is shifted back one day and Sunday two days; any other day is
    returned unchanged.
    """
    # ISO weekdays: Friday is 5, Saturday 6, Sunday 7
    days_past_friday = date.isoweekday() - 5
    if days_past_friday > 0:
        return date + datetime.timedelta(days=-days_past_friday)
    return date

### Features
1. Publication Date
2. Publication Category
3. Recent Movements (Normalized with S&P 500 Movements)<br>
    a. 1 Day<br>
    b. 1 Month (5 Day MA)<br>
    c. 1 Quarter (10 Day MA)<br>
    d. 1 Year (20 Day MA)<br>
4. VIX at publication release
5. Publication Corpus<br>
    a. Unigram<br>
    b. NMF

In [37]:
# ebay_df was never created anywhere in the notebook, so this cell failed on a
# fresh kernel. Build it from get_sec_docs(), whose default CIK is 0001065088
# (presumably eBay, given the variable name - confirm).
ebay_df = get_sec_docs()
# Parse the second filing in the list as a smoke test of extract_text
corpus, dt = extract_text(ebay_df['txt_link'].iloc[1])

In [82]:
# NOTE(review): get_closing_price is not defined anywhere in the visible notebook; this cell
# presumably relies on a since-removed definition and would raise NameError on a fresh kernel
# unless it is defined elsewhere - confirm, then restore the function or drop the cell.
get_closing_price()

{'month_before': 37.74,
 'quarter_before': 37.64,
 'week_before': 39.82,
 'year_before': 31.83}

In [158]:
# Pull the current S&P 500 constituent table (stock ticker and CIK) from Wikipedia;
# the first table with class "wikitable" is used, indexed by its first column.
wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
cik_df = pd.read_html(
    wiki_url,
    attrs={"class":"wikitable"},
    header=[0],
    index_col=0,
)[0]
cik_df

In [None]:
# Crawl the 8-K document list of every company in the S&P 500 table.
df_list = []
# Map each index label of cik_df to its CIK (DataFrame has to_dict, not todict).
# NOTE(review): assumes the Wikipedia table exposes a column named "CIK" and that
# the index holds the ticker symbols - confirm against cik_df.columns.
company_list = cik_df["CIK"].to_dict()
for company in company_list.keys():
    # Zero-pad to 10 digits to match the CIK format of get_sec_docs' default
    cik = str(company_list[company]).zfill(10)
    # list.append mutates in place and returns None, so its result must not be
    # assigned back to df_list
    df_list.append(get_sec_docs(cik))
crawled_df = pd.concat(df_list,axis=0)