## Test Use
Scrapes the site back to `backtest_date` and returns any trades deemed important.

The current version dumps the list of dicts that is sent to the email formatter.

In [1]:
from requests_html import HTMLSession
from datetime import datetime,timedelta
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import sys
import re 
from bs4 import BeautifulSoup
import nums_from_string

In [2]:
# Cut-off file date ('YYYY-MM-DD') handed to backtestScrape: trades filed
# before this date are not scraped.
backtest_date = '2022-04-01'

In [3]:
def fetchSession(url):
    """Send a GET request for ``url`` through a new ``HTMLSession``.

    Returns the response object produced by the session's ``get`` call.
    """
    return HTMLSession().get(url)

def getTrades(r):
    """Return the rows of the first table in the response, minus the first row.

    Raises IndexError when the response contains no ``table`` element.
    """
    first_table = r.html.find('table')[0]
    # The leading row is dropped; everything after it is returned.
    return first_table.find('tr')[1:]

def value_to_ints(value):
    """Parse a disclosed dollar range such as ``'$1,001 - $15,000'``.

    The amounts are pulled out with a regex, so the result does not depend
    on the exact separator or whitespace between the two numbers.

    Args:
        value: Text containing two dollar amounts (thousands separators and
            ``$`` signs are allowed).

    Returns:
        ``[low, high]`` as a list of two ints.

    Raises:
        ValueError: If ``value`` does not contain exactly two amounts.
    """
    # A run of digits, optionally with thousands separators inside it.
    amounts = [
        int(match.replace(',', ''))
        for match in re.findall(r'\d[\d,]*', value)
    ]
    if len(amounts) != 2:
        raise ValueError(
            'expected two dollar amounts, got {!r}'.format(value)
        )
    low, high = amounts
    return [low, high]

In [4]:
def getHTML(url):
    """Fetch ``url`` and parse the response text with ``html.parser``.

    Returns the resulting BeautifulSoup document.
    """
    return BeautifulSoup(fetchSession(url).text, 'html.parser')

def getTicker(trade_):
    """Extract the ticker symbol written in square brackets.

    Args:
        trade_: Trade description, e.g. ``'Foo Inc. - Common Stock [FOO]'``.

    Returns:
        The text inside the first ``[...]`` pair, or ``''`` when the
        description contains no bracketed text.
    """
    # Raw string: '\[' in a plain literal is an invalid escape sequence.
    match = re.search(r'\[(.*?)\]', trade_)
    return match.group(1) if match else ''

In [5]:
def getFirstRowEntry(ticker):
    """Return the HTML of one cell from the Yahoo Finance quote summary.

    Loads the quote page for ``ticker``, takes the second table inside the
    ``quote-summary`` element (the right side table) and returns its second
    ``<td>`` cell as a string.

    Args:
        ticker: Ticker symbol inserted into the Yahoo Finance quote URL.

    Returns:
        ``str`` of the cell, or ``''`` when the summary element, the second
        table or the second cell is missing from the page.
    """
    url = 'https://finance.yahoo.com/quote/{}/'.format(ticker)
    soup = getHTML(url)
    quote_summary = soup.find(id='quote-summary')
    if quote_summary is None:
        return ''
    tables = quote_summary.find_all('table')
    # Index 1 is read below, so a single table is not enough.
    if len(tables) < 2:
        return ''
    # right side table
    mc_cells = tables[1].find_all('td')
    # Same reasoning: the second cell is the one we want.
    if len(mc_cells) < 2:
        return ''
    return str(mc_cells[1])

def isStock(row_one):
    """Classify a quote-summary cell by its ``data-test`` marker.

    Returns 1 for a ``MARKET_CAP`` marker, 0 for ``NET_ASSETS`` and -1 when
    the marker is anything else or is absent.
    """
    found = re.search('data-test="(.*)-value', row_one)
    if found is None:
        return -1
    # Any marker not listed here counts as N/A.
    codes = {'MARKET_CAP': 1, 'NET_ASSETS': 0}
    return codes.get(found.group(1), -1)

def parseToMillions(value_string):
    """Convert a value such as ``'1.5B'`` to a number in units of millions.

    The first number found in ``value_string`` is scaled by the trailing
    unit character: ``B`` multiplies by 1000, ``T`` by 1000000, and any
    other ending leaves the number unchanged.
    """
    amount = nums_from_string.get_nums(value_string)[0]
    suffix = value_string[-1:]
    # keep in units of millions
    if suffix == 'T':
        return amount * 1000000
    if suffix == 'B':
        return amount * 1000
    return amount

def getNAVCAP(row_one):
    """Read the value shown in a quote-summary cell, in millions.

    Args:
        row_one: HTML string of the cell; the text between the first ``>``
            and the last ``<`` on a line is taken as the value.

    Returns:
        The value converted by ``parseToMillions`` and rounded to two
        decimals, or -1 when the cell holds ``'N/A'`` or has no
        ``>...<`` content at all (for example an empty string).
    """
    match = re.search('>(.*)<', row_one)
    # No tag content to read: treat like N/A instead of failing on None.
    if match is None:
        return -1
    value = match.group(1)
    if value == 'N/A':
        return -1
    return round(parseToMillions(value), 2)

In [6]:
def cleanNewsURLQuery(trade):
    """Build a Google News search URL for a trade description.

    Commas are dropped from ``trade`` and the rest is percent-encoded, so
    characters such as ``&``, ``#`` or ``+`` in a company name stay inside
    the ``q`` parameter instead of breaking the query string.

    Args:
        trade: Trade description used as the search text.

    Returns:
        The full ``https://news.google.com/search`` URL.
    """
    from urllib.parse import quote

    # safe='' so that every reserved character, including '/', is encoded.
    query = quote(trade.replace(',', ''), safe='')
    return (
        'https://news.google.com/search?q={}&hl=en-US&gl=US&ceid=US%3Aen'.format(
            query
        )
    )

def getArticleTextFromUrl(url):
    """Return every ``<article>`` element found at ``url`` as one string.

    The page is loaded with ``getHTML`` and the list of matching elements
    is converted with ``str``.
    """
    return str(getHTML(url).find_all('article'))

def findNOccurrence(str, sub, n):
    """Return the index of the ``n``-th occurrence of ``sub`` in ``str``.

    Args:
        str: Text to search (the name shadows the builtin but is kept so
            existing callers are not affected).
        sub: Substring to look for.
        n: 1-based occurrence number.

    Returns:
        Start index of the ``n``-th occurrence, or -1 when ``str`` holds
        fewer than ``n`` occurrences (or ``n`` is less than 1).
    """
    val = -1
    for _ in range(n):
        val = str.find(sub, val + 1)
        # Stop on a miss; searching on would restart from index 0 and
        # report an earlier occurrence as the n-th one.
        if val == -1:
            break
    return val

def getNewsUrlsTitles(full_article_string):
    """Pull title/URL pairs out of the first three articles in a string.

    The input is split on ``'</article>'``; the piece after the last
    closing tag is ignored and at most three of the remaining pieces are
    processed. For each piece, parsing starts at the second ``'href'``.

    Returns a list of ``{'title': ..., 'url': ...}`` dicts.
    """
    pieces = full_article_string.split('</article>')[:-1]
    found = []
    for piece in pieces[:3]:
        tail = piece[findNOccurrence(piece, 'href', 2):]
        link = tail[tail.find('articles'):tail.find('">')]
        # Text between the first '>' and the first '<', without the '>'.
        headline = tail[tail.find('>'):tail.find('<')][1:]
        found.append(
            {
                'title': headline,
                'url': 'news.google.com/{}'.format(link),
            }
        )
    return found

In [7]:
# backtestDate = 'YYYY-MM-DD'
def backtestScrape(backtestDate):
    """Collect important Senate stock purchases filed on or after a date.

    Fetches the sec.report Senate disclosure table and walks it two rows at
    a time (each trade is read from a pair of consecutive rows), stopping
    at the first trade whose file date is earlier than ``backtestDate``.

    A purchase is kept only when its Yahoo Finance cell is a market-cap
    cell (``isStock`` returns 1) and one of these holds, with the market
    cap in millions and the trade value taken from the low end of the
    disclosed range:
      * small cap (0 < cap < 2000), any trade value
      * medium cap (2000 <= cap <= 10000), value >= 50000
      * large cap (cap > 10000), value >= 100000

    Args:
        backtestDate: Cut-off file date as a 'YYYY-MM-DD' string.

    Returns:
        A list of dicts with the keys 'trade date', 'file date',
        'senator', 'trade', 'trade type', 'value', 'mkt cap' and
        'yahoo finance'.

    Exits the process with status 1 when no table is found on the page.
    """
    r = fetchSession('https://sec.report/Senate-Stock-Disclosures')
    # if website is down
    try:
        trades = getTrades(r)
    except IndexError:
        print('website may be down. quitting.')
        sys.exit(1)

    all_trades = []
    dt_backtest = datetime.strptime(backtestDate, '%Y-%m-%d').date()

    # Stop one short of the end so an unpaired final row is skipped
    # instead of raising IndexError on trades[i + 1].
    for i in range(0, len(trades) - 1, 2):
        l1_elements = trades[i].find('td')
        l2_elements = trades[i + 1].find('td')[:-1]

        # stop at the first trade filed before the backtest date
        file_date, trade_date = l1_elements[0].text.split('\n')
        file_dt = datetime.strptime(file_date, '%Y-%m-%d').date()
        if file_dt < dt_backtest:
            break

        # ensure trade is a purchase, otherwise continue to next trade
        trade_type = l2_elements[0].text.split('\n', 1)[0]
        if trade_type != 'Purchase':
            continue

        trade = l1_elements[1].text
        ticker = getTicker(trade)
        # move on if ticker is invalid
        if ticker == '':
            continue

        row_one = getFirstRowEntry(ticker)
        # isStock returns -1 for "unknown", which is truthy, so compare
        # against 1 explicitly. This also skips an empty row_one.
        if isStock(row_one) != 1:
            continue

        mkt_cap = getNAVCAP(row_one)
        value = value_to_ints(l2_elements[1].text)

        # any small caps, medium purchase medium caps, large purchase large cap
        if 0 < mkt_cap < 2000:
            cap_string = 'small'
        elif 2000 <= mkt_cap <= 10000 and value[0] >= 50000:
            cap_string = 'medium'
        elif mkt_cap > 10000 and value[0] >= 100000:
            cap_string = 'large'
        else:
            continue

        senator = l1_elements[2].text.split(' [')[0]
        all_trades.append(
            {
                'trade date': trade_date,
                'file date': file_date,
                'senator': senator,
                'trade': trade,
                'trade type': trade_type,
                'value': value,
                'mkt cap': cap_string,
                'yahoo finance': 'https://finance.yahoo.com/quote/{}/'.format(ticker),
            }
        )
    return all_trades

In [8]:
def cleanText(trades_list):
    """Turn scraped trade dicts into display-ready dicts with news links.

    For each trade this formats the trade date with its age in days, the
    value range as dollar amounts and the market-cap label, then looks up
    Google News articles for the trade description. News titles and URLs
    are added only when more than one article was found; up to three are
    included.

    Args:
        trades_list: Dicts with the keys 'trade date', 'file date',
            'senator', 'trade', 'value', 'mkt cap' and 'yahoo finance'.

    Returns:
        A list of dicts with the keys 'Trade Date', 'File Date',
        'Senator', 'Equity', 'Trade Value', 'Market Cap', 'Yahoo!' and,
        when news is attached, 'Title N' followed by 'URLN'.
    """
    cap_labels = {
        'small': 'Small Cap (Under $2B)',
        'medium': 'Medium Cap ($2B to $10B)',
    }
    trades_for_txt = []
    for t in trades_list:
        age_days = (
            datetime.today().date()
            - datetime.strptime(t['trade date'], '%Y-%m-%d').date()
        ).days
        # Built from the day count so a same-day trade reads "0 days"
        # rather than the "0:00:00" that str(timedelta) produces.
        trade_date = '{} ({} day{} ago)'.format(
            t['trade date'], age_days, '' if abs(age_days) == 1 else 's'
        )

        value_string = '${:,} to ${:,}'.format(t['value'][0], t['value'][1])

        # Anything that is not small or medium is labelled large.
        mkt_cap_string = cap_labels.get(t['mkt cap'], 'Large Cap (Over $10B)')

        list_of_titles_urls = getNewsUrlsTitles(
            getArticleTextFromUrl(
                cleanNewsURLQuery(
                    t['trade']
                )
            )
        )

        entry = {
            'Trade Date': trade_date,
            'File Date': t['file date'],
            'Senator': t['senator'],
            'Equity': t['trade'],
            'Trade Value': value_string,
            'Market Cap': mkt_cap_string,
            'Yahoo!': t['yahoo finance'],
        }
        if len(list_of_titles_urls) > 1:
            # Use however many articles exist (two or three); indexing a
            # fixed third entry fails when only two were found.
            news = list_of_titles_urls[:3]
            # All titles first, then all URLs, to keep the key order.
            for idx, article in enumerate(news, start=1):
                entry['Title {}'.format(idx)] = article['title']
            for idx, article in enumerate(news, start=1):
                entry['URL{}'.format(idx)] = article['url']
        trades_for_txt.append(entry)
    return trades_for_txt

In [9]:
# Scrape the disclosures back to backtest_date and keep the important trades.
trades = backtestScrape(backtest_date)
# Format each trade for display and attach news titles/URLs where available.
cleaned_trades = cleanText(trades)

In [10]:
# Print each cleaned trade as "key : value" lines; the Yahoo! link is
# printed on its own without a label. A blank gap separates trades.
for t in cleaned_trades:
    for key, item in t.items():
        print(
            ('%s' % (item))
            if key == 'Yahoo!'
            else ('%s : %s' % (key, item))
        )
    print('\n')

Trade Date : 2022-03-30 (34 days ago)
File Date : 2022-04-08
Senator : Thomas H Tuberville
Equity : Limestone Bancorp, Inc. - Common Stock [LMST]
Trade Value : $1,001 to $15,000
Market Cap : Small Cap (Under $2B)
https://finance.yahoo.com/quote/LMST/
Title 1 : LIMESTONE BANCORP, INC. Management's Discussion and Analysis of Financial Condition and Results of Operations (form 10-Q)
Title 2 : Limestone Bancorp Stock Forecast, Price &amp; News (NASDAQ:LMST)
Title 3 : Zacks.com featured highlights include: BancFirst Corp., Fulton Financial Corp., Limestone Bancorp, Inc. and Cowen Inc.
URL1 : news.google.com/articles/CBMitgFodHRwczovL3d3dy5tYXJrZXRzY3JlZW5lci5jb20vcXVvdGUvc3RvY2svTElNRVNUT05FLUJBTkNPUlAtSU5DLTQ0MzE1ODQ2L25ld3MvTElNRVNUT05FLUJBTkNPUlAtSU5DLU1hbmFnZW1lbnQtcy1EaXNjdXNzaW9uLWFuZC1BbmFseXNpcy1vZi1GaW5hbmNpYWwtQ29uZGl0aW9uLWFuZC1SZXN1bHRzLTQwMjIzNDExL9IBugFodHRwczovL3d3dy5tYXJrZXRzY3JlZW5lci5jb20vYW1wL3F1b3RlL3N0b2NrL0xJTUVTVE9ORS1CQU5DT1JQLUlOQy00NDMxNTg0Ni9uZXdzL0xJTUVTVE9ORS1CQ

### Converting data to JSON

In [24]:
import json

# Write the cleaned trades to output_test.json as JSON indented by 4 spaces.
with open('output_test.json', 'w') as f:
    json.dump(cleaned_trades, f, indent=4)
