In [235]:
from requests_html import HTMLSession
from lxml import html 
from datetime import date,datetime
import smtplib, ssl
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import sys
import re 
from bs4 import BeautifulSoup
import nums_from_string

## Helper Methods

In [236]:
def fetchSession(url):
    session = HTMLSession()
    r = session.get(url)
    return r

def getTrades(r):
    table = r.html.find('table')[0]
    rows = table.find('tr')
    return rows[1:]

def value_to_ints(value):
    bad_chars = [
        ',','$','-'
    ]
    for c in bad_chars:
        value = value.replace(c,'')
    low, high = [
        int(x) for x in (value.split('  ', 1))
    ]
    return [low,high]

In [237]:
def getHTML(url):
    r = fetchSession(url)
    h = r.text
    doc = BeautifulSoup(h, 'html.parser')
    return doc

def getTicker(trade_):
    try:
        return re.findall('\[(.*?)\]', trade_)[0]
    except IndexError:
        return ''

In [238]:
def getFirstRowEntry(ticker):
    url = 'https://finance.yahoo.com/quote/{}/'.format(ticker)
    soup = getHTML(url)
    quote_summary = soup.find(id='quote-summary')
    if quote_summary is None:
        return ''
    tables = quote_summary.find_all('table')
    if len(tables) == 0:
        return ''
    # right side table
    mc_table = tables[1]
    # get all rows
    mc_rows = mc_table.find_all('td')
    # entire row 
    mc_string = str(mc_rows[1])
    return mc_string

def isStock(row_one):
    flag = 'data-test="(.*)-value'
    seach = re.search(
        flag, row_one
    )
    if seach is None:
        return -1
    marker = seach.group(1)
    if marker == 'MARKET_CAP':
        return 1
    elif marker == 'NET_ASSETS':
        return 0
    # N/A
    else:
        return -1

def parseToMillions(value_string):
    unit = value_string[-1:]
    number = nums_from_string.get_nums(value_string)[0]
    #keep in units of millions
    if unit == 'B':
        number = number * 1000
    elif unit == 'T':
        number = number * 1000000
    return number

def getNAVCAP(row_one):
    value = re.search('>(.*)<', row_one).group(1)
    if value == 'N/A':
        return -1
    return round(parseToMillions(value),2)

In [239]:
def writeToFile(trades):
    with open('data/daily_trades.txt', 'w') as f:
        for t in trades:
            for (key,item) in t.items():
                if key == 'Yahoo!':
                    f.write(
                        '%s\n' % (
                        item
                        )
                    )
                else:
                    f.write(
                        '%s : %s\n' % (
                        key,item
                        )
                    )
            f.write('\n')

def sendEmail(toList = False):
    port = 465
    send_email = 'ders.mailbot@gmail.com'
    with open('data/password.txt','r') as f:
        password = f.read()

    with open('data/daily_trades.txt', 'r') as f:
        data = f.read()
    
    # get list of emails from text file in data folder 
    recipients = []
    if toList:
        with open('data/mailing_list.txt','r') as f:
            lines = f.readlines()
        for l in lines:
            recipients.append(l.strip())
    else:
        recipients = [send_email]
    # if the length of the string from the file is not 0, then there was a 
    # (major) trade executed today
    if len(data) != 0:
        print('major trade found.')
        message = MIMEMultipart('alternative')
        message['Subject'] = 'Trade Alert'
        message['From'] = send_email
        message['To'] = ', '.join(recipients) # change post testing
        message['Bcc'] = ''
        body = MIMEText(data, 'plain')
        message.attach(body)

        context = ssl.create_default_context()
        with smtplib.SMTP_SSL('smtp.gmail.com', port, context=context) as server:
            server.login(send_email, password)
            server.sendmail(
                send_email, recipients, message.as_string()
            )
            print('mail sent.')
    else:
        print('no major trades.')

In [240]:
def cleanNewsURLQuery(trade):
    return (
        'https://news.google.com/search?q={}&hl=en-US&gl=US&ceid=US%3Aen'.format(
            trade.replace(' ', '%20').replace(',', '').replace('[','%5B').replace(']','%5D')
            )
        )

def getArticleTextFromUrl(url):
    soup = getHTML(url)
    articles = soup.find_all('article')
    return str(articles)

def findNOccurrence(str, sub, n):
    val = -1
    for i in range(0,n):
        val = str.find(sub, val + 1)
    return val

def getNewsUrlsTitles(full_article_string):
    list_of_urls_titles = []
    list_of_articles = full_article_string.split('</article>')
    i = 0
    for a in list_of_articles[:-1]:
        if i > 2:
            break
        start_ind = findNOccurrence(
            a, 'href', 2
        )
        slice = a[start_ind:]
        url = slice[slice.find('articles'):slice.find('">')]
        title = slice[slice.find('>'):slice.find('<')][1:]
        list_of_urls_titles.append(
            {
                'title':title,
                'url':'news.google.com/{}'.format(url)
            }
        )
        i += 1
    return list_of_urls_titles

## Testing

### Method to scrape trades with mkt cap and purchase logic all together

In [241]:
def scrapeImportantTrades(today=datetime.today().date(), onlyToday=False):
    r = fetchSession('https://sec.report/Senate-Stock-Disclosures')
    # if website is down
    try:
        trades = getTrades(r)
    except IndexError:
        print('website may be down. quitting.')
        sys.exit(1)

    n = len(trades)
    all_trades = []

    for i in range(0,n,2):
        imp_trade = False
        l1_elements = trades[i].find('td')
        l2_elements = trades[i+1].find('td')[:-1]

        # make sure trade happened today before doing anything 
        file_date, trade_date = l1_elements[0].text.split('\n')
        if file_date != today and onlyToday:
            break

        # ensure trade is a purchase, otherwise contniue to next trade
        trade_type = l2_elements[0].text.split('\n', 1)[0]
        if trade_type != 'Purchase':
            continue

        trade = l1_elements[1].text
        senator = l1_elements[2].text
        value = value_to_ints(l2_elements[1].text)
        
        ticker = getTicker(trade)
        # move on if ticker is invalid
        if ticker == '':
            continue

        row_one = getFirstRowEntry(ticker)
        mkt_cap = getNAVCAP(row_one)
        small_mktCap = mkt_cap < 2000 and mkt_cap > 0
        medium_mktCap = mkt_cap >= 2000 and mkt_cap <= 10000
        large_mktCap = mkt_cap > 10000
        # any small caps, medium purchase medium caps, large purchase large cap
        if isStock(row_one) and small_mktCap:
            imp_trade = True
            cap_string = 'small'
        elif isStock(row_one) and medium_mktCap and value[0] >= 50000:
            imp_trade = True
            cap_string = 'medium'
        elif isStock(row_one) and large_mktCap and value[0] >= 100000:
            imp_trade = True
            cap_string = 'large'

        if imp_trade:
            url = 'https://finance.yahoo.com/quote/{}/'.format(ticker)
            trade_dict = {
                'trade date' : trade_date,
                'file date' : file_date,
                'senator' : senator,
                'trade' : trade,
                'trade type' : trade_type,
                'value' : value,
                'mkt cap' : cap_string,
                'yahoo finance' : url,
            }
            all_trades.append(trade_dict)
    return all_trades

In [None]:
important_trades_all = scrapeImportantTrades()

In [None]:
important_trades_today = scrapeImportantTrades(onlyToday=True)
if len(important_trades_today) != 0:
    print(important_trades_today)
else:
    print('no important trades today.')

no important trades today.


## Email Appearance

In [273]:
def cleanText(trades_list):
    trades_for_txt = []
    for t in trades_list:
        trade_date = str(t['trade date']) + ' (' + str((
                datetime.today().date() - datetime.strptime(
                    t['trade date'], '%Y-%m-%d'
                ).date()
            )).split(',')[0] + ' ago)'

        value_string = '$' + (
            "{:,}".format(t['value'][0])
        ) + ' to $' + (
            "{:,}".format(t['value'][1])
        )

        if t['mkt cap'] == 'small':
            mkt_cap_string = 'Small Cap (Under $2B)'
        elif t['mkt cap'] == 'medium':
            mkt_cap_string = 'Medium Cap ($2B to $10B)'
        else:
            mkt_cap_string = 'Large Cap (Over $10B)'

        list_of_titles_urls = getNewsUrlsTitles(
            getArticleTextFromUrl(
                cleanNewsURLQuery(
                    t['trade']
                )
            )
        )
        # debug weird behavior for third ECOM trade
        # first two print the same thing fine, last one prints empty url and title
        # print(getTicker(t['trade']))
        # for i in list_of_titles_urls:
        #     for key,value in i.items():
        #         print(key, ':', value)
        if len(list_of_titles_urls) > 1:
            trades_for_txt.append(
                {
                    'Trade Date' : trade_date,
                    'File Date' : t['file date'],
                    'Senator' : t['senator'],
                    'Equity' : t['trade'],
                    'Trade Value' : value_string,
                    'Market Cap' : mkt_cap_string,
                    'Yahoo!' : t['yahoo finance'],
                    list_of_titles_urls[0]['title'] : list_of_titles_urls[0]['url'],
                    list_of_titles_urls[1]['title'] : list_of_titles_urls[1]['url'],
                    list_of_titles_urls[2]['title'] : list_of_titles_urls[2]['url']
                }
            )
        else:
            trades_for_txt.append(
                {
                    'Trade Date' : trade_date,
                    'File Date' : t['file date'],
                    'Senator' : t['senator'],
                    'Equity' : t['trade'],
                    'Trade Value' : value_string,
                    'Market Cap' : mkt_cap_string,
                    'Yahoo!' : t['yahoo finance']
                }
            )
    return trades_for_txt

In [274]:
cleaned_trades = cleanText(important_trades_all)
for key,value in cleaned_trades[0].items():
    if key == 'Yahoo!':
        print(value)
        continue
    print(key, ':', value)

Trade Date : 2022-03-30 (26 days ago)
File Date : 2022-04-08
Senator : Thomas H Tuberville [Tuberville, Tommy]
Equity : Limestone Bancorp, Inc. - Common Stock [LMST]
Trade Value : $1,001 to $15,000
Market Cap : Small Cap (Under $2B)
https://finance.yahoo.com/quote/LMST/
Limestone Bancorp : Reports Net Income of $3.6 million, or $0.47 per Diluted Share, for the 1st Quarter of 2022 - Form 8-K : news.google.com/articles/CBMitwFodHRwczovL3d3dy5tYXJrZXRzY3JlZW5lci5jb20vcXVvdGUvc3RvY2svTElNRVNUT05FLUJBTkNPUlAtSU5DLTQ0MzE1ODQ2L25ld3MvTGltZXN0b25lLUJhbmNvcnAtUmVwb3J0cy1OZXQtSW5jb21lLW9mLTMtNi1taWxsaW9uLW9yLTAtNDctcGVyLURpbHV0ZWQtU2hhcmUtZm9yLXRoZS0xc3QtUXVhci00MDEwMjUwNy_SAbsBaHR0cHM6Ly93d3cubWFya2V0c2NyZWVuZXIuY29tL2FtcC9xdW90ZS9zdG9jay9MSU1FU1RPTkUtQkFOQ09SUC1JTkMtNDQzMTU4NDYvbmV3cy9MaW1lc3RvbmUtQmFuY29ycC1SZXBvcnRzLU5ldC1JbmNvbWUtb2YtMy02LW1pbGxpb24tb3ItMC00Ny1wZXItRGlsdXRlZC1TaGFyZS1mb3ItdGhlLTFzdC1RdWFyLTQwMTAyNTA3Lw?hl=en-US&amp;gl=US&amp;ceid=US%3Aen
Limestone Bancorp, Inc. Declares Fir

In [275]:
writeToFile(cleaned_trades)

In [276]:
sendEmail(toList = True)

major trade found.
mail sent.


## Getting Links to Google News Articles

In [None]:
# takes trade line and converts to url formatting
def cleanNewsURLQuery(trade):
    return (
        'https://news.google.com/search?q={}&hl=en-US&gl=US&ceid=US%3Aen'.format(
            trade.replace(' ', '%20').replace(',', '').replace('[','%5B').replace(']','%5D')
            )
        )

In [None]:
def getArticleTextFromUrl(url):
    soup = getHTML(url)
    articles = soup.find_all('article')
    return str(articles)

In [None]:
def findNOccurrence(str, sub, n):
    val = -1
    for i in range(0,n):
        val = str.find(sub, val + 1)
    return val

In [249]:
# get urls of first 3 news stories 
def getNewsUrlsTitles(full_article_string):
    list_of_urls_titles = []
    list_of_articles = full_article_string.split('</article>')
    i = 0
    for a in list_of_articles:
        if i > 2:
            break
        start_ind = findNOccurrence(
            a, 'href', 2
        )
        slice = a[start_ind:]
        url = slice[slice.find('articles'):slice.find('">')]
        title = slice[slice.find('>'):slice.find('<')][1:]
        list_of_urls_titles.append(
            {
                'title':title,
                'url':'news.google.com/{}'.format(url)
            }
        )
        i += 1
    return list_of_urls_titles

In [None]:
all_articles = []
for t in important_trades_all:
    url = cleanNewsURLQuery(t['trade'])
    all_articles.append(
        getNewsUrlsTitles(getArticleTextFromUrl(url))
    )

In [252]:
all_articles[0][0]['title']

'Limestone Bancorp Reports Net Income of $3.6 million, or $0.47 per Diluted Share, for the 1st Quarter of 2022'