<a href="https://colab.research.google.com/github/zzyy-gh/stonks-lambda-/blob/main/stonks_info_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# scrape stock data daily from finviz
# data: tkr, n, sec, ind, ctry, ipo
# Runs at 00:00 UTC (UTC is 4 hours ahead of New York), so yesterday's UTC date is used to get the latest market data.
import requests
from datetime import datetime, timedelta
import time
import multiprocessing
import json
import boto3
from bs4 import BeautifulSoup

# Shared DynamoDB client, created once when the module is imported;
# upload() sends its batch writes through it.
client = boto3.client('dynamodb')

# ---------- functions ----------

def cleanData(nicePage, url, faker):
    """Turn one parsed screener page into DynamoDB ``PutRequest`` items.

    Rows are read from ``nicePage``: every ``table-light-row-cp`` row first,
    then every ``table-dark-row-cp`` row.  When the page contains no such
    rows it is fetched once more from ``url`` and parsed again.

    Scanning stops at the first row whose IPO date (6th cell, ``%m/%d/%Y``)
    is older than yesterday, returning whatever was collected so far.  Rows
    whose IPO cell cannot be parsed (for example ``'-'``) are kept, only
    without an ``ipo`` attribute.

    Args:
        nicePage: parsed page exposing ``find_all`` (a BeautifulSoup tree).
        url: address of the page; used only for the one-off re-fetch.
        faker: HTTP headers passed to ``requests.get`` on the re-fetch.

    Returns:
        A list of ``{'PutRequest': {'Item': {...}}}`` dicts.  Empty when no
        row qualified or when the re-fetch failed.
    """
    rowClasses = ('table-light-row-cp', 'table-dark-row-cp')
    textKeys = ('tkr', 'n', 'sec', 'ind', 'ctry')

    def findRows(page):
        # NOTE(review): rows come back grouped by CSS class, not in page
        # order, and the early return below depends on that order -- confirm
        # this is intended.
        return [row for cls in rowClasses for row in page.find_all('tr', class_=cls)]

    pageCleanList = []
    rows = findRows(nicePage)

    # if ads pops up, refresh
    if not rows:
        try:
            print('Retrieving refreshed page.')
            page = requests.get(url, headers=faker)
        except requests.RequestException:
            print('Failed to retrieve refreshed page.')
            return pageCleanList
        rows = findRows(BeautifulSoup(page.content, 'html.parser'))

    # Computed once, up front: it used to be assigned only after a successful
    # IPO-date parse, so a first row with an unparsable date crashed below.
    yesterday = datetime.now() - timedelta(1)
    dateStamp = yesterday.strftime('%Y%m%d')

    for row in rows:
        item = [cell.text for cell in row.find_all('td')]
        if len(item) < 6:
            # Not a data row with the six expected columns; nothing to store.
            print('Skipping malformed row.')
            continue

        try:
            ipoDate = datetime.strptime(item[5], '%m/%d/%Y')
        except ValueError:
            # '-' or any other non-date text: keep the row, omit 'ipo'.
            ipoDate = None

        if ipoDate is not None:
            if ipoDate.date() < yesterday.date():
                print('No new IPO.')
                return pageCleanList
            print('New stock!')

        # '-' is the placeholder for a missing value; such attributes are
        # left out of the item entirely.
        fields = {key: {'S': value} for key, value in zip(textKeys, item) if value != '-'}
        if ipoDate is not None:
            fields['ipo'] = {'N': ipoDate.strftime('%Y%m%d')}
        fields['date'] = {'N': dateStamp}

        pageCleanList.append({'PutRequest': {'Item': fields}})

    return pageCleanList
    
def upload(cleanList):
    """Write ``cleanList`` to the ``ticker_info`` table in batches of 25.

    Each batch goes through ``client.batch_write_item``.  Items the service
    hands back under ``UnprocessedItems`` are re-sent a few times with a
    short, growing pause instead of being dropped.  A failing batch is
    reported and skipped so the remaining batches are still attempted.

    Args:
        cleanList: list of ``{'PutRequest': {'Item': {...}}}`` dicts.

    Returns:
        None.
    """
    tableName = 'ticker_info'
    batchSize = 25      # chunk size the original code used per batch_write_item call
    maxAttempts = 5     # bounded so a persistently throttled table cannot stall the run

    for offset in range(0, len(cleanList), batchSize):
        pending = {tableName: cleanList[offset:offset + batchSize]}
        for attempt in range(1, maxAttempts + 1):
            try:
                response = client.batch_write_item(RequestItems=pending)
            except Exception as err:
                # Best effort, as before: report and move on to the next batch.
                print('Whoops! ' + repr(err))
                break
            # Anything left under UnprocessedItems was NOT written; it is
            # already in RequestItems format, so it can be re-sent as is.
            pending = response.get('UnprocessedItems') or {}
            if not pending:
                break
            if attempt < maxAttempts:
                time.sleep(min(0.1 * 2 ** attempt, 2))
        else:
            # All attempts used and the service still returned leftovers.
            leftover = sum(len(requests_) for requests_ in pending.values())
            print(str(leftover) + ' items were left unprocessed.')
    return

def scrape_n_store(finvizUrl1, finvizUrl2, faker, rowIncrement):
    """Page through the screener, collect cleaned rows and upload them.

    The page URL is ``finvizUrl1 + str(row) + finvizUrl2`` where ``row``
    starts at 1 and grows by ``rowIncrement`` per page.  Paging stops when a
    request fails, when a page yields no items, or when the page has no
    ``next`` link.  Whatever was collected up to that point is uploaded.

    Args:
        finvizUrl1: URL prefix, up to the first-row number.
        finvizUrl2: URL suffix, after the first-row number.
        faker: HTTP headers sent with every request.
        rowIncrement: amount added to the first-row number for each new page.

    Returns:
        None.
    """
    row = 1
    cleanList = []

    # iterate through all the webpages to obtain and clean data
    start = time.time()
    while True:
        url = finvizUrl1 + str(row) + finvizUrl2

        # fetch webpage
        try:
            page = requests.get(url, headers=faker)
        except requests.RequestException:
            print('Failed to retrieve row ' + str(row) + '. (1)')
            break

        # get clean data
        nicePage = BeautifulSoup(page.content, 'html.parser')
        tempCleanList = cleanData(nicePage, url, faker)
        if not tempCleanList:
            if not cleanList:
                print('No new stocks.')
                return
            # Earlier pages did produce items.  Returning here used to throw
            # them away without uploading, so only stop paging instead.
            break
        cleanList.extend(tempCleanList)

        # exit loop if on the last page, else continue to next page
        if not nicePage.find_all('a', class_='tab-link', string='next'):
            print('Data scraping has ended. The first row number of the last page is ' + str(row) + '. (1)')
            break
        row += rowIncrement
    print(str(time.time() - start) + ' seconds have elapsed for scraping. (1)')

    # return function if nothing is scraped
    if not cleanList:
        print('No data is scraped. (1)')
        return

    # upload to dynamodb
    print(str(len(cleanList)) + ' rows to be added to dynamodb. (1)')
    start = time.time()
    upload(cleanList)
    print(str(time.time() - start) + ' seconds have elapsed for uploading. (1)')
    
# ---------- main function ----------

def lambda_handler(event, context):
    """Entry point: scrape and store only if the market was open yesterday.

    Asks the calendar API for the month containing yesterday's date and looks
    for that date with status ``'open'``.  If the market was closed, or the
    check itself fails for any reason, the function returns without scraping.

    Args:
        event: not used.
        context: not used.

    Returns:
        None.
    """
    # ---------- global variables ----------

    marketIsOpen = False
    yesterday = datetime.now() - timedelta(1)
    date = yesterday.strftime('%Y-%m-%d')
    month = yesterday.strftime('%m')
    year = yesterday.strftime('%Y')
    calendarApi = 'https://sandbox.tradier.com/v1/markets/calendar'
    # NOTE(review): this credential is hard-coded in source and is sent below
    # without a 'Bearer ' scheme prefix -- confirm against the API's
    # authentication docs and move it to configuration / a secret store.
    calendarApiToken = 't0XrEyArrcq6EJZAAbP6zbZDl9FA'
    finvizUrl1 = 'https://finviz.com/screener.ashx?v=152&o=ticker&r='
    finvizUrl2 = '&c=1,2,3,4,5,70'
    faker = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    rowIncrement = 20

    # ---------- check if market is open ----------

    # The request and the JSON unpacking are the steps that can actually fail
    # (network error, non-JSON body, unexpected payload shape), so they live
    # inside the try; previously they sat outside it and crashed the handler.
    try:
        calendar = requests.get(calendarApi,
                                headers={'Authorization': calendarApiToken, 'Accept': 'application/json'},
                                params={'month': month, 'year': year})
        dayListInThisMonth = calendar.json()['calendar']['days']['day']
        marketIsOpen = any(
            day['date'] == date and day['status'] == 'open'
            for day in dayListInThisMonth
        )
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        print('Failed to check market\'s status.')
        print(repr(err))
    else:
        if marketIsOpen:
            print('Market is open.')
        else:
            print('Market is closed.')

    # ---------- return function if market is closed ----------

    if not marketIsOpen:
        return

    # ---------- scrape data from finviz ----------

    scrape_n_store(finvizUrl1, finvizUrl2, faker, rowIncrement)

    return
