<a href="https://colab.research.google.com/github/zzyy-gh/stonks_scrape/blob/main/stonks_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# scrape stock data daily from finviz
# data: tkr, pb, sgqq, s, f, (sf), si, sti, fsh, de, opm, prfm, weekc, monc, quartc, halfc, yearc, weekv, monv, p, dayc, v
# run on UTC time 0000, 4 hours after NYC: use yesterday's date in utc to get latest market data
import requests
from datetime import datetime, timedelta
import time
import multiprocessing
import json
from bs4 import BeautifulSoup

# ---------- functions ----------

def cleanData(rows):
    """Convert scraped screener table rows into a list of put-request items.

    Args:
        rows: iterable of objects exposing ``find_all('td')`` whose results
            have a ``.text`` attribute (e.g. BeautifulSoup ``tr`` tags). Each
            row is indexed up to cell 20, in the order listed in ``columns``.

    Returns:
        A list of ``{'PutRequest': {'Item': {...}}}`` dictionaries, one per
        row. All values are strings. A field is left out when its cell holds
        the placeholder ``'-'`` or cannot be converted. ``'sf'`` is the ratio
        of the cleaned ``'s'`` and ``'f'`` values rounded to two decimals, and
        is only present when both exist and ``'f'`` is non-zero. ``'uuid'``
        and ``'date'`` are derived from yesterday's local date.

    Raises:
        IndexError: if a row has fewer than 21 cells.
    """

    def asText(text):
        # Stored verbatim, even when it is the '-' placeholder.
        return text

    def asSimpleFloat(text):
        return None if text == '-' else text

    def asCommaInt(text):
        # '1,174,601' -> '1174601'
        return None if text == '-' else text.replace(',', '')

    def asPercFloat(text):
        # '25.80%' -> '25.80' (drops the trailing character)
        return None if text == '-' else text[:-1]

    def asMBFloat(text):
        # '303.00M' -> '303000000.0'; only the M and B suffixes are recognised.
        if not text or text == '-':
            return None
        multiplier = {'M': 1000000, 'B': 1000000000}.get(text[-1])
        if multiplier is None:
            return None
        try:
            return str(float(text[:-1]) * multiplier)
        except ValueError:
            return None

    # (field name, converter) for each table cell, in cell order.
    columns = (
        ('tkr', asText),
        ('pb', asSimpleFloat),
        ('sgqq', asPercFloat),
        ('s', asMBFloat),
        ('f', asMBFloat),
        ('si', asPercFloat),
        ('sti', asPercFloat),
        ('fsh', asPercFloat),
        ('de', asSimpleFloat),
        ('opm', asPercFloat),
        ('prfm', asPercFloat),
        ('weekc', asPercFloat),
        ('monc', asPercFloat),
        ('quartc', asPercFloat),
        ('halfc', asPercFloat),
        ('yearc', asPercFloat),
        ('weekv', asPercFloat),
        ('monv', asPercFloat),
        ('p', asSimpleFloat),
        ('dayc', asPercFloat),
        ('v', asCommaInt),
    )

    date = (datetime.now() - timedelta(1)).strftime('%Y%m%d')
    pageCleanList = []

    for row in rows:
        cells = [cell.text for cell in row.find_all('td')]

        fields = {}
        for position, (name, convert) in enumerate(columns):
            value = convert(cells[position])
            if value is not None:
                fields[name] = value

        # The ratio must be computed from the cleaned numeric strings: the raw
        # cells still carry their M/B suffix and cannot be divided.
        numerator = fields.get('s')
        denominator = fields.get('f')
        if numerator and denominator:
            try:
                fields['sf'] = str(round(float(numerator) / float(denominator), 2))
            except (ValueError, ZeroDivisionError):
                pass  # best effort: leave 'sf' out when the ratio is undefined

        fields['uuid'] = date + cells[0]
        fields['date'] = date

        pageCleanList.append({'PutRequest': {'Item': fields}})

    return pageCleanList

def scrape_n_store1(stopPage, finvizUrl1, finvizUrl2, faker, rowIncrement, timeout=30):
    """Scrape screener pages starting at row 1 and print the cleaned items.

    Each page is requested from ``finvizUrl1 + str(row) + finvizUrl2`` with
    ``row`` advancing by ``rowIncrement``. Scraping stops when a page contains
    a ``tab-link`` anchor whose text equals ``stopPage``, when a page has no
    ``next`` link, or when a request fails.

    Args:
        stopPage: text of the ``tab-link`` anchor that marks the last page to
            scrape.
        finvizUrl1: URL prefix placed before the row number.
        finvizUrl2: URL suffix placed after the row number.
        faker: request headers passed to ``requests.get``.
        rowIncrement: amount added to the row number between pages.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang the run; optional, defaults to 30.

    Returns:
        None. Progress, the item count and the items themselves are printed.
    """
    row = 1
    cleanList = []

    # iterate through all the webpages to obtain and clean data
    start = time.time()
    while True:
        url = finvizUrl1 + str(row) + finvizUrl2

        # Only request-level failures are treated as "failed to retrieve";
        # anything else (including Ctrl-C) is allowed to propagate.
        try:
            page = requests.get(url, headers=faker, timeout=timeout)
        except requests.exceptions.RequestException:
            print('Failed to retrieve row ' + str(row) + '. (1)')
            break

        nicePage = BeautifulSoup(page.content, 'html.parser')
        rows = nicePage.find_all('tr', class_='table-dark-row-cp') + nicePage.find_all('tr', class_='table-light-row-cp')

        # A page without data rows is requested once more before giving up.
        if not rows:
            print('Retrieving refreshed page.')
            try:
                page = requests.get(url, headers=faker, timeout=timeout)
            except requests.exceptions.RequestException:
                print('Failed to retrieve refreshed page.')
                return
            nicePage = BeautifulSoup(page.content, 'html.parser')
            rows = nicePage.find_all('tr', class_='table-light-row-cp') + nicePage.find_all('tr', class_='table-dark-row-cp')
        cleanList.extend(cleanData(rows))

        # exit loop if on the last page, else continue to next page
        stopper = nicePage.find_all('a', class_='tab-link', string=stopPage)
        nextBtn = nicePage.find_all('a', class_='tab-link', string='next')
        if stopper or not nextBtn:
            print('Data scraping has ended. The first row number of the last page is ' + str(row) + '. (1)')
            break
        row += rowIncrement
    print(str(time.time() - start) + ' seconds have elapsed for scraping. (1)')

    # nothing more to report if nothing was scraped
    if not cleanList:
        print('No data is scraped. (1)')
        return

    print(len(cleanList))
    print(cleanList)
    return
    
def scrape_n_store2(startIndex, finvizUrl1, finvizUrl2, faker, rowIncrement, timeout=30):
    """Scrape screener pages starting at row ``startIndex`` and print the items.

    Each page is requested from ``finvizUrl1 + str(row) + finvizUrl2`` with
    ``row`` advancing by ``rowIncrement``. Scraping stops when a page has no
    ``next`` link or when a request fails.

    Args:
        startIndex: row number of the first page to request.
        finvizUrl1: URL prefix placed before the row number.
        finvizUrl2: URL suffix placed after the row number.
        faker: request headers passed to ``requests.get``.
        rowIncrement: amount added to the row number between pages.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot hang the run; optional, defaults to 30.

    Returns:
        None. Progress, the item count and the items themselves are printed.
    """
    row = startIndex
    cleanList = []

    # iterate through all the webpages to obtain and clean data
    start = time.time()
    while True:
        url = finvizUrl1 + str(row) + finvizUrl2

        # Only request-level failures are treated as "failed to retrieve";
        # anything else (including Ctrl-C) is allowed to propagate.
        try:
            page = requests.get(url, headers=faker, timeout=timeout)
        except requests.exceptions.RequestException:
            print('Failed to retrieve row ' + str(row) + '. (2)')
            break

        nicePage = BeautifulSoup(page.content, 'html.parser')
        rows = nicePage.find_all('tr', class_='table-dark-row-cp') + nicePage.find_all('tr', class_='table-light-row-cp')

        # A page without data rows is requested once more before giving up.
        if not rows:
            print('Retrieving refreshed page.')
            try:
                page = requests.get(url, headers=faker, timeout=timeout)
            except requests.exceptions.RequestException:
                print('Failed to retrieve refreshed page.')
                return
            nicePage = BeautifulSoup(page.content, 'html.parser')
            rows = nicePage.find_all('tr', class_='table-light-row-cp') + nicePage.find_all('tr', class_='table-dark-row-cp')
        cleanList.extend(cleanData(rows))

        # exit loop if on the last page, else continue to next page
        nextBtn = nicePage.find_all('a', class_='tab-link', string='next')
        if not nextBtn:
            print('Data scraping has ended. The first row number of the last page is ' + str(row) + '. (2)')
            break
        row += rowIncrement
    print(str(time.time() - start) + ' seconds have elapsed for scraping. (2)')

    # nothing more to report if nothing was scraped
    if not cleanList:
        print('No data is scraped. (2)')
        return

    print(len(cleanList))
    print(cleanList)
    return
    
# ---------- main function ----------

def lambda_handler(event, context):
    """Scrape the screener if the market was open yesterday.

    Queries the calendar API for yesterday's month, looks up yesterday's date
    in the returned day list and, when its status is ``'open'``, runs both
    scrape functions. If the calendar cannot be fetched or parsed, a failure
    message is printed and nothing is scraped.

    Args:
        event: unused.
        context: unused.

    Returns:
        None.
    """

    # ---------- settings ----------

    yesterday = datetime.now() - timedelta(1)
    date = yesterday.strftime('%Y-%m-%d')
    month = yesterday.strftime('%m')
    year = yesterday.strftime('%Y')
    calendarApi = 'https://sandbox.tradier.com/v1/markets/calendar'
    # NOTE(review): this credential is hard-coded in source; consider loading
    # it from configuration or the environment instead.
    calendarApiToken = 't0XrEyArrcq6EJZAAbP6zbZDl9FA'
    finvizUrl1 = 'https://finviz.com/screener.ashx?v=152&o=ticker&r='
    finvizUrl2 = '&c=1,11,23,24,25,26,28,30,38,40,41,42,43,44,45,46,50,51,65,66,67'
    faker = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    rowIncrement = 20
    startIndex = 4101
    stopPage = '205'

    # ---------- check if market is open ----------

    # The request, the JSON decoding and the key lookups are the steps that
    # can actually fail, so they all sit inside the try block.
    try:
        calendar = requests.get(calendarApi,
                                headers={'Authorization': calendarApiToken, 'Accept': 'application/json'},
                                params={'month': month, 'year': year},
                                timeout=30)
        dayListInThisMonth = calendar.json()['calendar']['days']['day']
        marketIsOpen = any(
            day['date'] == date and day['status'] == 'open'
            for day in dayListInThisMonth
        )
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
        print('Failed to check market\'s status.')
        return

    # ---------- return function if market is closed ----------

    if not marketIsOpen:
        print('Market is closed.')
        return
    print('Market is open.')

    # ---------- scrape data from finviz ----------

    scrape_n_store1(stopPage, finvizUrl1, finvizUrl2, faker, rowIncrement)
    scrape_n_store2(startIndex, finvizUrl1, finvizUrl2, faker, rowIncrement)

    return


In [None]:
# Run the handler manually; it does not read its event/context arguments,
# so placeholder values are passed.
lambda_handler(1, 2)

Market is open.
Data scraping has ended. The first row number of the last page is 4081. (1)
63.676939249038696 seconds have elapsed for scraping. (1)
4100
[{'PutRequest': {'Item': {'tkr': 'A', 'pb': '10.95', 'sgqq': '25.80', 's': '303000000.0', 'f': '301780000.0', 'si': '0.30', 'sti': '90.00', 'fsh': '1.29', 'de': '0.58', 'opm': '20.40', 'prfm': '16.10', 'weekc': '1.86', 'monc': '14.18', 'quartc': '27.55', 'halfc': '48.95', 'yearc': '87.86', 'weekv': '1.33', 'monv': '1.65', 'p': '178.73', 'dayc': '0.57', 'v': '1174601', 'uuid': '20210908A', 'date': '20210908'}}}, {'PutRequest': {'Item': {'tkr': 'AAAU', 'weekc': '-1.55', 'monc': '3.37', 'quartc': '-5.48', 'halfc': '3.61', 'yearc': '-7.64', 'weekv': '0.73', 'monv': '0.72', 'p': '17.78', 'dayc': '-0.34', 'v': '339341', 'uuid': '20210908AAAU', 'date': '20210908'}}}, {'PutRequest': {'Item': {'tkr': 'AACG', 'pb': '2.74', 'sgqq': '39.40', 's': '33680000.0', 'f': '31700000.0', 'si': '18.70', 'sti': '16.50', 'fsh': '0.19', 'de': '0.00', 'opm': 