<a href="https://colab.research.google.com/github/zzyy-gh/stonks-lambda-/blob/main/stonks_info_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Scrape stock data daily from finviz.
# Fields stored per ticker: tkr, n, sec, ind, ctry, ipo
# Run at 00:00 UTC (20:00 in NYC during daylight time, i.e. after the market has closed):
# yesterday's date in UTC is then the date of the latest market data.
import requests
from datetime import datetime, timedelta
import time
from multiprocessing import Process, Manager
import json
import boto3
from bs4 import BeautifulSoup

# DynamoDB client created once at import time; upload() (batch_write_item)
# and lambda_handler() (scan) both use this module-level instance.
client = boto3.client('dynamodb')

# ---------- functions ----------

def cleanData(rows, resultsSet, fsList):
    """Turn scraped screener rows into DynamoDB put requests and plain records.

    Each row is expected to yield six cells, in order: tkr, n, sec, ind,
    ctry, ipo.  A cell whose text is '-' is treated as missing and the
    corresponding field is omitted.  The ipo cell is parsed as '%m/%d/%Y'
    and stored as 'YYYYMMDD'; an unparseable ipo value is omitted as well.
    Every produced record is stamped with yesterday's date ('YYYYMMDD').

    Args:
        rows: iterable of objects exposing ``find_all('td')`` that returns
            cells with a ``.text`` attribute.
        resultsSet: container of tickers that must be skipped.
        fsList: list-like object; one plain ``{field: value}`` dict is
            appended to it for every accepted row (mutated in place).

    Returns:
        A list of ``{'PutRequest': {'Item': {...}}}`` dicts, one per accepted
        row, with values wrapped in DynamoDB 'S' / 'N' type descriptors.
    """
    textFields = ('tkr', 'n', 'sec', 'ind', 'ctry')
    expectedCells = len(textFields) + 1  # five text fields plus the ipo cell
    yesterdayStr = (datetime.now() - timedelta(1)).strftime('%Y%m%d')
    pageCleanList = []

    for row in rows:
        item = [cell.text for cell in row.find_all('td')]

        # A row without all expected cells cannot be mapped to fields; skip
        # it instead of letting an IndexError abort the whole page.
        if len(item) < expectedCells:
            continue

        ticker = item[0]
        if ticker == '-' or ticker in resultsSet:
            continue

        dbItem = {}
        cleanFsItem = {}

        for dataName, data in zip(textFields, item):
            if data != '-':
                dbItem[dataName] = {'S': data}
                cleanFsItem[dataName] = data

        ipoRaw = item[5]
        if ipoRaw != '-':
            try:
                ipo = datetime.strptime(ipoRaw, '%m/%d/%Y').strftime('%Y%m%d')
            except ValueError:
                # Malformed date: keep the record, just without an ipo field.
                pass
            else:
                dbItem['ipo'] = {'N': ipo}
                cleanFsItem['ipo'] = ipo

        dbItem['date'] = {'N': yesterdayStr}
        cleanFsItem['date'] = yesterdayStr

        fsList.append(cleanFsItem)
        pageCleanList.append({'PutRequest': {'Item': dbItem}})

    return pageCleanList
    
def upload(cleanList, maxRetries=5):
    """Write put requests to the DynamoDB table 'ticker_info' in batches.

    The requests are sent with ``client.batch_write_item`` in groups of 25.
    Items the service reports back under 'UnprocessedItems' are collected and
    re-sent, waiting with exponential backoff between rounds, for at most
    ``maxRetries`` additional rounds.  A batch whose call raises is reported
    (its first request and the error are printed) and is not retried.

    Args:
        cleanList: list of ``{'PutRequest': {...}}`` dicts to write.
        maxRetries: maximum number of re-send rounds for unprocessed items.

    Returns:
        None.
    """
    batchSize = 25
    pending = list(cleanList)
    attempt = 0

    while pending:
        unprocList = []

        for offset in range(0, len(pending), batchSize):
            batch = pending[offset:offset + batchSize]
            try:
                response = client.batch_write_item(
                    RequestItems={
                        'ticker_info': batch
                    }
                )
            except Exception as e:
                print(batch[0])
                print(e)
                continue

            unprocessed = response.get('UnprocessedItems') or {}
            if unprocessed:
                print(response)
                unprocList.extend(unprocessed.get('ticker_info', []))

        if not unprocList:
            break

        print(str(len(unprocList)) + ' unprocessed items.')
        if attempt >= maxRetries:
            print('Giving up on ' + str(len(unprocList)) + ' unprocessed items after '
                  + str(maxRetries) + ' retries.')
            break

        # Back off before re-sending so a throttled table gets time to
        # recover instead of being hit again immediately.
        time.sleep(min(0.1 * 2 ** attempt, 5))
        attempt += 1
        pending = unprocList

    return

def endChecker1(nicePage, stopPage):
    """Report whether paging should stop on this page.

    Paging stops when the page contains a 'tab-link' anchor whose text equals
    ``stopPage``, or when it contains no 'tab-link' anchor with text 'next'.

    Args:
        nicePage: parsed page exposing ``find_all``.
        stopPage: anchor text that marks the page to stop at.

    Returns:
        True if paging should stop, otherwise False.
    """
    nextLinks = nicePage.find_all('a', class_='tab-link', string='next')
    stopLinks = nicePage.find_all('a', class_='tab-link', string=stopPage)
    return bool(stopLinks) or not nextLinks

def endChecker2(nicePage, stopPage):
    """Report whether this page is the last one.

    The page counts as last when it has no 'tab-link' anchor with text
    'next'.  ``stopPage`` is accepted only so the signature matches
    endChecker1; it is not used.

    Returns:
        True if there is no 'next' link, otherwise False.
    """
    return not nicePage.find_all('a', class_='tab-link', string='next')
    
def scrape(startIndex, stopPage, finvizUrl1, finvizUrl2, faker, rowIncrement, finalList, resultsSet, fsList):
    """Scrape consecutive screener pages and publish the cleaned rows.

    Pages are requested at ``finvizUrl1 + str(row) + finvizUrl2`` starting at
    ``row = startIndex`` and advancing by ``rowIncrement`` until the
    end-of-range check says to stop or a request fails.  A page that yields
    no table rows is requested once more before being given up on.  Whatever
    has been collected when the loop ends -- including after a failed
    request -- is appended to ``finalList``.

    Args:
        startIndex: row number of the first page to fetch.
        stopPage: 'end' to run until there is no 'next' link (endChecker2);
            any other value is passed to endChecker1 as the stop marker.
        finvizUrl1: URL part placed before the row number.
        finvizUrl2: URL part placed after the row number.
        faker: HTTP headers sent with every request.
        rowIncrement: amount added to the row number per page.
        finalList: list-like object extended with the DynamoDB put requests.
        resultsSet: tickers to skip (forwarded to cleanData).
        fsList: list-like object that cleanData appends plain records to.

    Returns:
        None.
    """
    # data and setup
    row = startIndex
    cleanList = []
    isLastPage = endChecker2 if stopPage == 'end' else endChecker1
    requestTimeout = 30  # seconds; keeps a stalled connection from hanging the worker

    def findRows(parsedPage):
        # Data rows alternate between a dark and a light CSS class.
        return (parsedPage.find_all('tr', class_='table-dark-row-cp')
                + parsedPage.find_all('tr', class_='table-light-row-cp'))

    # iterate through all the webpages to obtain and clean data
    start = time.time()
    while True:

        # fetch webpage
        try:
            url = finvizUrl1 + str(row) + finvizUrl2
            page = requests.get(url, headers=faker, timeout=requestTimeout)
        except Exception as err:
            print('Failed to retrieve row ' + str(row) + '.')
            print(err)
            break

        # get page rows
        nicePage = BeautifulSoup(page.content, 'html.parser')
        rows = findRows(nicePage)

        # if ads pops up, refresh
        if len(rows) == 0:
            print('Retrieving refreshed page of row: ' + str(row) + '.')
            try:
                page = requests.get(url, headers=faker, timeout=requestTimeout)
            except Exception as err:
                print('Failed to retrieve refreshed page of row: ' + str(row) + '.')
                print(err)
                # Leave the loop (not the function) so the rows collected so
                # far are still handed to finalList below.
                break
            nicePage = BeautifulSoup(page.content, 'html.parser')
            rows = findRows(nicePage)

        if len(rows) == 0:
            print('Possibly screwed by ads. (page of row: ' + str(row) + ')')
        else:
            cleanList.extend(cleanData(rows, resultsSet, fsList))

        # exit loop if on the last page, else continue to next page
        if isLastPage(nicePage, stopPage):
            print('Data scraping has ended. The first row number of the last page is ' + str(row) + '.')
            break
        row += rowIncrement

    print(str(time.time() - start) + ' seconds have elapsed for scraping. (last row ' + str(row) + ')')

    # report when nothing is scraped (finalList is then extended with nothing)
    if len(cleanList) == 0:
        print('No data is scraped. (for startIndex: ' + str(startIndex) + ')')

    finalList.extend(cleanList)
    return

def uploadFs(prepList, fsHttps):
    """POST the scraped records as JSON to the given HTTPS endpoint.

    The request body is ``{"data": prepList}``.  On success the response
    content and the elapsed time are printed.  A failed request is reported
    on stdout and otherwise ignored, so the caller is never interrupted.

    Args:
        prepList: JSON-serialisable list of records.
        fsHttps: URL of the endpoint to post to.

    Returns:
        None.
    """
    payload = json.dumps({
        'data': prepList
    })
    headers = {
      'Content-Type': 'application/json'
    }

    start = time.time()
    try:
        response = requests.request("POST", fsHttps, headers=headers, data=payload)
    except Exception as err:
        # Best-effort upload: report the failure instead of hiding it.
        print('Firestore upload failed: ' + str(err))
        return

    print(str(response.content))
    print(str(time.time() - start) + ' seconds have elapsed for firestore upload.')
    return
    
# ---------- main function ----------

def lambda_handler(event, context):
    """Entry point: scrape new tickers and upload them if the market was open.

    Steps:
      1. Ask the market calendar API whether yesterday was an open day;
         return immediately if it was not or if the check failed.
      2. Scan the DynamoDB table 'ticker_meta' for the tickers already
         stored; return if the scan keeps failing.
      3. Run two scrape() processes over two row ranges in parallel.
      4. If anything was scraped, run upload() and uploadFs() in parallel.

    Args:
        event: unused.
        context: unused.

    Returns:
        None.
    """

    # ---------- configuration ----------

    yesterday = datetime.now() - timedelta(1)
    calendarApi = 'https://sandbox.tradier.com/v1/markets/calendar'
    # NOTE(review): credential is hard-coded in source; consider loading it
    # from the environment or a secrets store.
    calendarApiToken = 't0XrEyArrcq6EJZAAbP6zbZDl9FA'
    fsHttps = 'https://asia-southeast2-stonks-810ca.cloudfunctions.net/addSrapedInfo'
    finvizUrl1 = 'https://finviz.com/screener.ashx?v=152&o=ticker&r='
    finvizUrl2 = '&c=1,2,3,4,5,70'
    faker = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    rowIncrement = 20
    startIndex1 = 1
    startIndex2 = 4101
    stopPage1 = '205'
    stopPage2 = 'end'
    # NOTE(review): existing tickers are read from 'ticker_meta' while
    # upload() writes to 'ticker_info' -- confirm this is intended.
    table_name = 'ticker_meta'

    # ---------- check if market is open, else return ----------

    if not _marketWasOpen(calendarApi, calendarApiToken, yesterday):
        return

    # ---------- check existing tkr ----------

    resultsSet = _loadKnownTickers(table_name)
    if resultsSet is None:
        return
    print(str(len(resultsSet)) + ' tickers in db.')

    # ---------- scrape and upload data from finviz ----------

    with Manager() as manager:
        # run both scrapers in parallel, each covering its own row range
        start = time.time()
        finalList = manager.list()
        fsList = manager.list()
        scrapers = [
            Process(target=scrape, args=(startIndex1, stopPage1, finvizUrl1, finvizUrl2, faker, rowIncrement, finalList, resultsSet, fsList)),
            Process(target=scrape, args=(startIndex2, stopPage2, finvizUrl1, finvizUrl2, faker, rowIncrement, finalList, resultsSet, fsList)),
        ]
        for proc in scrapers:
            proc.start()
        for proc in scrapers:
            proc.join()
        print(str(time.time() - start) + ' seconds have elapsed for scraping in total.')

        # upload to dynamodb and firestore
        print(str(len(finalList)) + ' rows to be uploaded.')
        if len(finalList) == 0:
            return
        start = time.time()
        # Hand the upload processes plain copies rather than manager proxies.
        prepFinalList = finalList._getvalue()
        prepList = fsList._getvalue()
        uploaders = [
            Process(target=upload, args=(prepFinalList, )),
            Process(target=uploadFs, args=(prepList, fsHttps, )),
        ]
        for proc in uploaders:
            proc.start()
        for proc in uploaders:
            proc.join()
        print(str(time.time() - start) + ' seconds have elapsed for both uploads.')

    return


def _marketWasOpen(calendarApi, calendarApiToken, day):
    """Return True if the calendar API lists ``day`` with status 'open'.

    Any failure (request error, unexpected response shape) is reported and
    treated as "not open".
    """
    date = day.strftime('%Y-%m-%d')
    try:
        calendar = requests.get(calendarApi,
                                headers={'Authorization': calendarApiToken, 'Accept': 'application/json'},
                                params={'month': day.strftime('%m'), 'year': day.strftime('%Y')},
                                timeout=30)
        dayListInThisMonth = calendar.json()['calendar']['days']['day']
        marketIsOpen = any(entry['date'] == date and entry['status'] == 'open'
                           for entry in dayListInThisMonth)
    except Exception as err:
        print('Failed to check market\'s status.')
        print(err)
        return False

    if marketIsOpen:
        print('Market is open.')
    else:
        print('Market is closed.')
    return marketIsOpen


def _loadKnownTickers(table_name, maxRetries=3):
    """Scan ``table_name`` and return the set of stored 'tkr' values.

    The scan is paginated via 'LastEvaluatedKey'.  A failing page is retried
    from the same position; after more than ``maxRetries`` consecutive
    failures the scan is abandoned and None is returned.
    """
    scanArgs = {
        'TableName': table_name,
        'ExpressionAttributeNames': {
            '#t': 'tkr',
        },
        'ProjectionExpression': '#t',
    }
    tickers = set()
    failures = 0

    while True:
        try:
            response = client.scan(**scanArgs)
        except Exception as err:
            failures += 1
            print('aww shit')
            print(err)
            if failures > maxRetries:
                print('Giving up on scanning ' + table_name + '.')
                return None
            time.sleep(failures)
            continue

        failures = 0
        tickers.update(entry['tkr']['S'] for entry in response['Items'])

        last_evaluated_key = response.get('LastEvaluatedKey')
        if not last_evaluated_key:
            return tickers
        scanArgs['ExclusiveStartKey'] = last_evaluated_key
