In [1]:
import json
import pandas as pd
import boto3

### Set up AWS services

In [2]:
# Name of the S3 bucket for this project (presumably where the scraped
# data lands, judging by the name -- confirm against the Lambda code)
bucket_name = 'scraped-data-zh'

# Create the bucket through the S3 resource API; the value returned by
# create_bucket is the last expression and therefore the cell's output.
s3 = boto3.resource('s3')
s3.create_bucket(Bucket=bucket_name)

s3.Bucket(name='scraped-data-zh')

In [3]:
# Deploy the scraper Lambda function.
# Clients for IAM and Lambda; the function runs under the existing 'LabRole'.
iam_client = boto3.client('iam')
aws_lambda = boto3.client('lambda')
role = iam_client.get_role(RoleName='LabRole')

# Read the zipped deployment package into memory as raw bytes
with open('proj-deployment-package.zip', 'rb') as f:
    lambda_zip = f.read()

try:
    # First deployment: register the function from the zip archive
    response = aws_lambda.create_function(
        FunctionName='proj_scrape_data',
        Runtime='python3.9',
        Role=role['Role']['Arn'],
        Handler='lambda_function.lambda_handler',
        Code={'ZipFile': lambda_zip},
        Timeout=300,
    )
except aws_lambda.exceptions.ResourceConflictException:
    # A function with this name already exists: upload the zip contents
    # as its new code instead of creating it.
    response = aws_lambda.update_function_code(
        FunctionName='proj_scrape_data',
        ZipFile=lambda_zip,
    )

# Whichever call ran, its response is read for the function ARN
lambda_arn = response['FunctionArn']

In [4]:
# Step Functions client, used by the later cells to create, update and
# run the state machine.
sfn = boto3.client('stepfunctions')

def make_def(lambda_arn):
    """Build the state machine definition for the scraping workflow.

    The definition has a single ``Map`` state whose iterator contains one
    ``Lambda Invoke`` task. The task forwards its input (``"$"``) as the
    Lambda payload, selects ``$.Payload`` as its output, and carries a
    retry policy for the listed Lambda/States errors.

    Parameters
    ----------
    lambda_arn : str
        ARN of the Lambda function the task invokes.

    Returns
    -------
    dict
        The definition as a plain dictionary (serialise with ``json.dumps``
        before handing it to Step Functions).
    """
    # Retry settings: 2 s initial interval, at most 6 attempts, backoff rate 2
    retry_policy = {
        "ErrorEquals": [
            "Lambda.ServiceException",
            "Lambda.AWSLambdaException",
            "Lambda.SdkClientException",
            "Lambda.TooManyRequestsException",
            "States.TaskFailed",
        ],
        "IntervalSeconds": 2,
        "MaxAttempts": 6,
        "BackoffRate": 2,
    }

    # The single task executed inside the Map iterator
    invoke_task = {
        "Type": "Task",
        "Resource": "arn:aws:states:::lambda:invoke",
        "OutputPath": "$.Payload",
        "Parameters": {
            "Payload.$": "$",
            "FunctionName": lambda_arn,
        },
        "Retry": [retry_policy],
        "End": True,
    }

    # Top-level (and only) state of the machine
    map_state = {
        "Type": "Map",
        "End": True,
        "Iterator": {
            "StartAt": "Lambda Invoke",
            "States": {"Lambda Invoke": invoke_task},
        },
    }

    return {
        "Comment": "My State Machine",
        "StartAt": "Map",
        "States": {"Map": map_state},
    }

In [5]:
# Create the Step Functions state machine, or update its definition if a
# machine named 'data_scrape_sm' already exists.
sf_def = make_def(lambda_arn)

try:
    response = sfn.create_state_machine(
        name='data_scrape_sm',
        definition=json.dumps(sf_def),
        roleArn=role['Role']['Arn'],
        type='EXPRESS'
    )
except sfn.exceptions.StateMachineAlreadyExists:
    # Find the existing machine by name. The listing is paginated, so walk
    # every page instead of only looking at the first one.
    state_machine_arn = [sm['stateMachineArn']
                         for page in sfn.get_paginator('list_state_machines').paginate()
                         for sm in page['stateMachines']
                         if sm['name'] == 'data_scrape_sm'][0]
    response = sfn.update_state_machine(
        stateMachineArn=state_machine_arn,
        definition=json.dumps(sf_def),
        roleArn=role['Role']['Arn']
    )
else:
    # Freshly created: take the ARN from the create response so that
    # `state_machine_arn` is defined whichever branch ran.
    state_machine_arn = response['stateMachineArn']

### Use NASDAQ data to determine which stocks to track

In [6]:
# Load the NASDAQ screener export and order the rows by 'Market Cap',
# largest first; the first rows are shown as the cell output.
nasdaq_data = (
    pd.read_csv("nasdaq_screener_1685121443165.csv")
    .sort_values(by='Market Cap', ascending=False)
)
nasdaq_data.head()

Unnamed: 0,Symbol,Name,Last Sale,Net Change,% Change,Market Cap,Country,IPO Year,Volume,Sector,Industry
19,AAPL,Apple Inc. Common Stock,$175.2445,2.2545,1.303%,3038273000000.0,United States,1980.0,25148439,Technology,Computer Manufacturing
4599,MSFT,Microsoft Corporation Common Stock,$331.515,5.595,1.717%,2464976000000.0,United States,1986.0,19276086,Technology,Computer Software: Prepackaged Software
3078,GOOG,Alphabet Inc. Class C Capital Stock,$125.85,1.5,1.206%,1597917000000.0,United States,2004.0,13406022,Technology,Computer Software: Programming Data Processing
3079,GOOGL,Alphabet Inc. Class A Common Stock,$125.115,1.635,1.324%,1588585000000.0,United States,2004.0,17303800,Technology,Computer Software: Programming Data Processing
7145,V,Visa Inc.,$225.815,2.435,1.09%,1441643000000.0,United States,,2191942,Consumer Discretionary,Business Services


In [7]:
# Preview the first 50 ticker symbols of the sorted frame as an array
nasdaq_data["Symbol"].iloc[:50].values

array(['AAPL', 'MSFT', 'GOOG', 'GOOGL', 'V', 'AMZN', 'VZ', 'NVDA',
       'MBINO', 'BRK/A', 'BRK/B', 'MBINP', 'META', 'TSLA', 'MBINN', 'TSM',
       'UNH', 'XOM', 'LLY', 'JNJ', 'JPM', 'WMT', 'NVO', 'MA', 'PG',
       'AVGO', 'HD', 'NTES', 'CVX', 'ASML', 'ORCL', 'MRK', 'KO', 'PEP',
       'ABBV', 'TECK', 'BAC', 'COST', 'CRM', 'PFE', 'BABA', 'MCD', 'NVS',
       'AMD', 'CSCO', 'TMO', 'ACN', 'SHEL', 'TM', 'ADBE'], dtype=object)

In [8]:
# Alternative hand-picked watchlist (disabled, kept for reference):
# symbols = ['AAPL', 'NVDA', 'TSLA', 'AMD', 'JNUG', 'JDST', 'LABU', 'QCOM', 'INTC', 'DGAZ']

# Top 50 symbols on Nasdaq by Market CAP
symbols = nasdaq_data["Symbol"][:50].values

# Number of queries requested per symbol. The batches below read this
# variable, so changing it here is enough to change every batch.
num_queries = 200

# One input item per symbol; the whole list is later serialised as the
# state machine input.
symbol_batches = [{'symbol': s, 'num_queries': num_queries} for s in symbols]
symbol_batches

[{'symbol': 'AAPL', 'num_queries': 200},
 {'symbol': 'MSFT', 'num_queries': 200},
 {'symbol': 'GOOG', 'num_queries': 200},
 {'symbol': 'GOOGL', 'num_queries': 200},
 {'symbol': 'V', 'num_queries': 200},
 {'symbol': 'AMZN', 'num_queries': 200},
 {'symbol': 'VZ', 'num_queries': 200},
 {'symbol': 'NVDA', 'num_queries': 200},
 {'symbol': 'MBINO', 'num_queries': 200},
 {'symbol': 'BRK/A', 'num_queries': 200},
 {'symbol': 'BRK/B', 'num_queries': 200},
 {'symbol': 'MBINP', 'num_queries': 200},
 {'symbol': 'META', 'num_queries': 200},
 {'symbol': 'TSLA', 'num_queries': 200},
 {'symbol': 'MBINN', 'num_queries': 200},
 {'symbol': 'TSM', 'num_queries': 200},
 {'symbol': 'UNH', 'num_queries': 200},
 {'symbol': 'XOM', 'num_queries': 200},
 {'symbol': 'LLY', 'num_queries': 200},
 {'symbol': 'JNJ', 'num_queries': 200},
 {'symbol': 'JPM', 'num_queries': 200},
 {'symbol': 'WMT', 'num_queries': 200},
 {'symbol': 'NVO', 'num_queries': 200},
 {'symbol': 'MA', 'num_queries': 200},
 {'symbol': 'PG', 'num_qu

### Start Scraping

In [9]:
# Get arn for Step Function state machine. The listing is paginated, so
# search every page for the machine named 'data_scrape_sm'.
state_machine_arn = [sm['stateMachineArn']
                     for page in sfn.get_paginator('list_state_machines').paginate()
                     for sm in page['stateMachines']
                     if sm['name'] == 'data_scrape_sm'][0]

# Spread symbol batches across Lambda workers
try:
    response = sfn.start_sync_execution(
        stateMachineArn=state_machine_arn,
        name='data_scrape_res',
        input=json.dumps(symbol_batches)
    )
except Exception as exc:
    # Still best-effort (the notebook keeps running), but the failure is
    # reported instead of silently discarded, and KeyboardInterrupt is no
    # longer swallowed.
    # NOTE(review): the original bare `except: pass` was presumably there
    # because the blocking call can outlive the client's read timeout --
    # confirm, and consider raising the timeout instead.
    print(f"start_sync_execution raised {exc!r}")
else:
    # A synchronous execution reports failure in the response rather than
    # by raising, so check the status explicitly.
    if response.get('status') != 'SUCCEEDED':
        print(
            f"Execution finished with status {response.get('status')!r}: "
            f"{response.get('error')} {response.get('cause')}"
        )