# Extract training data from Reddit

This notebook uses Cloud Secret Manager to import an API key into a Vertex AI pipeline.

## Install all dependencies

In [1]:
! pip install google-cloud-secret-manager google-cloud-aiplatform kfp google-cloud-pipeline-components praw --upgrade



### Set project information

In [2]:
# Get your GCP project id from gcloud
shell_output=!gcloud config list --format 'value(core.project)' 2>/dev/null
PROJECT_ID=shell_output[0]
print("Project ID: ", PROJECT_ID)

Project ID:  fantasymaps-334622


### Set IAM permissions on your service account

`secretmanager.versions.access`

## Store your API key in Cloud Secret Manager

Although you can [create a new secret in Cloud Secret Manager programmatically](https://cloud.google.com/secret-manager/docs/creating-and-accessing-secrets#create), in this notebook you must create it using the Cloud Console.

To create a new secret in the Cloud Console, do the following:

  1. Open the [Cloud Console](https://console.cloud.google.com/security/secret-manager).
  1. Click **Create secret**.
  1. In the **Create secret** page, do the following:
     
     + Give your secret a memorable name. This notebook uses the Reddit API, so the name of the secret
       is 'reddit-api-key'.
     + Upload the credentials file. In this example, the `client_id`, `secret`, and `user_agent` credentials
       provided by Reddit are stored as JSON in a single file.
  
  1. Click **Create secret** at the bottom of the page.
  

## Access the key programmatically

In [None]:
from google.cloud import secretmanager
import json

client = secretmanager.SecretManagerServiceClient()

secret_resource_name = f"projects/{PROJECT_ID}/secrets/reddit-api-key/versions/1"

response = client.access_secret_version(request={"name": secret_resource_name})

payload = response.payload.data.decode("UTF-8")

reddit_key_json = json.loads(payload)

### Construct a request to the Reddit API

In [None]:
import praw

reddit = praw.Reddit(client_id=reddit_key_json["client_id"], 
                     client_secret=reddit_key_json["secret"],
                     user_agent=reddit_key_json["user_agent"])
print(f'Reddit is in read-only mode: {reddit.read_only}')

In [None]:
import numpy as np
import pandas as pd

nan_value = float("NaN")
sciatica_sub = "sciatica"

In [None]:
posts = reddit.subreddit(sciatica_sub).hot(limit=100)
filtered_posts = [[s.title, s.selftext, s.id] for s in posts]

filtered_posts = np.array(filtered_posts)
reddit_posts_df = pd.DataFrame(filtered_posts,
                               columns=['Title', 'Posts', 'ID'])

# Drop all the rows with empty values
reddit_posts_df.replace("", nan_value, inplace=True)
reddit_posts_df = reddit_posts_df[reddit_posts_df.Posts != nan_value]


# Print 
reddit_posts_df.head(10)

print(reddit_posts_df.iloc[8]['Title'])

In [None]:
from typing import NamedTuple
from google.cloud import secretmanager
import json

def get_google_cloud_credentials():
    from google import auth
    creds, project = auth.default()

    LocalCredentials = NamedTuple("LocalCredentials",
    [
        ("creds", str),
        ("project", str),
    ])
    return LocalCredentials(creds, project)

local_creds = get_google_cloud_credentials()

client = secretmanager.SecretManagerServiceClient(credentials=local_creds.creds)

secret_resource_name = f"projects/{local_creds.project}/secrets/reddit-api-key/versions/1"
response = client.access_secret_version(request={"name": secret_resource_name})
payload = response.payload.data.decode("UTF-8")

print(json.loads(payload))

## Troubleshoot Reddit component code

In [91]:
secret_name = "reddit-api-key"
collection_name = "FantasyMapsTest"
gcs_bucket_name = "fantasy-maps"
gcs_prefix_name = "ScrapedData"
subreddit_name = "battlemaps"

In [40]:
from datetime import datetime
import numpy as np
import pandas as pd
import praw
import re

from google.cloud import storage

def get_reddit_credentials(project_id):
    from google.cloud import secretmanager
    import json

    client = secretmanager.SecretManagerServiceClient()

    secret_resource_name = f"projects/{project_id}/secrets/{secret_name}/versions/1"
    response = client.access_secret_version(request={"name": secret_resource_name})
    payload = response.payload.data.decode("UTF-8")

    return json.loads(payload)

def get_reddit_posts(reddit_credentials):
    import praw

    reddit = praw.Reddit(client_id=reddit_credentials["client_id"], 
                 client_secret=reddit_credentials["secret"],
                 user_agent=reddit_credentials["user_agent"])
    print(f"Reddit is in read-only mode: {reddit.read_only}")
    return reddit.subreddit(subreddit_name).hot(limit=100)

nan_value = float("NaN")

print(f"Project ID is: {project_id}")

# Get the data from Reddit
credentials = get_reddit_credentials(project_id)
posts = get_reddit_posts(credentials)

dim_posts = filter(lambda p: len(re.findall("\d+x\d+", p.title)) > 0, posts)
print(dim_posts)

# Filter the posts the data that we want and store as DataFrame
filtered_posts = [[s.title, s.selftext, s.id, s.url] for s in dim_posts]

filtered_posts = np.array(filtered_posts)
reddit_posts_df = pd.DataFrame(filtered_posts,
                           columns=['Title', 'Post', 'ID', 'URL'])

reddit_posts_df.replace("", nan_value, inplace=True)
reddit_posts_df = reddit_posts_df[reddit_posts_df["Post"] != nan_value]

jpg_df = reddit_posts_df.loc[reddit_posts_df["URL"].str.contains("jpg")]
print(jpg_df.head(10))

# Save the dataframe as CSV in Storage
csv_str = jpg_df.to_csv()

storage_client = storage.Client(project=project_id)
bucket = storage_client.bucket(gcs_bucket_name)

timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

csv_file_uri = f"{gcs_prefix_name}/reddit-scraped-{subreddit_name}-{timestamp}.csv"

file_blob = bucket.blob(csv_file_uri)
file_blob.upload_from_string(csv_str)

print(csv_file_uri)

Project ID is: fantasymaps-334622
Reddit is in read-only mode: True
<filter object at 0x7f58f9cd09d0>
                                                Title  Post      ID  \
0      Saint Miranda's Church [4K] [Gridless] [30x40]   NaN  ul1hnn   
1           Titanite Demon Ruins (Dark Souls) [19x28]   NaN  ukuabl   
2            Solar Geyser Ocean Variant [27x67] [Sea]   NaN  ul3dhb   
3       The Narrows [36x48] [140DPI] [OC] [BattleMap]   NaN  ukyjx0   
4               The dungeon of dreaming books [49x72]   NaN  ukzk2g   
5           Forest Overwatch [18x24] [OC] [Battlemap]   NaN  ul3yxs   
6                       Glowing tropical cave - 60x60   NaN  ukrv8r   
8   How about a floating pirate town in the sky? 2...   NaN  ukq4m0   
9    Sky Ship Docks [80x50] [140DPI] [OC] [BattleMap]   NaN  ukf6sd   
10  Ruined Freeway bridge | post apocalyptic battl...   NaN  ukygsh   

                                    URL  
0   https://i.redd.it/h08lenj959y81.jpg  
1   https://i.redd.it/a28u0itml6

## Troubleshoot Firestore component code

In [3]:
project_id = PROJECT_ID
collection_name = "FantasyMapsTest"
gcs_bucket_name = "fantasy-maps"
gcs_prefix_name = "ScrapedData"
csv_input_file = "ScrapedData/reddit-scraped-20220404223231.csv"

from datetime import datetime
import hashlib
from io import BytesIO
import json
import pandas as pd
from PIL import Image
import regex as re
import requests
import shutil

from google.cloud import firestore
from google.cloud import storage

storage_client = storage.Client(project=project_id)
bucket = storage_client.bucket(gcs_bucket_name)

firestore_client = firestore.Client(project=project_id)
collection_ref = firestore_client.collection(collection_name)

blob = bucket.blob(csv_input_file)
csv_bytes = blob.download_as_string()
csv_buffer = BytesIO(csv_bytes)

jpg_df = pd.read_csv(csv_buffer)

hashes = [None] * len(jpg_df.index)
jpg_df.insert(1, "HashId", hashes, True)
jpg_df.insert(6, "GcsURI", hashes, True)

# Concatenate string of batch prediction inputs
bp_inputs = ""

def make_nice_filename(name):
    regex = "[\s|\(|\"|\)]"
    new_name = re.sub(regex, "_", name)
    new_name = new_name.lower()[:30]
    new_name = new_name.replace("__", "_")
    return f"{new_name}.jpg"


def create_vtt_json(content, title):
    img = Image.open(BytesIO(content))
    w, h = img.size
    
    dims = re.findall("\d+x\d+", title)
    if len(dims) is 0:
        return None
    
    dims = dims[0].split("x")
    
    if len(dims) is not 2:
        return None
    
    cols = int(dims[0])
    rows = int(dims[1])
    
    cell_w = w / rows
    cell_h = h / cols
    if cell_w != cell_h:
        return None
    
    return {
        "cols": cols,
        "rows": rows,
        "imageWidth": w,
        "imageHeight": h,
        "cellOffsetX": 0,
        'cellOffsetY': 0, 
        'cellWidth': cell_w, 
        'cellHeight': cell_h, 
    }

def compute_bboxes(vtt_data):
    bboxes = []
    
    cols = vtt_data["cols"]
    rows = vtt_data["rows"]
    
    for x in range(1, cols):
        for y in range(1, rows):
           x_min_tmp = vtt_data["cellOffsetX"] + (vtt_data["cellWidth"] * x) - 2
           x_max_tmp = x_min_tmp + vtt_data["cellWidth"] + 4
           y_min_tmp = vtt_data["cellOffsetY"] + (vtt_data["cellHeight"] * y) - 2
           y_max_tmp = y_min_tmp + vtt_data["cellHeight"] + 4
            
           x_min_train = x_min_tmp / vtt_data["imageWidth"]
           x_max_train = x_max_tmp / vtt_data["imageWidth"]
           y_min_train = y_min_tmp / vtt_data["imageHeight"]
           y_max_train = y_min_tmp / vtt_data["imageHeight"]
            
           bboxes.append({
               "xMin": x_min_train,
               "yMin": y_min_train,
               "xMax": x_max_train,
               "yMax": y_max_train,
               "displayName": "cell"
           })

    return bboxes

# Iterate over JPG URIs, download them in batches, convert to sha values
for i, r in jpg_df.iterrows():
    jpg_url = r["URL"]
    title = r["Title"]
    
    req = requests.get(jpg_url, stream=True)
    if req.status_code == 200:
        req.raw.decode_content = True
        sha1 = hashlib.sha1()
        jpg_hash = sha1.update(req.content)
        jpg_hash = sha1.hexdigest()
        
        jpg_df["HashId"][i] = jpg_hash
        #print(f"Index {i}, hash {jpg_hash}")
        hashes.append(jpg_hash)
        
        # Try to fetch each document from Firestore. If it does not exist,
        # overwrite and download the image.
        doc_ref = collection_ref.document(jpg_hash)
        doc = doc_ref.get()
        if not doc.exists:
            
            file_name = make_nice_filename(title)
            img_gcs_uri = f"gs://{gcs_bucket_name}/{gcs_prefix_name}/{file_name}"
            blob_name = f"{gcs_prefix_name}/{file_name}"
            
            file_blob = bucket.blob(blob_name)
            image_buffer = BytesIO(req.content)
            
            # Get image grid metadata
            img_data = create_vtt_json(req.content, title)
            print(img_data)
            
            file_blob.upload_from_file(BytesIO(req.content))
            
            data = {
                u"filename": file_name,
                u"gcsURI": img_gcs_uri,
                u"source": gcs_prefix_name,
                u"vtt": img_data,
                u"userId": "None",
            }
            
            if img_data is not None:
                bboxes = compute_bboxes(img_data)
                data["vttData"] = img_data
                data["computedBBoxes"] = bboxes
            
            
            doc_ref.set(data)
            print(f"Set data: {data}")
            bp_inputs += json.dumps({ "content": img_gcs_uri, "mimeType": "image/jpeg"})
            bp_inputs += "\n"

# No fresh JPGs in this scraping; return empty string
if bp_inputs is "":
    # return ""
    print("no inputs")
            
print(f"First ten: {jpg_df.head(10)}")

# Save the batch_predict file
timestamp = datetime.now().strftime("%Y%m%d%H%M%S") 
batch_predict_file_uri = f"gs://{gcs_bucket_name}/{gcs_prefix_name}/bp_input_{timestamp}.jsonl"

bp_blob_name = f"{gcs_prefix_name}/bp_input_{timestamp}.jsonl"
bp_blob = bucket.blob(bp_blob_name)

bp_blob.upload_from_string(bp_inputs)

print(batch_predict_file_uri)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


{'cols': 58, 'rows': 23, 'imageWidth': 1610, 'imageHeight': 4060, 'cellOffsetX': 0, 'cellOffsetY': 0, 'cellWidth': 70.0, 'cellHeight': 70.0}
Set data: {'filename': 'feywild_waterfalls_+_50%_disco.jpg', 'gcsURI': 'gs://fantasy-maps/ScrapedData/feywild_waterfalls_+_50%_disco.jpg', 'source': 'ScrapedData', 'userId': 'None', 'vttData': {'cols': 58, 'rows': 23, 'imageWidth': 1610, 'imageHeight': 4060, 'cellOffsetX': 0, 'cellOffsetY': 0, 'cellWidth': 70.0, 'cellHeight': 70.0}, 'computedBBoxes': [{'xMin': 0.0422360248447205, 'yMin': 0.016748768472906402, 'xMax': 0.08819875776397515, 'yMax': 0.016748768472906402, 'displayName': 'cell'}, {'xMin': 0.0422360248447205, 'yMin': 0.03399014778325123, 'xMax': 0.08819875776397515, 'yMax': 0.03399014778325123, 'displayName': 'cell'}, {'xMin': 0.0422360248447205, 'yMin': 0.05123152709359606, 'xMax': 0.08819875776397515, 'yMax': 0.05123152709359606, 'displayName': 'cell'}, {'xMin': 0.0422360248447205, 'yMin': 0.06847290640394088, 'xMax': 0.088198757763975

## Create a custom Reddit pipelines component

In [123]:
from typing import NamedTuple

import kfp
from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (Artifact, Dataset, Input, InputPath, Model, Output,
                        OutputPath, ClassificationMetrics, Metrics, component)
from kfp.v2.google.client import AIPlatformClient

from google.cloud import aiplatform
from google_cloud_pipeline_components import aiplatform as gcc_aip

In [124]:
@component(packages_to_install=["praw",
                                "google-cloud-secret-manager",
                                "google-cloud-storage",
                                "numpy",
                                "pandas"],
           output_component_file="reddit.yaml")
def reddit(
    secret_name: str,
    subreddit_name: str,
    gcs_bucket_name: str,
    gcs_prefix_name: str,
    project_id: str,
    limit: int,
) -> str:
    from datetime import datetime
    import numpy as np
    import pandas as pd
    import praw
    import re
    
    from google.cloud import storage

    def get_reddit_credentials(project_id):
        from google.cloud import secretmanager
        import json

        client = secretmanager.SecretManagerServiceClient()

        secret_resource_name = f"projects/{project_id}/secrets/{secret_name}/versions/1"
        response = client.access_secret_version(request={"name": secret_resource_name})
        payload = response.payload.data.decode("UTF-8")

        return json.loads(payload)
    
    def get_reddit_posts(reddit_credentials):
        import praw

        reddit = praw.Reddit(client_id=reddit_credentials["client_id"], 
                     client_secret=reddit_credentials["secret"],
                     user_agent=reddit_credentials["user_agent"])
        print(f"Reddit is in read-only mode: {reddit.read_only}")
        return reddit.subreddit(subreddit_name).hot(limit=limit)
    
    nan_value = float("NaN")
    
    print(f"Project ID is: {project_id}")
    
    # Get the data from Reddit
    credentials = get_reddit_credentials(project_id)
    posts = get_reddit_posts(credentials)
    
    dim_posts = filter(lambda p: len(re.findall("\d+x\d+", p.title)) > 0, posts)
    
    # Filter the posts the data that we want and store as DataFrame
    filtered_posts = [[s.title, s.selftext, s.id, s.url] for s in dim_posts]

    filtered_posts = np.array(filtered_posts)
    reddit_posts_df = pd.DataFrame(filtered_posts,
                               columns=['Title', 'Post', 'ID', 'URL'])

    reddit_posts_df.replace("", nan_value, inplace=True)
    reddit_posts_df = reddit_posts_df[reddit_posts_df["Post"] != nan_value]
    
    jpg_df = reddit_posts_df.loc[reddit_posts_df["URL"].str.contains("jpg")]
    jpg_df.head(10)
    
    # Save the dataframe as CSV in Storage
    csv_str = jpg_df.to_csv()
    
    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(gcs_bucket_name)
    
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    
    csv_file_uri = f"{gcs_prefix_name}/reddit-scraped-{subreddit_name}-{timestamp}.csv"
    
    file_blob = bucket.blob(csv_file_uri)
    file_blob.upload_from_string(csv_str)
    
    return csv_file_uri
    

## Create the Firestore component

In [125]:
from typing import NamedTuple

@component(packages_to_install=["Pillow",
                                "google-cloud-firestore",
                                "google-cloud-storage",
                                "numpy",
                                "pandas"],
           output_component_file="firestore.yaml")
def firestore(
    subreddit_name: str,
    collection_name: str,
    gcs_bucket_name: str,
    gcs_prefix_name: str,
    csv_input_file: str,
    project_id: str,
) -> NamedTuple(
    "Outputs",
    [
        ("batch_predict_file_uri", str),
        ("bp_inputs_count", int),
    ]
):
    
    from datetime import datetime
    import hashlib
    from io import BytesIO
    import json
    import pandas as pd
    from PIL import Image
    import re
    import requests
    import shutil

    from google.cloud import firestore
    from google.cloud import storage

    storage_client = storage.Client(project=project_id)
    bucket = storage_client.bucket(gcs_bucket_name)

    firestore_client = firestore.Client(project=project_id)
    collection_ref = firestore_client.collection(collection_name)

    blob = bucket.blob(csv_input_file)
    csv_bytes = blob.download_as_string()
    csv_buffer = BytesIO(csv_bytes)

    jpg_df = pd.read_csv(csv_buffer)

    hashes = [None] * len(jpg_df.index)
    jpg_df.insert(1, "HashId", hashes, True)
    jpg_df.insert(6, "GcsURI", hashes, True)

    # Concatenate string of batch prediction inputs
    bp_inputs = ""
    bp_inputs_count = 0

    def make_nice_filename(name, *, rows=None, cols=None):
        regex = "[\s|\(|\"|\)]"
        new_name = re.sub(regex, "_", name)
        new_name = new_name.lower()[:30]
        new_name = new_name.replace("__", "_")
        
        if rows is not None and cols is not None:
            new_name += f".{cols}x{rows}"
        return f"{new_name}.jpg"


    def create_vtt_json(content, title):
        img = Image.open(BytesIO(content))
        w, h = img.size

        dims = re.findall("\d+x\d+", title)
        if len(dims) is 0:
            return None

        dims = dims[0].split("x")

        if len(dims) is not 2:
            return None

        cols = int(dims[0])
        rows = int(dims[1])

        cell_w = w / rows
        cell_h = h / cols
        if cell_w != cell_h:
            return None

        return {
            "cols": cols,
            "rows": rows,
            "imageWidth": w,
            "imageHeight": h,
            "cellOffsetX": 0,
            'cellOffsetY': 0, 
            'cellWidth': cell_w, 
            'cellHeight': cell_h, 
        }

    def compute_bboxes(vtt_data):
        bboxes = []

        cols = vtt_data["cols"]
        rows = vtt_data["rows"]

        for x in range(1, cols):
            for y in range(1, rows):
               x_min_tmp = vtt_data["cellOffsetX"] + (vtt_data["cellWidth"] * x) - 2
               x_max_tmp = x_min_tmp + vtt_data["cellWidth"] + 4
               y_min_tmp = vtt_data["cellOffsetY"] + (vtt_data["cellHeight"] * y) - 2
               y_max_tmp = y_min_tmp + vtt_data["cellHeight"] + 4

               x_min_train = x_min_tmp / vtt_data["imageWidth"]
               x_max_train = x_max_tmp / vtt_data["imageWidth"]
               y_min_train = y_min_tmp / vtt_data["imageHeight"]
               y_max_train = y_max_tmp / vtt_data["imageHeight"]

               bboxes.append({
                   "xMin": x_min_train,
                   "yMin": y_min_train,
                   "xMax": x_max_train,
                   "yMax": y_max_train,
                   "displayName": "cell"
               })

        return bboxes

    # Iterate over JPG URIs, download them in batches, convert to sha values
    for i, r in jpg_df.iterrows():
        jpg_url = r["URL"]
        title = r["Title"]

        req = requests.get(jpg_url, stream=True)
        if req.status_code == 200:
            req.raw.decode_content = True
            sha1 = hashlib.sha1()
            jpg_hash = sha1.update(req.content)
            jpg_hash = sha1.hexdigest()

            jpg_df["HashId"][i] = jpg_hash
            #print(f"Index {i}, hash {jpg_hash}")
            hashes.append(jpg_hash)

            # Try to fetch each document from Firestore. If it does not exist,
            # overwrite and download the image.
            doc_ref = collection_ref.document(jpg_hash)
            doc = doc_ref.get()
            if not doc.exists:

                img_data = create_vtt_json(req.content, title)
                
                if img_data is not None:
                    file_name = make_nice_filename(title,
                                                   rows=img_data["rows"],
                                                   cols=img_data["cols"])
                else:
                    file_name = make_nice_filename(title)
                
                img_gcs_uri = f"gs://{gcs_bucket_name}/{gcs_prefix_name}/{file_name}"
                blob_name = f"{gcs_prefix_name}/{file_name}"

                file_blob = bucket.blob(blob_name)
                image_buffer = BytesIO(req.content)

                # Get image grid metadata
                #img_data = create_vtt_json(req.content, title)
                print(img_data)

                file_blob.upload_from_file(BytesIO(req.content))

                data = {
                    u"filename": file_name,
                    u"gcsURI": img_gcs_uri,
                    u"source": gcs_prefix_name,
                    u"userId": "None",
                    u"sourceUrl": jpg_url,
                }

                if img_data is not None:
                    bboxes = compute_bboxes(img_data)
                    data["vtt"] = img_data
                    data["computedBBoxes"] = bboxes

                    doc_ref.set(data)
                    print(f"Set data: {data}")
                    bp_inputs += json.dumps({ "content": img_gcs_uri, "mimeType": "image/jpeg"})
                    bp_inputs += "\n"
                    bp_inputs_count = bp_inputs_count + 1

    # No fresh JPGs in this scraping; return empty string
    if bp_inputs is "":
        print("no inputs")
        return ("", 0)

    print(f"First ten: {jpg_df.head(10)}")

    # Save the batch_predict file
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S") 
    batch_predict_file_uri = f"gs://{gcs_bucket_name}/{gcs_prefix_name}/bp_input_{timestamp}.jsonl"

    bp_blob_name = f"{gcs_prefix_name}/bp_input_{timestamp}.jsonl"
    bp_blob = bucket.blob(bp_blob_name)

    bp_blob.upload_from_string(bp_inputs)

    return (batch_predict_file_uri, bp_inputs_count)  

## Build a simple pipeline

In [131]:
GCS_BUCKET = "fantasy-maps"
GCS_PREFIX = "ScrapedData"
MODEL_ID = "7292897899317297152"
LOCATION = "us-central1"
COLLECTION_NAME = "FantasyMaps"
LIMIT=300
print(PROJECT_ID)    

fantasymaps-334622


In [132]:
@dsl.pipeline(
    name="reddit-scraper-pipeline-v2",
    description="Gets data from a subreddit",
    pipeline_root=f"gs://{GCS_BUCKET}/pipeline_root",
)
def reddit_pipeline(
    collection_name: str = COLLECTION_NAME,
    secret_name: str = "reddit-api-key",
    subreddit_name_1: str = "battlemaps",
    subreddit_name_2: str = "FantasyMaps",
    gcs_bucket: str = GCS_BUCKET,
    gcs_prefix: str = GCS_PREFIX,
    project_id: str = PROJECT_ID,
    location: str = LOCATION,
    model_id: str = MODEL_ID,
    limit: int = LIMIT,
):
    
    # First stream of Reddit scraping
    reddit_op_1 = reddit(
        secret_name=secret_name,
        subreddit_name=subreddit_name_1,
        gcs_bucket_name=gcs_bucket,
        gcs_prefix_name=gcs_prefix,
        project_id=project_id,
        limit=limit,
    )
    
    reddit_csv_file_1 = reddit_op_1.output
    
    firestore_op_1 = firestore(
        subreddit_name=subreddit_name_1,
        collection_name=collection_name,
        gcs_bucket_name=gcs_bucket,
        gcs_prefix_name=gcs_prefix,
        csv_input_file=reddit_csv_file_1,
        project_id=project_id,
    )
    
    # Second stream of Reddit scraping
    reddit_op_2 = reddit(
        secret_name=secret_name,
        subreddit_name=subreddit_name_2,
        gcs_bucket_name=gcs_bucket,
        gcs_prefix_name=gcs_prefix,
        project_id=project_id,
        limit=limit,
    )

    reddit_csv_file_2 = reddit_op_2.output
    
    firestore_op_2 = firestore(
        subreddit_name=subreddit_name_2,
        collection_name=collection_name,
        gcs_bucket_name=gcs_bucket,
        gcs_prefix_name=gcs_prefix,
        csv_input_file=reddit_csv_file_2,
        project_id=project_id,
    )

In [133]:
compiler.Compiler().compile(
    pipeline_func=reddit_pipeline, package_path="reddit_scraper_pipeline_job.json"
)

In [134]:
api_client = AIPlatformClient(
    project_id=PROJECT_ID,
    region=LOCATION,
)

In [135]:
response = api_client.create_run_from_job_spec(
    job_spec_path="reddit_scraper_pipeline_job.json",
    enable_caching=False
)