# In [None]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType,
                               IntegerType, BooleanType, TimestampType,
                              ArrayType, MapType)
import time
import json
from pyspark.sql.functions import col, udf, from_unixtime, year, month, dayofmonth
# Obtain the Spark session for this job (an already-active session is reused);
# the application is registered under the name "GetStockNews".
spark = (
    SparkSession.builder
    .appName("GetStockNews")
    .getOrCreate()
)

# Explicit schema for one news-article record. Every column is nullable;
# `keywords` and `tickers` are string arrays and `publisher` is a
# string -> string map whose values may be null.
# NOTE(review): "ticket" may be a typo (e.g. for "ticker") — confirm against
# the API response before renaming, since it changes the written columns.
ticker_news_schema = (
    StructType()
    .add("amp_url", StringType(), True)
    .add("article_url", StringType(), True)
    .add("author", StringType(), True)
    .add("description", StringType(), True)
    .add("id", StringType(), True)
    .add("image_url", StringType(), True)
    .add("keywords", ArrayType(StringType()), True)
    .add("published_utc", StringType(), True)
    .add("publisher", MapType(StringType(), StringType(), True), True)
    .add("tickers", ArrayType(StringType()), True)
    .add("ticket", StringType(), True)
    .add("title", StringType(), True)
)

# Page through the news endpoint by following `next_url`, and append each
# batch of articles to a Parquet dataset partitioned by year/month/day of
# `published_utc`.
#
# Requests are paced: after the first request the script waits one window,
# then issues at most PAGES_PER_WINDOW follow-up requests per WINDOW_SECONDS
# (presumably to stay under an API rate limit — confirm against the plan in
# use).
#
# NOTE(review): the API key below is a credential committed to source. It is
# kept so the script behaves as before, but it should be rotated and loaded
# from an environment variable or secret store instead.
API_KEY = "VOYRq_vnd9soI0AhCO8FQpK4auZ6ppc3"
PAGES_PER_WINDOW = 5
WINDOW_SECONDS = 60
REQUEST_TIMEOUT_SECONDS = 30
# NOTE(review): "sock_news" looks like it may be a typo for "stock_news";
# left unchanged because existing data and readers depend on this path.
OUTPUT_PATH = "gs://stocks-pipeline/raw-data/sock_news"

url = ("https://api.polygon.io/v2/reference/news"
       "?ticker.gte=A&published_utc.gte=2023-01-02&limit=1000&apiKey=" + API_KEY)


def _fetch_page(page_url: str) -> dict:
    """Return the decoded JSON body of one news page.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so
            that an error body is reported as such rather than as a missing
            "results" key later on.
        requests.Timeout: if no response arrives within
            REQUEST_TIMEOUT_SECONDS.
    """
    page_response = requests.get(page_url, timeout=REQUEST_TIMEOUT_SECONDS)
    page_response.raise_for_status()
    return page_response.json()


def _write_batch(pages: list) -> None:
    """Append the articles contained in `pages` to the Parquet dataset.

    Does nothing when the pages carry no articles, so no empty write is
    issued.
    """
    articles = [
        article
        for page in pages
        for article in (page.get("results") or [])
    ]
    if not articles:
        return

    batch_df = (
        spark.createDataFrame(articles, schema=ticker_news_schema)
        .withColumn("year", year("published_utc").cast(IntegerType()))
        .withColumn("month", month("published_utc").cast(IntegerType()))
        .withColumn("day", dayofmonth("published_utc").cast(IntegerType()))
    )

    batch_df.repartition("year", "month", "day") \
            .sortWithinPartitions("published_utc") \
            .write \
            .mode("append") \
            .partitionBy("year", "month", "day") \
            .format("parquet") \
            .option("compression", "snappy") \
            .save(OUTPUT_PATH)


try:
    page = _fetch_page(url)
    # Pages fetched but not yet written. Only these are appended on each
    # pass; re-writing the accumulated history in append mode would
    # duplicate every earlier article.
    pending_pages = [page]

    if "next_url" in page:
        # Let the window opened by the first request elapse before paging on.
        time.sleep(WINDOW_SECONDS)

    while True:
        window_start = time.time()

        for _ in range(PAGES_PER_WINDOW):
            if "next_url" not in page:
                break
            page = _fetch_page(page["next_url"] + "&apiKey=" + API_KEY)
            pending_pages.append(page)

        _write_batch(pending_pages)
        pending_pages = []

        if "next_url" not in page:
            print("No data")
            break

        # Pad the pass out to a full window. Clamp at zero because
        # time.sleep() rejects negative values and a slow write can take
        # longer than the window.
        elapsed = time.time() - window_start
        time.sleep(max(0.0, WINDOW_SECONDS - elapsed))
finally:
    # Release the Spark session even if a request or write fails.
    spark.stop()