# In [None]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructType, StructField, StringType, IntegerType,
                            BooleanType, DoubleType,TimestampType,
                            DateType, FloatType, LongType)
from datetime import date, timedelta
import json,time
from pyspark.sql.functions import col, udf, from_unixtime, year, month, dayofmonth
from datetime import date, timedelta

# Spark session shared by the rest of the script (created, or reused if one
# already exists).
spark = (
    SparkSession.builder
    .appName("GetAllEnd")
    .getOrCreate()
)

# Calendar range to download; both ends are included when the list of
# business days is built.
start_date = date(2023, 1, 21)
end_date = date(2023, 2, 15)

def get_business_days(start_date, end_date):
    """Return the Monday-to-Friday dates between two dates, inclusive.

    Only the day of the week is considered; public holidays are not
    filtered out.

    Args:
        start_date: First date of the range.
        end_date: Last date of the range.

    Returns:
        A list of dates in ascending order. Empty when ``start_date`` is
        after ``end_date`` or the range contains only weekend days.
    """
    whole_days = (end_date - start_date).days
    every_day = (
        start_date + timedelta(days=offset) for offset in range(whole_days + 1)
    )
    # weekday() numbers Monday as 0, so 5 and 6 are Saturday and Sunday.
    return [day for day in every_day if day.weekday() < 5]

# Weekdays in the configured range; this list is consumed by the download loop.
dates = get_business_days(start_date, end_date)

# Schema applied to each record of the API response. Every field is read as a
# nullable string. The columns are later renamed positionally, so the order
# here must stay aligned with new_column_names:
#   T -> ticker, v -> volume, vw -> volume_weighted, o -> opening_price,
#   c -> closing_price, h -> highest_price, l -> lowest_price,
#   t -> unix_time, n -> no_of_transactions
tickers_daily_schema = StructType([
    StructField(field_name, StringType(), nullable=True)
    for field_name in ("T", "v", "vw", "o", "c", "h", "l", "t", "n")
])

def api_call(date, timeout=60):
    """Fetch the grouped daily aggregates for US stocks for one day.

    Args:
        date: Day to request. It is interpolated into the URL path with
            ``str()`` formatting, so a ``datetime.date`` (rendered as
            ``YYYY-MM-DD``) works.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The value of the ``"results"`` entry of the JSON response, or ``0``
        when the body is not valid JSON or has no ``"results"`` entry.
        Callers rely on the ``0`` sentinel.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...). This is intentionally not swallowed.
    """
    # SECURITY(review): the API key is hard-coded in source. It should be moved
    # to configuration / a secret store and the exposed key rotated.
    url = f"https://api.polygon.io/v2/aggs/grouped/locale/us/market/stocks/{date}?adjusted=true&apiKey=VOYRq_vnd9soI0AhCO8FQpK4auZ6ppc3"
    response = requests.get(url, timeout=timeout)
    try:
        return response.json()['results']
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError: no "results" entry;
        # TypeError: the decoded JSON is not a mapping.
        return 0

# Initial values of the timer variables, in seconds.
start_time = 60.0
end_time = 1.0

# Length, in seconds, of the window the download loop paces its API calls
# against.
INTERVAL_SECONDS = 60

# Human-readable column names, applied positionally to the fields of
# tickers_daily_schema.
new_column_names = [
    'ticker',
    'volume',
    'volume_weighted',
    'opening_price',
    'closing_price',
    'highest_price',
    'lowest_price',
    'unix_time',
    'no_of_transactions',
]

# Maximum number of API calls allowed inside any INTERVAL_SECONDS window.
# The previous version of this loop issued calls in batches of five per
# window, so the same limit is kept.
# NOTE(review): presumably this mirrors the API plan's rate limit — confirm.
MAX_CALLS_PER_INTERVAL = 5

# Destination of the partitioned Parquet dataset.
OUTPUT_PATH = "gs://stocks-pipeline/raw-data/daily_high_low"


def _write_daily_high_low(records):
    """Turn one day's API records into a DataFrame and append it as Parquet.

    The records are loaded with ``tickers_daily_schema``, renamed to
    ``new_column_names``, given ``datetime``/``year``/``month``/``day``
    columns derived from ``unix_time``, and appended to ``OUTPUT_PATH``
    partitioned by year/month/day with rows sorted by ticker inside each
    partition.

    Args:
        records: Row records accepted by ``spark.createDataFrame`` for
            ``tickers_daily_schema`` (presumably the per-ticker dicts
            returned by ``api_call`` — confirm).
    """
    daily_high_low_df = (
        spark.createDataFrame(records, tickers_daily_schema)
        .toDF(*new_column_names)
        # unix_time is divided by 1000 before conversion, i.e. it is treated
        # as milliseconds while from_unixtime works in seconds.
        # NOTE(review): from_unixtime renders in the Spark session time zone,
        # so the partition values below depend on that setting — confirm.
        .withColumn("datetime", from_unixtime(col("unix_time") / 1000))
        .withColumn("year", year("datetime").cast(IntegerType()))
        .withColumn("month", month("datetime").cast(IntegerType()))
        .withColumn("day", dayofmonth("datetime").cast(IntegerType()))
    )
    # A single repartition by the partition columns is enough; an extra
    # repartition(8) beforehand would only add a shuffle that is immediately
    # discarded.
    (
        daily_high_low_df
        .repartition("year", "month", "day")
        .sortWithinPartitions("ticker")
        .write
        .mode("append")
        .partitionBy("year", "month", "day")
        .format("parquet")
        .option("compression", "snappy")
        .save(OUTPUT_PATH)
    )


# Monotonic timestamps of the most recent API calls (at most
# MAX_CALLS_PER_INTERVAL entries, oldest first).
recent_call_times = []

try:
    while dates:
        # Peek instead of pop: the date is only removed once it has been
        # handled, so a failure leaves it at the head of `dates`.
        day = dates[0]

        # Sliding-window throttle: before making a call that would be the
        # (MAX_CALLS_PER_INTERVAL + 1)-th inside the window, wait until the
        # oldest recorded call is at least INTERVAL_SECONDS old. This applies
        # to every call, including the final, shorter run of dates.
        if len(recent_call_times) == MAX_CALLS_PER_INTERVAL:
            wait_seconds = INTERVAL_SECONDS - (time.monotonic() - recent_call_times[0])
            if wait_seconds > 0:
                time.sleep(wait_seconds)
            recent_call_times.pop(0)
        recent_call_times.append(time.monotonic())

        returned_json = api_call(day)
        if returned_json:
            _write_daily_high_low(returned_json)
        else:
            # api_call returns 0 when the response has no usable "results";
            # an empty or null result is treated the same way.
            print(f"No results returned for {day}; skipping.")

        dates.pop(0)
finally:
    # Always release the Spark session, even if a request or write fails.
    spark.stop()
