# In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import input_file_name
import pyspark
# Batch job: read the historical per-ticker CSV files, tag every row with the
# ticker parsed from its source file name, derive year/month/day partition
# columns from "Date", and append the result as snappy-compressed Parquet.
spark = SparkSession.builder.appName("StoreHistoricalData").getOrCreate()

try:
    sc = spark.sparkContext
    folder_path = "gs://stocks-pipeline/raw-data/hist/*.csv"

    # Explicit schema applied to every CSV file (no schema inference).
    # NOTE(review): IntegerType is a 32-bit signed int, so a Volume above
    #   2,147,483,647 cannot be represented; DecimalType(15,10) leaves only
    #   5 digits before the decimal point, so prices >= 100000 do not fit.
    #   Widening them (LongType / larger precision) looks advisable, but this
    #   job appends to an existing Parquet dataset, so the types are left
    #   unchanged here -- confirm against the data and migrate deliberately.
    stocks_schema = StructType([
        StructField("Date", TimestampType(), True),
        StructField("Open", DecimalType(15, 10), True),
        StructField("High", DecimalType(15, 10), True),
        StructField("Low", DecimalType(15, 10), True),
        StructField("Close", DecimalType(15, 10), True),
        StructField("Volume", IntegerType(), True),
        StructField("Dividends", StringType(), True),
        StructField("Stock Splits", StringType(), True),
    ])

    # Read all the files matching the glob into a single DataFrame.
    hist_df = (
        spark.read.format("csv")
        .option("header", True)
        .schema(stocks_schema)
        .load(folder_path)
    )

    # Derive the ticker from the source file name. The last path segment is
    # used (rather than a fixed segment index) so the job keeps working if the
    # bucket/folder depth changes; for the current folder_path this selects
    # the same segment as index 5 of the '/'-split path.
    file_name_csv = substring_index(input_file_name(), "/", -1)
    # First run of word characters in the file name. A \w+ match never
    # contains whitespace, so no further trimming is required.
    # NOTE(review): a file name containing '-' or '.' before the extension
    #   would be cut at that character -- confirm the naming convention.
    ticker = regexp_extract(file_name_csv, r"\b\w+\b", 0)

    # year(), month() and dayofmonth() already produce integer columns.
    hist_df = (
        hist_df.withColumn("ticker", ticker)
        .withColumnRenamed("Stock Splits", "Stock_Splits")
        .withColumn("year", year("Date"))
        .withColumn("month", month("Date"))
    )
    hist_df_final = hist_df.withColumn("day", dayofmonth("Date"))

    # Shuffle on the partition columns so rows for one output directory are
    # co-located, then order rows by ticker inside each Spark partition.
    # Append mode adds to whatever already exists at the target path, so
    # re-running the job on the same inputs writes the same rows again.
    (
        hist_df_final.repartition(96, "year", "month", "day")
        .sortWithinPartitions("ticker")
        .write
        .mode("append")
        .partitionBy("year", "month", "day")
        .format("parquet")
        .option("compression", "snappy")
        .save("gs://stocks-pipeline/raw-data/daily_high_low_historical")
    )
finally:
    # Always release the session, even when the read or write fails.
    spark.stop()

                                                                                