In [34]:
## General notebook that builds every table of the StackExchange Gold layer ##

In [3]:
# Environment for PySpark / Hadoop; must run before the SparkSession is created.
# `os` and `sys` are imported in this cell because it sits above the notebook's
# import cell: without them a fresh-kernel "Restart & Run All" stops here with
# a NameError.
import os
import sys

# Run the PySpark driver and workers on the same interpreter as this kernel.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

# Machine-specific location of the local Spark/Hadoop distribution.
# NOTE(review): absolute Windows path - adjust when running on another machine.
os.environ["HADOOP_HOME"] = "C:/Spark/spark-3.5.5-bin-hadoop3"
# Extra JVM option that sets java.library.path to the distribution's bin directory.
os.environ["HADOOP_OPTS"] = "-Djava.library.path=C:/Spark/spark-3.5.5-bin-hadoop3/bin"

In [2]:
from pyspark.sql import SparkSession
import sys, os

In [4]:
from pyspark.sql.functions import (col, date_format, when, sum,
    count, lower, regexp_replace,
    trim, lit, udf, year, month,
    dayofmonth, weekofyear,
    quarter, dayofweek, monotonically_increasing_id, row_number)
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql.window import Window


In [5]:
# Create the notebook-wide SparkSession (getOrCreate returns the active
# session if one already exists).
spark = (
    SparkSession.builder
    .appName("Date Dimension Table")
    # Memory given to each executor.
    .config("spark.executor.memory", "3g")
    # Extra JVM packages requested for the session: spark-xml and parquet-hadoop.
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.16.0,org.apache.parquet:parquet-hadoop:1.15.1")
    # Same interpreter as the kernel for workers and driver.
    .config("spark.pyspark.python", sys.executable)
    .config("spark.pyspark.driver.python", sys.executable)
    # Forwarded to Hadoop as io.native.lib.available=false.
    # NOTE(review): presumably avoids needing Hadoop native libraries on Windows - confirm.
    .config("spark.hadoop.io.native.lib.available", "false")
    .getOrCreate()
)

In [6]:
# --- Date dimension ---
# First and last calendar day covered by the dimension.
start_date = "2009-01-01"
end_date = "2026-12-31"

# Build the whole day-by-day sequence in SQL as one array, then explode it
# into one row per date.
dates_df = spark.sql(
    f"SELECT sequence(to_date('{start_date}'), to_date('{end_date}'), interval 1 day) AS date_list"
).selectExpr("explode(date_list) AS FullDate")

# Derive every calendar attribute from FullDate.
Date_Dim = (
    dates_df
    # Integer key in yyyyMMdd form, e.g. 20090101.
    .withColumn("DateKey", date_format(col("FullDate"), "yyyyMMdd").cast("int"))
    .withColumn("DayOfMonth", dayofmonth(col("FullDate")))
    .withColumn("Month", month(col("FullDate")))
    .withColumn("Year", year(col("FullDate")))
    .withColumn("DayOfWeek", dayofweek(col("FullDate")))
    .withColumn("DayName", date_format(col("FullDate"), "EEEE"))
    .withColumn("WeekOfYear", weekofyear(col("FullDate")))
    .withColumn("MonthName", date_format(col("FullDate"), "MMMM"))
    .withColumn("Quarter", quarter(col("FullDate")))
    # Dates in July or later are assigned to the next calendar year's fiscal year.
    .withColumn(
        "FiscalYear",
        when(month(col("FullDate")) >= 7, year(col("FullDate")) + 1)
        .otherwise(year(col("FullDate"))),
    )
)

# Persist the dimension as Parquet, replacing any previous output.
Date_Dim.write.mode("overwrite").parquet("Gold/Date_Dim")




In [7]:
# --- Tags dimension ---
Tags_Dim_path = "Gold/Tags_Dim"

# Read the Silver-layer tags.
Tags = spark.read.parquet("Silver/Tags")

# A single global window ordered by the source Id drives the surrogate key.
Tags_window_spec = Window.orderBy("Id")

Tags_Dim = (
    Tags
    # Surrogate key: row number in Id order.
    .withColumn("Tag_SK", row_number().over(Tags_window_spec))
    # The source Id becomes the business key.
    .withColumnRenamed("Id", "Tag_BK")
    .drop("TagDescPostId")
    .withColumnRenamed("Count", "Total_Count")
)

# Write the dimension, replacing any previous output.
Tags_Dim.write.mode("overwrite").parquet(Tags_Dim_path)

In [8]:
# --- Users dimension ---
Users_Dim_path = "Gold/Users_Dim"

# Read the Silver-layer users and expose the source key as the business key.
Users = spark.read.parquet("Silver/Users").withColumnRenamed("UsersId_BK", "User_BK")

# Surrogate key: row number over a single global window ordered by User_BK.
Users_window_spec = Window.orderBy("User_BK")
Users_Dim = Users.withColumn("User_SK", row_number().over(Users_window_spec))

# Placeholder ("null") user row: User_SK = -1, User_BK = -2, all other attributes NULL.
# NOTE(review): User_BK is -2 here while the Questions/Answers placeholder rows
# use -1 - presumably to avoid clashing with a real source user id of -1; confirm.
new_row = spark.sql(
    "SELECT -1 AS User_SK,NULL AS Views, -2 AS User_BK,\
                    NULL AS DisplayName, NULL AS Reputation,\
                    NULL AS CreationDate,NUll AS DownVotes, NULL AS LastAccessDate, NULL AS UpVotes"
)
# unionByName matches columns by name, so the column order of the SELECT is irrelevant.
Users_Dim = Users_Dim.unionByName(new_row)

# Write the dimension, replacing any previous output.
Users_Dim.write.mode("overwrite").parquet(Users_Dim_path)

In [9]:
# --- Badges dimension ---
Badges_Dim_path = "Gold/Badges_Dim"

# Read the Silver-layer badges and expose Badge_Desc_Id as the business key.
Badges = spark.read.parquet("Silver/Badges").withColumnRenamed("Badge_Desc_Id", "Badge_BK")

# Surrogate key: row number over a single global window ordered by Badge_BK.
Badges_window_spec = Window.orderBy("Badge_BK")
Badges_Dim = Badges.withColumn(
    "Badge_SK",
    row_number().over(Badges_window_spec),
)

# Write the dimension, replacing any previous output.
Badges_Dim.write.mode("overwrite").parquet(Badges_Dim_path)

In [10]:
# --- Questions dimension ---
# Forward-slash path, matching the other Silver reads in this notebook; a
# backslash separator is only understood as a directory separator on Windows.
Questions = spark.read.parquet("Silver/Questions")

# The source QuestionId becomes the business key.
Questions_Dim = Questions.withColumnRenamed("QuestionId", "Question_BK")

# Surrogate key: row number over a single global window ordered by Question_BK.
Questions_window_spec = Window.orderBy("Question_BK")

Questions_Dim = (
    Questions_Dim
    # These columns are not kept in the dimension.
    .drop("LastActivityDate", "Score", "ViewCount", "AnswerCount", "CommentCount", "Tags")
    .withColumn("Question_SK", row_number().over(Questions_window_spec))
)

# Placeholder ("null") question row: Question_SK = -1, Question_BK = -1,
# all other attributes NULL. unionByName matches the columns by name.
new_row0 = spark.sql("SELECT -1 AS Question_SK,NULL AS Title, -1 AS Question_BK,\
                    NULL AS OwnerUserId, NULL AS Body, NULL AS CreationDate,NULL AS AcceptedAnswerId")
Questions_Dim = Questions_Dim.unionByName(new_row0)

# Write the dimension, replacing any previous output.
Questions_Dim_path = "Gold/Questions_Dim"
Questions_Dim.write.mode("overwrite").parquet(Questions_Dim_path)

In [11]:
# --- Answers dimension ---
# Forward-slash path, matching the other Silver reads in this notebook; a
# backslash separator is only understood as a directory separator on Windows.
Answers = spark.read.parquet("Silver/Answers")

# The source AnswerId becomes the business key.
Answers_Dim = Answers.withColumnRenamed("AnswerId", "Answer_BK")

# Surrogate key: row number over a single global window ordered by Answer_BK.
Answers_window_spec = Window.orderBy("Answer_BK")

Answers_Dim = (
    Answers_Dim
    # These columns are not kept in the dimension.
    .drop("CreationDate", "LastActivityDate", "Score", "CommentCount")
    .withColumn("Answer_SK", row_number().over(Answers_window_spec))
)

# Placeholder ("null") answer row: Answer_SK = -1, Answer_BK = -1,
# all other attributes NULL. unionByName matches the columns by name.
new_row1 = spark.sql("SELECT -1 AS Answer_BK,NULL AS ParentId,\
                     NULL AS OwnerUserId, NULL AS Body, -1 AS Answer_SK")
Answers_Dim = Answers_Dim.unionByName(new_row1)

# Write the dimension, replacing any previous output.
Answers_Dim_path = "Gold/Answers_Dim"
Answers_Dim.write.mode("overwrite").parquet(Answers_Dim_path)

In [None]:
# Answers_Dim column layout (header pasted from a show() output):
# Answer_BK | ParentId | OwnerUserId | Body | Answer_SK