In [1]:
## Creates all of the StackExchange Gold Layer fact tables ##

In [1]:
import sys, os
# Make PySpark's worker and driver use the interpreter running this kernel,
# and point Hadoop at the local Spark distribution (and its bin directory for
# the Java library path).
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
        "HADOOP_HOME": "C:/Spark/spark-3.5.5-bin-hadoop3",
        "HADOOP_OPTS": "-Djava.library.path=C:/Spark/spark-3.5.5-bin-hadoop3/bin",
    }
)

In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (col, date_format, when, sum,
    count, lower, regexp_replace,
    trim, lit, udf, year, month,
    dayofmonth, weekofyear, when,
    quarter, dayofweek, monotonically_increasing_id, row_number)
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType
from pyspark.sql.window import Window


In [3]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Fact Tables")
    .config("spark.executor.memory", "3g")
    # Extra packages: spark-xml and parquet-hadoop.
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.16.0,org.apache.parquet:parquet-hadoop:1.15.1")
    # Use the kernel's own interpreter for both workers and driver.
    .config("spark.pyspark.python", sys.executable)
    .config("spark.pyspark.driver.python", sys.executable)
    .config("spark.hadoop.io.native.lib.available", "false")
    .getOrCreate()
)

In [4]:
# Load every Gold-layer dimension table; the names on the left line up
# one-to-one, in order, with the parquet paths listed below.
Tags_Dim, Users_Dim, Questions_Dim, Answers_Dim, Date_Dim, Badges_Dim = (
    spark.read.parquet(dim_path)
    for dim_path in (
        "Gold/Tags_Dim",
        "Gold/Users_Dim",
        "Gold/Questions_Dim",
        "Gold/Answers_Dim",
        "Gold/Date_Dim",
        "Gold/Badges_Dim",
    )
)


In [38]:
# Question Fact Generation
# Each left join below swaps a natural key on the silver Questions rows for the
# matching Gold surrogate key; rows without a match keep a null foreign key.
# The path uses "/" (not "\") so it matches the Gold/... paths and is not tied
# to Windows path separators.
Questions = spark.read.parquet("Silver/Questions")
Questions_Fact = Questions.join(
    # QuestionId -> Question_FK
    Questions_Dim.select("Question_BK", col("Question_SK").alias("Question_FK")),
    Questions["QuestionId"] == Questions_Dim["Question_BK"],
    "left"
).join(
    # OwnerUserId -> User_FK
    Users_Dim.select("User_BK", col("User_SK").alias("User_FK")),
    Questions["OwnerUserId"] == Users_Dim["User_BK"],
    "left"
).join(
    # CreationDate -> CreationDate_FK; the alias keeps the two Date_Dim joins apart
    Date_Dim.select(
        col("FullDate").alias("CreationDate_FullDate"),
        col("DateKey").alias("CreationDate_FK")
    ),
    Questions["CreationDate"] == col("CreationDate_FullDate"),
    "left"
).join(
    # LastActivityDate -> LastActivityDate_FK
    Date_Dim.select(
        col("FullDate").alias("LastActivityDate_FullDate"),
        col("DateKey").alias("LastActivityDate_FK")
    ),
    Questions["LastActivityDate"] == col("LastActivityDate_FullDate"),
    "left"
).drop(
    # Join helpers, natural keys, and the columns not kept on the fact table.
    "CreationDate_FullDate", "LastActivityDate_FullDate",
    "QuestionId", "Question_BK",
    "OwnerUserId", "CreationDate", "LastActivityDate",
    "AcceptedAnswerId", "User_BK",
    "Body", "Title", "Tags"
)
# Writing the file (replaces any previous output at this path)
Questions_Fact_path = "Gold/Questions_Fact"
Questions_Fact.write.mode("overwrite").parquet(Questions_Fact_path)

In [46]:
# Answers Fact Generation
# Each left join below swaps a natural key on the silver Answers rows for the
# matching Gold surrogate key; rows without a match keep a null foreign key.
# The path uses "/" (not "\") so it matches the Gold/... paths and is not tied
# to Windows path separators.
Answers = spark.read.parquet("Silver/Answers")

Answers_Fact = Answers.join(
    # AnswerId -> Answer_FK
    Answers_Dim.select("Answer_BK", col("Answer_SK").alias("Answer_FK")),
    Answers["AnswerId"] == Answers_Dim["Answer_BK"],
    "left"
).join(
    # OwnerUserId -> User_FK
    Users_Dim.select("User_BK", col("User_SK").alias("User_FK")),
    Answers["OwnerUserId"] == Users_Dim["User_BK"],
    "left"
).join(
    # CreationDate -> CreationDate_FK; the alias keeps the two Date_Dim joins apart
    Date_Dim.select(
        col("FullDate").alias("CreationDate_FullDate"),
        col("DateKey").alias("CreationDate_FK")
    ),
    Answers["CreationDate"] == col("CreationDate_FullDate"),
    "left"
).join(
    # LastActivityDate -> LastActivityDate_FK
    Date_Dim.select(
        col("FullDate").alias("LastActivityDate_FullDate"),
        col("DateKey").alias("LastActivityDate_FK")
    ),
    Answers["LastActivityDate"] == col("LastActivityDate_FullDate"),
    "left"
).join(
    # ParentId (the answered question) -> ParentQuestion_FK
    Questions_Dim.select("Question_BK", col("Question_SK").alias("ParentQuestion_FK")),
    Answers["ParentId"] == Questions_Dim["Question_BK"],
    "left"
).drop(
    # Join helpers, natural keys, and the columns not kept on the fact table.
    "CreationDate_FullDate", "LastActivityDate_FullDate",
    "AnswerId", "Answer_BK",
    "OwnerUserId", "CreationDate", "LastActivityDate",
    "User_BK",
    "Body", "Question_BK", "ParentId"
)
# Writing the file (replaces any previous output at this path)
Answers_Fact_path = "Gold/Answers_Fact"
Answers_Fact.write.mode("overwrite").parquet(Answers_Fact_path)

In [71]:
# Badge Fact Generation
# Each left join below swaps a natural key on the silver Badges rows for the
# matching Gold surrogate key; rows without a match keep a null foreign key.
# The path uses "/" (not "\") so it matches the Gold/... paths and is not tied
# to Windows path separators.
Badges = spark.read.parquet("Silver_Facts/Badges")
# Rename the silver columns to the names the joins below refer to.
Badges = Badges.withColumnRenamed("User_fk", "UserId").withColumnRenamed("Badge_Desc_Id", "BadgeId")


Badges_Fact = Badges.join(
    # UserId -> User_FK
    Users_Dim.select("User_BK", col("User_SK").alias("User_FK")),
    Badges["UserId"] == Users_Dim["User_BK"],
    "left"
).join(
    # Assigneddate -> AssignedDate_FK
    Date_Dim.select(
        col("FullDate").alias("Assigneddate_FullDate"),
        col("DateKey").alias("AssignedDate_FK")),
    Badges["Assigneddate"] == col("Assigneddate_FullDate"),
    "left"
).join(
    # BadgeId -> Badge_FK. Match on the business key (Badge_BK), as every other
    # dimension lookup in this notebook does; the previous condition compared
    # BadgeId against the surrogate-key alias Badge_FK instead.
    # NOTE(review): assumes Badge_Desc_Id carries the same identifier as
    # Badges_Dim.Badge_BK - confirm against the silver schema.
    Badges_Dim.select(
        "Badge_BK", col("Badge_SK").alias("Badge_FK")),
    Badges["BadgeId"] == col("Badge_BK"),
    "left"
# Flag column: 1 when TagBased is true, 0 otherwise (including null).
).withColumn("IsTagBased", when(col("TagBased") == True, 1).otherwise(0)
# NOTE(review): "AssingingBadge_BK" looks misspelled; kept as-is because it may
# be the real silver column name - verify.
).drop("AssingingBadge_BK", "UserId", "User_BK", "Assigneddate", "TagBased", "BadgeId", "Assigneddate_FullDate", "Badge_BK")
# Writing the file (replaces any previous output at this path)
Badges_Fact_path = "Gold/Badges_Fact"
Badges_Fact.write.mode("overwrite").parquet(Badges_Fact_path)

In [75]:
# Load the silver Comments table and print a preview of its rows.
# The path uses "/" (not "\") so it matches the Gold/... paths and is not tied
# to Windows path separators.
Comments = spark.read.parquet("Silver_Facts/Comments")
Comments.show()

+---+------+------+-----+--------------------+------------+--------------+
| Id|PostId|UserId|Score|                Text|CreationDate|ContentLicense|
+---+------+------+-----+--------------------+------------+--------------+
|  1|     3|    13|    7|Could be a poster...|  2010-07-19|  CC BY-SA 2.5|
|  2|     5|    13|    0|Yes, R is nice- b...|  2010-07-19|  CC BY-SA 2.5|
|  3|     9|    13|    1|Again- why?  How ...|  2010-07-19|  CC BY-SA 2.5|
|  4|     5|    37|   11|It's mature, well...|  2010-07-19|  CC BY-SA 2.5|
|  6|    14|    23|   10|why ask the quest...|  2010-07-19|  CC BY-SA 2.5|
|  7|    18|    36|    1|also the US censu...|  2010-07-19|  CC BY-SA 2.5|
|  9|    16|    78|    1|Andrew Gelman has...|  2010-07-19|  CC BY-SA 2.5|
| 10|    23|    -2|    8|I am not sure I u...|  2010-07-19|  CC BY-SA 2.5|
| 11|    43|     5|    5|There are many R ...|  2010-07-19|  CC BY-SA 2.5|
| 12|    38|    54|    0|That's just an ex...|  2010-07-19|  CC BY-SA 2.5|
| 13|    20|    24|    2|

In [77]:
# Preview the Answers dimension; the arguments spell out show()'s defaults.
Answers_Dim.show(n=20, truncate=True)

+---------+--------+-----------+--------------------+---------+
|Answer_BK|ParentId|OwnerUserId|                Body|Answer_SK|
+---------+--------+-----------+--------------------+---------+
|        5|       3|         23|the rproject http...|        1|
|        9|       3|         50|incanter is a clo...|        2|
|       12|       7|          5|see my response t...|        3|
|       13|       6|         23|machine learning ...|        4|
|       14|       3|         36|i second that jay...|        5|
|       15|       1|          6|john cook gives s...|        6|
|       16|       3|          8|two projects spri...|        7|
|       18|       7|         36|also see the uci ...|        8|
|       19|       7|         55|gapminder has a n...|        9|
|       20|       2|         37|the assumption of...|       10|
|       24|       3|         61|for doing a varie...|       11|
|       28|       3|         -2|gsl for those of ...|       12|
|       29|      17|         36|continge