In [27]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, MapType

# Create (or reuse) the Spark session for this notebook. The spark-xml package
# is requested through spark.jars.packages; the loaders below use format("xml").
spark = (
    SparkSession.builder
    .appName("XML Reader")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0")
    .getOrCreate()
)

In [29]:
# Explicit schema for Comments.xml. Field names carry a leading underscore
# because the Comments reader does not override "attributePrefix".
# Each entry is (field name, Spark type, nullable).
comments_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in (
        ("_Id", StringType(), False),
        ("_PostId", StringType(), True),
        ("_UserId", StringType(), True),
        ("_UserDisplayName", StringType(), True),
        ("_Score", IntegerType(), True),
        ("_Text", StringType(), True),
        ("_CreationDate", TimestampType(), True),
        ("_ContentLicense", StringType(), True),
    )
])

# Explicit schema for Votes.xml; every field is nullable. Names have no prefix
# because the Votes reader sets "attributePrefix" to "".
votes_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Id", IntegerType()),
        ("PostId", IntegerType()),
        ("VoteTypeId", IntegerType()),
        ("CreationDate", TimestampType()),
        ("UserId", IntegerType()),
        ("BountyAmount", IntegerType()),
    )
])

# Explicit schema for Posts.xml; every field is nullable. Names have no prefix
# because the Posts reader sets "attributePrefix" to "".
posts_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("AcceptedAnswerId", IntegerType()),
        ("AnswerCount", IntegerType()),
        ("Body", StringType()),
        ("ClosedDate", TimestampType()),
        ("CommentCount", IntegerType()),
        ("CommunityOwnedDate", TimestampType()),
        ("ContentLicense", StringType()),
        ("CreationDate", TimestampType()),
        ("FavoriteCount", IntegerType()),
        ("Id", IntegerType()),
        ("LastActivityDate", TimestampType()),
        ("LastEditDate", TimestampType()),
        ("LastEditorDisplayName", StringType()),
        ("LastEditorUserId", IntegerType()),
        ("OwnerDisplayName", StringType()),
        ("OwnerUserId", IntegerType()),
        ("ParentId", IntegerType()),
        ("PostTypeId", IntegerType()),
        ("Score", IntegerType()),
        ("Tags", StringType()),
        ("Title", StringType()),
        ("ViewCount", IntegerType()),
    )
])

In [30]:
def _read_rows(path, schema=None, attribute_prefix=None):
    """Load an XML file through the "xml" source using "row" as the rowTag.

    Args:
        path: File path handed to ``load``.
        schema: Optional StructType; when omitted, no schema is passed to the
            reader.
        attribute_prefix: Optional value for the "attributePrefix" option; when
            omitted, the option is not set at all.

    Returns:
        The DataFrame produced by the reader.
    """
    reader = spark.read.format("xml").option("rowTag", "row")
    if attribute_prefix is not None:
        reader = reader.option("attributePrefix", attribute_prefix)
    if schema is not None:
        reader = reader.schema(schema)
    return reader.load(path)


# Comments: explicit schema, attribute prefix not overridden.
Comments_df = _read_rows("Comments.xml", schema=comments_schema)
# Badges: no schema supplied, attribute prefix not overridden.
badges_df = _read_rows("badges.xml")
# Votes and Posts: explicit schema, attribute prefix set to the empty string.
votes_df = _read_rows("Votes.xml", schema=votes_schema, attribute_prefix="")
df_posts = _read_rows("Posts.xml", schema=posts_schema, attribute_prefix="")

# Comments Table

**Calculate Nulls**

In [31]:
# Percentage of nulls per column, computed inside Spark in a single aggregation.
# Only one summary row is brought to the driver, instead of converting the
# entire Comments table to pandas and then running a separate count() job.
comments_null_counts = Comments_df.select(
    count(lit(1)).alias("__total_rows"),
    *[count(when(col(c).isNull(), 1)).alias(c) for c in Comments_df.columns]
).toPandas().iloc[0]

# Last expression of the cell: a pandas Series indexed by column name.
# rename(None) clears the Series name left over from the row label.
(
    comments_null_counts.drop("__total_rows")
    / comments_null_counts["__total_rows"]
    * 100
).rename(None)

_Id                  0.000000
_PostId              0.000000
_UserId              1.607733
_UserDisplayName    98.391386
_Score               0.000000
_Text                0.000000
_CreationDate        0.000000
_ContentLicense      0.000000
dtype: float64

**Renaming Columns**

In [32]:
# Strip the leading underscore from every Comments column name.
# The renames are independent of each other; listed here in schema order.
Comments_df = (
    Comments_df
    .withColumnRenamed("_Id", "Id")
    .withColumnRenamed("_PostId", "PostId")
    .withColumnRenamed("_UserId", "UserId")
    .withColumnRenamed("_UserDisplayName", "UserDisplayName")
    .withColumnRenamed("_Score", "Score")
    .withColumnRenamed("_Text", "Text")
    .withColumnRenamed("_CreationDate", "CreationDate")
    .withColumnRenamed("_ContentLicense", "ContentLicense")
)

**Dropping Columns**

In [33]:
# UserDisplayName is almost entirely null (see the null percentages above), so remove it.
Comments_df = Comments_df.drop("UserDisplayName")

**Formatting Date**

In [34]:
# Render CreationDate as a "yyyy-MM-dd" string. A four-digit year keeps the
# century unambiguous and matches the pattern used for the badges and votes
# tables in this notebook (the previous "yy-MM-dd" produced two-digit years).
Comments_df = Comments_df.withColumn(
    "CreationDate", date_format("CreationDate", "yyyy-MM-dd")
)

**Handling Nulls**

In [35]:
# Replace missing UserId values with the placeholder -2.
Comments_df = Comments_df.na.fill({"UserId": -2})

**Edit Data Types**

In [36]:
# Make sure the three identifier columns are typed as strings.
Comments_df = (
    Comments_df
    .withColumn("Id", Comments_df["Id"].cast(StringType()))
    .withColumn("UserId", Comments_df["UserId"].cast(StringType()))
    .withColumn("PostId", Comments_df["PostId"].cast(StringType()))
)

# Badges Table

**Calculate Nulls**

In [37]:
from pyspark.sql.functions import col, count, when

# Percentage of nulls for every badges column, as a one-row DataFrame.
# The row total comes from count(lit(1)) inside the same aggregation, so the
# whole computation is one Spark job instead of an extra badges_df.count()
# action per column.
null_perc = badges_df.select([
    (count(when(col(c).isNull(), c)) / count(lit(1)) * 100).alias(c)
    for c in badges_df.columns
])

**Dropping Columns**

In [38]:
# Badge description dimension: badges without the tag flag, date and user columns.
Badge_Disc_Dim = badges_df.drop("_TagBased", "_Date", "_UserId")

**Renaming Columns for Badge Disc Table**

In [39]:
# Remove leading underscores from every column name in one positional rename,
# then expose the badge Id as the dimension's business key.
Badge_Disc_Dim = Badge_Disc_Dim.toDF(
    *[name.lstrip("_") for name in Badge_Disc_Dim.columns]
)
Badge_Disc_Dim = Badge_Disc_Dim.withColumnRenamed("Id", "Badge_Disc_BK")

**Formatting Date**

In [40]:
from pyspark.sql.functions import date_format, to_timestamp ,to_date

# Reduce _Date to a "yyyy-MM-dd" string.
# to_timestamp is called without an explicit pattern: the previous
# "yyyy-MM-dd HH:mm:ss" pattern cannot parse ISO-8601 strings (with a "T"
# separator or fractional seconds), so it only worked when _Date had already
# been inferred as a timestamp. The default conversion gives the same result
# for timestamp input and also handles string input.
badges_df = badges_df.withColumn(
    "_Date", date_format(to_timestamp("_Date"), "yyyy-MM-dd")
)

**Renaming Columns for Badges Fact Table**

In [41]:
from pyspark.sql.functions import col ,to_date

# Rename the badges columns to their fact-table names.
# The four renames touch different columns, so their order does not matter.
badges_df = (
    badges_df
    .withColumnRenamed("_Id", "Badge_Disc_fk")
    .withColumnRenamed("_UserId", "User_fk")
    .withColumnRenamed("_TagBased", "TagBased")
    .withColumnRenamed("_Date", "date")
)

**Casting Date to Date Type and Selecting Fact Columns**

In [42]:
# Parse the "yyyy-MM-dd" text in `date` into a date value.
badges_df = badges_df.withColumn(
    "date", to_date(badges_df["date"], "yyyy-MM-dd")
)

# Fact table: keep only the four fact columns, in this order.
badges_fact = badges_df.select(["date", "Badge_Disc_fk", "TagBased", "User_fk"])

# Votes Table

In [43]:
# %pip install beautifulsoup4

In [44]:
# from pyspark.sql.functions import udf
# from pyspark.sql.types import StringType
# from bs4 import BeautifulSoup
# import sys

**Calculate Nulls**

In [45]:
# Null percentage of the two sparsely populated votes columns, returned as a
# single Row with one field per percentage.
_votes_total = count(lit(1))
_user_id_nulls = count(when(col("UserId").isNull(), 1))
_bounty_nulls = count(when(col("BountyAmount").isNull(), 1))

null_perc = votes_df.agg(
    (_user_id_nulls / _votes_total * 100).alias("UserId_null_percentage"),
    (_bounty_nulls / _votes_total * 100).alias("BountyAmount_null_percentage"),
).first()

**Dropping Columns**

In [46]:
# Remove the two columns whose null percentages were measured in the previous cell.
votes_df = votes_df.drop(*["UserId", "BountyAmount"])

**Joining Votes with Posts**

In [47]:
# Attach the owner of the voted-on post to every vote. A left join keeps votes
# whose PostId has no matching post.
_post_owners = df_posts.select("Id", "OwnerUserId")
df_votes_with_owner = (
    votes_df
    .join(_post_owners, on=votes_df["PostId"] == _post_owners["Id"], how="left")
    # Drop the post-side Id only; the vote's own Id column stays.
    .drop(_post_owners["Id"])
)


**Renaming Columns**

In [48]:
# Name the joined owner column after what it represents on a vote row.
df_votes_with_owner = df_votes_with_owner.withColumnRenamed(
    existing="OwnerUserId", new="PostOwnerId"
)

**Formatting Date**

In [49]:
# Render the vote's CreationDate as a "yyyy-MM-dd" string.
df_votes_with_owner = df_votes_with_owner.withColumn(
    "CreationDate",
    date_format(df_votes_with_owner["CreationDate"], "yyyy-MM-dd"),
)
