In [1]:
import sys, os
# Run pip through the interpreter that is executing this kernel so the package
# is installed into the environment the notebook imports from.
!"{sys.executable}" -m pip install beautifulsoup4



In [2]:
# Point both PySpark interpreter variables (workers and driver) at the Python
# executable that is running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [3]:
# Location of the local Spark/Hadoop distribution. The literal below is only a
# default: a HADOOP_HOME that is already exported takes precedence, so the
# notebook is not tied to one machine's directory layout.
os.environ.setdefault("HADOOP_HOME", "C:/Spark/spark-3.5.5-bin-hadoop3")
# Build the native-library path from HADOOP_HOME instead of repeating the
# directory, so the two settings cannot drift apart.
os.environ["HADOOP_OPTS"] = f"-Djava.library.path={os.environ['HADOOP_HOME']}/bin"

In [4]:
from pyspark.sql import SparkSession

In [5]:
from pyspark.sql.functions import col , date_format , when , sum , count , lower, regexp_replace, trim, lit, udf

In [6]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

In [7]:
!pip install beautifulsoup4



In [8]:
from bs4 import BeautifulSoup

In [9]:
import sys

In [10]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Tags_Spark_Cleansing")
    .config("spark.executor.memory", "3g")
    # Extra JVM artifacts requested for the session: the spark-xml package that
    # provides the "xml" read format, plus a parquet-hadoop artifact.
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.16.0,org.apache.parquet:parquet-hadoop:1.15.1")
    # Use the kernel's interpreter for both Python workers and the driver.
    .config("spark.pyspark.python", sys.executable)
    .config("spark.pyspark.driver.python", sys.executable)
    .config("spark.hadoop.io.native.lib.available", "false")
    .getOrCreate()
)

In [11]:
# Explicit schema applied when Tags.xml is read; every field is nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Id", IntegerType()),
        ("TagName", StringType()),
        ("Count", IntegerType()),
        ("ExcerptPostId", IntegerType()),
        ("WikiPostId", IntegerType()),
    )
])
					

In [13]:
# Load the tags dump with the explicit schema: each <row> element is one
# record, and attribute columns get no name prefix.
df_Tags = (
    spark.read.format("xml")
    .options(rowTag="row", attributePrefix="")
    .schema(schema)
    .load(r"DataSet/Tags.xml")
)

In [14]:
# Print a preview of the loaded tags table.
df_Tags.show()
# Total number of tag rows; as the cell's last expression, the value is displayed.
df_Tags.count()

+---+--------------------+-----+-------------+----------+
| Id|             TagName|Count|ExcerptPostId|WikiPostId|
+---+--------------------+-----+-------------+----------+
|  1|            bayesian| 7844|        20258|     20257|
|  2|               prior|  978|        62158|     62157|
|  3|         elicitation|   12|         NULL|      NULL|
|  5|         open-source|   18|         NULL|      NULL|
|  6|       distributions| 9359|         8046|      8045|
|  9|    machine-learning|19853|         9066|      9065|
| 10|             dataset| 1879|        20490|     20489|
| 11|              sample|  999|        28276|     28275|
| 12|          population|  517|        69287|     69286|
| 15|         measurement|  334|        66319|     66318|
| 16|              scales|  406|       139243|    139242|
| 17|       interpolation|  265|        64387|     64386|
| 18|       multivariable|   39|         NULL|      NULL|
| 21|               anova| 5195|         9251|      9250|
| 23|         

1597

In [15]:
# Percentage of tags whose ExcerptPostId / WikiPostId is NULL, computed in a
# single aggregation, to decide which of the two columns is the better source
# for a tag description. Each alias is generated from its column name so the
# result fields are always labelled after the column they measure.
null_percentage = df_Tags.select(*[
    (count(when(col(column_name).isNull(), 1)) / count(lit(1)) * 100).alias(f"{column_name}_null_percentage")
    for column_name in ("ExcerptPostId", "WikiPostId")
]).collect()[0]

print(f"excerpt posts null percentage: {null_percentage['ExcerptPostId_null_percentage']:.2f}%")
print(f"wiki post null percentage: {null_percentage['WikiPostId_null_percentage']:.2f}%")


excerpt posts null percentage: 24.11%
wiki post null percentage: 24.11%


In [16]:
# Explicit schema applied when Posts.xml is read; every field is nullable.
# Note that this rebinds the name `schema` used earlier for the tags schema.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("AcceptedAnswerId", IntegerType()),
        ("AnswerCount", IntegerType()),
        ("Body", StringType()),
        ("ClosedDate", TimestampType()),
        ("CommentCount", IntegerType()),
        ("CommunityOwnedDate", TimestampType()),
        ("ContentLicense", StringType()),
        ("CreationDate", TimestampType()),
        ("FavoriteCount", IntegerType()),
        ("Id", IntegerType()),
        ("LastActivityDate", TimestampType()),
        ("LastEditDate", TimestampType()),
        ("LastEditorDisplayName", StringType()),
        ("LastEditorUserId", IntegerType()),
        ("OwnerDisplayName", StringType()),
        ("OwnerUserId", IntegerType()),
        ("ParentId", IntegerType()),
        ("PostTypeId", IntegerType()),
        ("Score", IntegerType()),
        ("Tags", StringType()),
        ("Title", StringType()),
        ("ViewCount", IntegerType()),
    )
])
   

In [19]:
# Load the posts dump with the explicit schema: each <row> element is one
# record, and attribute columns get no name prefix.
# The directory is spelled "DataSet", matching the Tags.xml load, so both reads
# resolve to the same folder on case-sensitive filesystems.
df_posts = (
    spark.read.format("xml")
    .option("rowTag", "row")
    .option("attributePrefix", "")
    .schema(schema)
    .load("DataSet/Posts.xml")
)

In [None]:
# Print a preview of the loaded posts table to check that the schema was applied.
df_posts.show()

+----------------+-----------+--------------------+----------+------------+--------------------+--------------+--------------------+-------------+---+--------------------+--------------------+---------------------+----------------+----------------+-----------+--------+----------+-----+--------------------+--------------------+---------+
|AcceptedAnswerId|AnswerCount|                Body|ClosedDate|CommentCount|  CommunityOwnedDate|ContentLicense|        CreationDate|FavoriteCount| Id|    LastActivityDate|        LastEditDate|LastEditorDisplayName|LastEditorUserId|OwnerDisplayName|OwnerUserId|ParentId|PostTypeId|Score|                Tags|               Title|ViewCount|
+----------------+-----------+--------------------+----------+------------+--------------------+--------------+--------------------+-------------+---+--------------------+--------------------+---------------------+----------------+----------------+-----------+--------+----------+-----+--------------------+---------------

In [20]:
# Left-join every tag to the post referenced by its ExcerptPostId so the post
# body can be used as the tag description. Tags without a matching post are
# kept, with NULL in the post columns.
df_tags_with_desc = df_Tags.join(
    df_posts.select("Id", "Body"),  # Id is carried along only as the join key
    on=df_Tags["ExcerptPostId"] == df_posts["Id"],
    how="left",
)

df_tags_with_desc.show()

+---+--------------------+-----+-------------+----------+------+--------------------+
| Id|             TagName|Count|ExcerptPostId|WikiPostId|    Id|                Body|
+---+--------------------+-----+-------------+----------+------+--------------------+
| 10|             dataset| 1879|        20490|     20489| 20490|Requests for data...|
| 28|cumulative-distri...|  776|        40912|     40911| 40912|Cumulative distri...|
| 26|         frequentist|  453|        87156|     87155| 87156|In the frequentis...|
| 30|         time-series|14091|         3017|      3016|  3017|Time series are d...|
|  3|         elicitation|   12|         NULL|      NULL|  NULL|                NULL|
|  5|         open-source|   18|         NULL|      NULL|  NULL|                NULL|
| 18|       multivariable|   39|         NULL|      NULL|  NULL|                NULL|
| 37|          algorithms|  942|        21863|     21862| 21863|An unambiguous li...|
| 23|              census|   55|        63450|     634

In [21]:
# Remove the posts-side Id column brought in by the join; the tag's own Id
# column stays.
df_tags_with_desc = df_tags_with_desc.drop(df_posts["Id"])
df_tags_with_desc.show()

+---+--------------------+-----+-------------+----------+--------------------+
| Id|             TagName|Count|ExcerptPostId|WikiPostId|                Body|
+---+--------------------+-----+-------------+----------+--------------------+
| 10|             dataset| 1879|        20490|     20489|Requests for data...|
| 28|cumulative-distri...|  776|        40912|     40911|Cumulative distri...|
| 26|         frequentist|  453|        87156|     87155|In the frequentis...|
| 30|         time-series|14091|         3017|      3016|Time series are d...|
|  3|         elicitation|   12|         NULL|      NULL|                NULL|
|  5|         open-source|   18|         NULL|      NULL|                NULL|
| 18|       multivariable|   39|         NULL|      NULL|                NULL|
| 37|          algorithms|  942|        21863|     21862|An unambiguous li...|
| 23|              census|   55|        63450|     63449|A census is a stu...|
| 17|       interpolation|  265|        64387|     6

In [22]:
# Give the joined post columns tag-oriented names.
df_tags_with_desc = (
    df_tags_with_desc
    .withColumnRenamed("Body", "TagDesc")
    .withColumnRenamed("ExcerptPostId", "TagDescPostId")
)

In [23]:
# Drop WikiPostId; the description was joined via the excerpt post id instead.
df_tags_with_desc= df_tags_with_desc.drop('WikiPostId')

In [24]:
# Print the table without truncating cell values so the full TagDesc text is visible.
df_tags_with_desc.show(truncate=False)

+---+--------------------------------+-----+-------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Id |TagName                         |Count|TagDescPostId|TagDesc                                                                                                                                                                                                                                                                                             |
+---+--------------------------------+-----+-------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [25]:
# Destination directory for the cleaned tags table.
output_path = "SilverDataSet/Tags"

# Write the table as Parquet, replacing whatever is already at the destination.
df_tags_with_desc.write.parquet(output_path, mode="overwrite")

In [26]:
# List the column names of the table that was just written.
print(df_tags_with_desc.columns)


['Id', 'TagName', 'Count', 'TagDescPostId', 'TagDesc']


In [27]:
# Print a preview of the final tags-with-description table.
df_tags_with_desc.show()

+---+--------------------+-----+-------------+--------------------+
| Id|             TagName|Count|TagDescPostId|             TagDesc|
+---+--------------------+-----+-------------+--------------------+
| 10|             dataset| 1879|        20490|Requests for data...|
| 28|cumulative-distri...|  776|        40912|Cumulative distri...|
| 26|         frequentist|  453|        87156|In the frequentis...|
| 30|         time-series|14091|         3017|Time series are d...|
|  3|         elicitation|   12|         NULL|                NULL|
|  5|         open-source|   18|         NULL|                NULL|
| 18|       multivariable|   39|         NULL|                NULL|
| 37|          algorithms|  942|        21863|An unambiguous li...|
| 23|              census|   55|        63450|A census is a stu...|
| 17|       interpolation|  265|        64387|Given a set of bi...|
| 12|          population|  517|        69287|A population is t...|
|  9|    machine-learning|19853|         9066|Ma

In [28]:
# NOTE(review): the posts-side Id was already dropped in an earlier cell, and
# the table has been written out since, so this looks like a leftover repeat of
# that step. Confirm it has no effect and remove it.
df_tags_with_desc = df_tags_with_desc.drop(df_posts.Id)
