In [None]:
from pyspark.sql import SparkSession , Row

In [4]:
from pyspark.sql.functions import *#col , when , sum , count , lower, regexp_replace, trim , split , substring , length , expr , concat_ws

In [5]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType

In [6]:
import sys , os
# Install BeautifulSoup using the same interpreter that runs this kernel (sys.executable).
!{sys.executable} -m pip install beautifulsoup4



In [7]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from bs4 import BeautifulSoup

In [8]:
# Point both the PySpark workers and the driver at the interpreter running this notebook.
os.environ.update({
    "PYSPARK_PYTHON": sys.executable,
    "PYSPARK_DRIVER_PYTHON": sys.executable,
})

In [9]:
# Hadoop settings for this machine: home directory and the java.library.path passed via HADOOP_OPTS.
# NOTE(review): both values are absolute, machine-specific Windows paths.
os.environ.update({
    "HADOOP_HOME": "C:/Spark/spark-3.5.5-bin-hadoop3",
    "HADOOP_OPTS": "-Djava.library.path=C:/Spark/spark-3.5.5-bin-hadoop3/bin",
})

In [10]:
# .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.16.0")\
# Create (or reuse) the Spark session for the cleansing job. It requests the spark-xml and
# parquet-hadoop packages, pins the Python interpreter to the kernel's executable, and sets
# io.native.lib.available to false for Hadoop.
spark = (
    SparkSession.builder
    .appName("posts_Spark_Cleansing")
    .config("spark.executor.memory", "3g")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.16.0,org.apache.parquet:parquet-hadoop:1.15.1")
    .config("spark.pyspark.python", sys.executable)
    .config("spark.pyspark.driver.python", sys.executable)
    .config("spark.hadoop.io.native.lib.available", "false")
    .getOrCreate()
)

In [11]:
# Explicit schema applied when reading Posts.xml; every field is declared nullable.
# Fields are listed as (name, Spark type) pairs and turned into StructFields in the same order.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ("AcceptedAnswerId", IntegerType),
        ("AnswerCount", IntegerType),
        ("Body", StringType),
        ("ClosedDate", TimestampType),
        ("CommentCount", IntegerType),
        ("CommunityOwnedDate", TimestampType),
        ("ContentLicense", StringType),
        ("CreationDate", TimestampType),
        ("FavoriteCount", IntegerType),
        ("Id", IntegerType),
        ("LastActivityDate", TimestampType),
        ("LastEditDate", TimestampType),
        ("LastEditorDisplayName", StringType),
        ("LastEditorUserId", IntegerType),
        ("OwnerDisplayName", StringType),
        ("OwnerUserId", IntegerType),
        ("ParentId", IntegerType),
        ("PostTypeId", IntegerType),
        ("Score", IntegerType),
        ("Tags", StringType),
        ("Title", StringType),
        ("ViewCount", IntegerType),
    ]
])
   

In [12]:
# Read the posts dump through the "xml" data source: each <row> element is one record and
# attribute names are used as column names without a prefix.
df_posts = (
    spark.read.format("xml")
    .options(rowTag="row", attributePrefix="")
    .schema(schema)
    .load(r"Dataset/Posts.xml")
)

In [13]:
#df_posts.show()

In [75]:
# Print the schema of the loaded posts to confirm the explicit schema was applied.
df_posts.printSchema()

root
 |-- AcceptedAnswerId: integer (nullable = true)
 |-- AnswerCount: integer (nullable = true)
 |-- Body: string (nullable = true)
 |-- ClosedDate: timestamp (nullable = true)
 |-- CommentCount: integer (nullable = true)
 |-- CommunityOwnedDate: timestamp (nullable = true)
 |-- ContentLicense: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- FavoriteCount: integer (nullable = true)
 |-- Id: integer (nullable = true)
 |-- LastActivityDate: timestamp (nullable = true)
 |-- LastEditDate: timestamp (nullable = true)
 |-- LastEditorDisplayName: string (nullable = true)
 |-- LastEditorUserId: integer (nullable = true)
 |-- OwnerDisplayName: string (nullable = true)
 |-- OwnerUserId: integer (nullable = true)
 |-- ParentId: integer (nullable = true)
 |-- PostTypeId: integer (nullable = true)
 |-- Score: integer (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- ViewCount: integer (nullable = true)



## Splitting posts into questions and answers

In [15]:
# Questions are the posts whose PostTypeId is 1.
df_Questions = df_posts.filter(df_posts["PostTypeId"] == 1)

In [16]:
#df_Questions.show()

In [17]:
# Answers are the posts whose PostTypeId is 2.
df_Answers = df_posts.filter(df_posts["PostTypeId"] == 2)

In [18]:
#df_Answers.show()

## Working with Questions 

### Filtering columns

In [19]:
# List the columns available on questions before choosing the subset to keep.
df_Questions.printSchema()

root
 |-- AcceptedAnswerId: integer (nullable = true)
 |-- AnswerCount: integer (nullable = true)
 |-- Body: string (nullable = true)
 |-- ClosedDate: timestamp (nullable = true)
 |-- CommentCount: integer (nullable = true)
 |-- CommunityOwnedDate: timestamp (nullable = true)
 |-- ContentLicense: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- FavoriteCount: integer (nullable = true)
 |-- Id: integer (nullable = true)
 |-- LastActivityDate: timestamp (nullable = true)
 |-- LastEditDate: timestamp (nullable = true)
 |-- LastEditorDisplayName: string (nullable = true)
 |-- LastEditorUserId: integer (nullable = true)
 |-- OwnerDisplayName: string (nullable = true)
 |-- OwnerUserId: integer (nullable = true)
 |-- ParentId: integer (nullable = true)
 |-- PostTypeId: integer (nullable = true)
 |-- Score: integer (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- ViewCount: integer (nullable = true)



In [20]:
# Keep only the columns that are carried forward for questions.
df_Questions_Filtered_cols = df_Questions.select(
    'Id',
    'OwnerUserId',
    'CreationDate',
    'LastActivityDate',
    'AcceptedAnswerId',
    'Body',
    'Title',
    'Tags',
    'Score',
    'ViewCount',
    'AnswerCount',
    'CommentCount',
)

In [21]:
#df_Questions_Filtered_cols.show()

### Dealing with nulls 

In [None]:
# One-row summary with the number of nulls in each selected question column.
# `sum` here is the Spark aggregate brought in by the star import, not the Python builtin.
null_counts = df_Questions_Filtered_cols.select(
    *[
        sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
        for column_name in df_Questions_Filtered_cols.columns
    ]
)
#null_counts.show()

#### Replace null OwnerUserId with -2 and null AcceptedAnswerId with -1

In [23]:
# Replace missing ids with integer sentinels: -2 for a null OwnerUserId and -1 for a null
# AcceptedAnswerId. Integer literals match the IntegerType columns, so the fill does not
# depend on an implicit string-to-integer cast.
df_Questions_Handling_Nulls = df_Questions_Filtered_cols.fillna({
    'OwnerUserId': -2,
    'AcceptedAnswerId': -1,
})

In [24]:
#df_Questions_Handling_Nulls.show()

In [25]:
# Recount nulls per column after the fill, to verify the sentinel replacement.
null_counts_again = df_Questions_Handling_Nulls.select(
    *[
        sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
        for column_name in df_Questions_Handling_Nulls.columns
    ]
)

In [26]:
#null_counts_again.show()

#### Handling duplicates

In [27]:
# (OwnerUserId, Body) combinations that appear on more than one question.
duplicates = (
    df_Questions_Handling_Nulls
    .groupBy('OwnerUserId', 'Body')
    .count()
    .where("count > 1")
)
#duplicates.show()

In [28]:
#duplicates.count()

In [29]:
# Keep a single row for each (OwnerUserId, Body) combination.
df_Questions_dropping_duplicates = df_Questions_Handling_Nulls.drop_duplicates(
    subset=['OwnerUserId', 'Body']
)

In [30]:
# Re-run the duplicate check on the de-duplicated questions; it is expected to return no rows.
duplicates_validation = (
    df_Questions_dropping_duplicates
    .groupBy('OwnerUserId', 'Body')
    .count()
    .where("count > 1")
)
#duplicates_validation.show()

#### Converting Date types 

In [31]:
from pyspark.sql.functions import to_date, col, to_timestamp


In [32]:
# Truncate both timestamp columns to calendar dates.
# No format pattern is passed: the columns are already TimestampType, and a pattern is only
# meaningful when parsing strings (in Spark's datetime patterns "DD" is day-of-year, not
# day-of-month, so "yyyy-MM-DD" would not describe these values anyway).
df_Questions_Date_only = (
    df_Questions_dropping_duplicates
    .withColumn("CreationDate", to_date(col("CreationDate")))
    .withColumn("LastActivityDate", to_date(col("LastActivityDate")))
)


In [33]:
#df_Questions_Date_only.show()

In [34]:
#df_Questions_Date_only.select('Title').show(truncate=False)

In [35]:
#df_Questions_Date_only.collect()
#df_Questions_Date_only.limit(10).show(truncate=False)

### Handling Body (HTML to Text)

In [36]:
def html_to_text(html):
    """Return the text content of an HTML string with the markup removed.

    ``None`` is passed through unchanged; any other value is parsed with
    BeautifulSoup's ``html.parser`` and its text is returned.
    """
    return None if html is None else BeautifulSoup(html, "html.parser").get_text()


In [37]:
# Wrap html_to_text as a Spark UDF that returns a string column.
html_to_text_udf = udf(f=html_to_text, returnType=StringType())


In [38]:
# Replace the HTML question body with its plain-text content.
df_Questions_HTMLToText = df_Questions_Date_only.withColumn(
    "Body",
    html_to_text_udf(df_Questions_Date_only["Body"]),
)



In [39]:
#df_Questions_HTMLToText.show(truncate=False)

In [40]:
# Normalise the plain-text body: lowercase it, drop every character that is not an ASCII
# letter, digit or whitespace, then collapse whitespace runs to a single space and trim.
# The whitespace collapse runs after the character filter so that removed punctuation
# (e.g. "foo - bar") cannot leave double spaces behind.
df_Questions_refined_text = df_Questions_HTMLToText.withColumn(
    "Body",
    trim(
        regexp_replace(
            regexp_replace(
                lower(col("Body")),
                "[^a-zA-Z0-9\\s]",
                ""
            ),
            "\\s+",
            " "
        )
    )
)

In [41]:
#df_Questions_refined_text.show()

In [42]:
#df_Questions_refined_text.select('Body').show(truncate = False)

#### Dealing with Tags

In [43]:
#df_Questions_refined_text.select('Tags').show()

In [44]:
# Drop the first and last character of Tags (the outer delimiters of the tag list).
df_Questions_final = df_Questions_refined_text.withColumn(
    "Tags",
    expr("substring(Tags, 2, length(Tags) - 2)"),
)

In [45]:
#df_Questions_final.select(col('Tags')).show(truncate=False)

In [46]:
# Split the tag string on "><" into an array of tag names.
# The pattern is written as plain "><": neither character needs escaping in the regex, and
# "\>" is an invalid escape sequence in a normal Python string literal.
# NOTE: this cell overwrites df_Questions_final and expects Tags to still be a string, so it
# should be run once, directly after the cell that strips the outer delimiters.
df_Questions_final = df_Questions_final.withColumn("Tags", split(df_Questions_final["Tags"], "><"))


In [47]:
# Show the Tags column untruncated to check the result of the split.
df_Questions_final.select(col('Tags')).show(truncate=False)

+-----------------------------------------------------------------------------------------+
|Tags                                                                                     |
+-----------------------------------------------------------------------------------------+
|[self-study, p-value, standard-deviation, least-squares, sums-of-squares]                |
|[self-study, variance, simulation, monte-carlo, numerical-integration]                   |
|[self-study, simulation, monte-carlo, importance-sampling]                               |
|[time-series, self-study, forecasting, arima]                                            |
|[probability, hypothesis-testing, self-study, normal-distribution, multivariate-analysis]|
|[hypothesis-testing, self-study, maximum-likelihood, inference, likelihood-ratio]        |
|[hypothesis-testing, self-study, inference, exponential-family]                          |
|[probability, self-study, normal-distribution, chi-squared-test, expected-value

In [48]:
# Rename the generic Id column to QuestionId.
df_Questions_final = df_Questions_final.withColumnRenamed(existing="Id", new="QuestionId")

#### Writing to a new file

In [49]:
# Write the cleaned questions as Parquet from a single partition, replacing any earlier output.
(
    df_Questions_final
    .coalesce(1)
    .write
    .parquet("SilverDataSet/Questions", mode="overwrite")
)

In [50]:
#df_Questions_final.write.mode("overwrite").parquet("D:/mada/programming/DataEngineering/Iti/GP/Transformation/SilverDataSet/Questions.parquet")


In [51]:
# Print the final question schema (renamed id, date columns, Tags as an array).
df_Questions_final.printSchema()

root
 |-- QuestionId: integer (nullable = true)
 |-- OwnerUserId: integer (nullable = true)
 |-- CreationDate: date (nullable = true)
 |-- LastActivityDate: date (nullable = true)
 |-- AcceptedAnswerId: integer (nullable = true)
 |-- Body: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Tags: array (nullable = true)
 |    |-- element: string (containsNull = false)
 |-- Score: integer (nullable = true)
 |-- ViewCount: integer (nullable = true)
 |-- AnswerCount: integer (nullable = true)
 |-- CommentCount: integer (nullable = true)



In [52]:
# Preview the first rows of the cleaned questions.
df_Questions_final.show()

+----------+-----------+------------+----------------+----------------+--------------------+--------------------+--------------------+-----+---------+-----------+------------+
|QuestionId|OwnerUserId|CreationDate|LastActivityDate|AcceptedAnswerId|                Body|               Title|                Tags|Score|ViewCount|AnswerCount|CommentCount|
+----------+-----------+------------+----------------+----------------+--------------------+--------------------+--------------------+-----+---------+-----------+------------+
|     95868|         -2|  2014-05-01|      2014-05-01|           95962|my attempt at mak...|Least Squares Fit...|[self-study, p-va...|    1|      135|          1|          19|
|    146732|         -2|  2015-04-16|      2015-04-17|          146762|i have some troub...|Variance reductio...|[self-study, vari...|    7|     1733|          1|           5|
|    151163|         -2|  2015-05-07|      2022-05-25|          151432|implement an esti...|Monte Carlo integ...|[self-s

## Working With Answers

In [53]:
# Preview the raw answer rows (all original post columns) before any cleaning.
df_Answers.show()

+----------------+-----------+--------------------+----------+------------+--------------------+--------------+--------------------+-------------+---+--------------------+--------------------+---------------------+----------------+----------------+-----------+--------+----------+-----+----+-----+---------+
|AcceptedAnswerId|AnswerCount|                Body|ClosedDate|CommentCount|  CommunityOwnedDate|ContentLicense|        CreationDate|FavoriteCount| Id|    LastActivityDate|        LastEditDate|LastEditorDisplayName|LastEditorUserId|OwnerDisplayName|OwnerUserId|ParentId|PostTypeId|Score|Tags|Title|ViewCount|
+----------------+-----------+--------------------+----------+------------+--------------------+--------------+--------------------+-------------+---+--------------------+--------------------+---------------------+----------------+----------------+-----------+--------+----------+-----+----+-----+---------+
|            NULL|       NULL|<p>The R-project<...|      NULL|           3|2

In [54]:
#print(spark.sparkContext.getConf().get("spark.jars"))


In [55]:
#df = spark.read.parquet("D:\mada\programming\DataEngineering\Iti\GP\Transformation\SilverDataSet\Questions")

In [56]:
#df.show()

In [57]:
# List the columns available on answers before choosing the subset to keep.
df_Answers.printSchema()

root
 |-- AcceptedAnswerId: integer (nullable = true)
 |-- AnswerCount: integer (nullable = true)
 |-- Body: string (nullable = true)
 |-- ClosedDate: timestamp (nullable = true)
 |-- CommentCount: integer (nullable = true)
 |-- CommunityOwnedDate: timestamp (nullable = true)
 |-- ContentLicense: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- FavoriteCount: integer (nullable = true)
 |-- Id: integer (nullable = true)
 |-- LastActivityDate: timestamp (nullable = true)
 |-- LastEditDate: timestamp (nullable = true)
 |-- LastEditorDisplayName: string (nullable = true)
 |-- LastEditorUserId: integer (nullable = true)
 |-- OwnerDisplayName: string (nullable = true)
 |-- OwnerUserId: integer (nullable = true)
 |-- ParentId: integer (nullable = true)
 |-- PostTypeId: integer (nullable = true)
 |-- Score: integer (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- ViewCount: integer (nullable = true)



In [58]:
# Keep only the columns that are carried forward for answers.
df_Answers_Filtered_cols = df_Answers.select(
    'Id',
    'ParentId',
    'OwnerUserId',
    'CreationDate',
    'LastActivityDate',
    'Body',
    'Score',
    'CommentCount',
)

In [59]:
# Preview the selected answer columns.
df_Answers_Filtered_cols.show()

+---+--------+-----------+--------------------+--------------------+--------------------+-----+------------+
| Id|ParentId|OwnerUserId|        CreationDate|    LastActivityDate|                Body|Score|CommentCount|
+---+--------+-----------+--------------------+--------------------+--------------------+-----+------------+
|  5|       3|         23|2010-07-19 22:14:...|2010-07-19 22:21:...|<p>The R-project<...|   90|           3|
|  9|       3|         50|2010-07-19 22:16:...|2010-07-19 22:16:...|<p><a href="http:...|   15|           3|
| 12|       7|          5|2010-07-19 22:18:...|2010-07-19 22:18:...|<p>See my respons...|   24|           1|
| 13|       6|         23|2010-07-19 22:18:...|2010-07-19 22:18:...|<p>Machine Learni...|   27|           6|
| 14|       3|         36|2010-07-19 22:19:...|2010-07-19 22:19:...|<p>I second that ...|    6|           1|
| 15|       1|          6|2010-07-19 22:19:...|2010-07-19 22:19:...|<p>John Cook give...|   24|           0|
| 16|       3|     

### Dealing with nulls

In [60]:
# One-row summary with the number of nulls in each selected answer column.
# `sum` here is the Spark aggregate brought in by the star import, not the Python builtin.
null_counts = df_Answers_Filtered_cols.select(
    *[
        sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
        for column_name in df_Answers_Filtered_cols.columns
    ]
)


In [61]:
# Display the per-column null counts for answers.
null_counts.show()

+---+--------+-----------+------------+----------------+----+-----+------------+
| Id|ParentId|OwnerUserId|CreationDate|LastActivityDate|Body|Score|CommentCount|
+---+--------+-----------+------------+----------------+----+-----+------------+
|  0|       0|       2528|           0|               0|   0|    0|           0|
+---+--------+-----------+------------+----------------+----+-----+------------+



In [62]:
# Replace a null OwnerUserId with the integer sentinel -2. An integer literal matches the
# IntegerType column, so the fill does not depend on an implicit string-to-integer cast.
df_Answers_Handling_Nulls = df_Answers_Filtered_cols.fillna({
    'OwnerUserId': -2,
})

In [63]:
# Recount nulls per column after the fill and display the result.
null_counts = df_Answers_Handling_Nulls.select(
    *[
        sum(when(col(column_name).isNull(), 1).otherwise(0)).alias(column_name)
        for column_name in df_Answers_Handling_Nulls.columns
    ]
)
null_counts.show()

+---+--------+-----------+------------+----------------+----+-----+------------+
| Id|ParentId|OwnerUserId|CreationDate|LastActivityDate|Body|Score|CommentCount|
+---+--------+-----------+------------+----------------+----+-----+------------+
|  0|       0|          0|           0|               0|   0|    0|           0|
+---+--------+-----------+------------+----------------+----+-----+------------+



### Handling Duplicates 

In [64]:
# (OwnerUserId, Body, ParentId) combinations that appear on more than one answer.
# NOTE(review): this result is only computed here; the later answer cells continue from
# df_Answers_Handling_Nulls without dropping duplicates - confirm that is intended.
duplicates = (
    df_Answers_Handling_Nulls
    .groupBy('OwnerUserId', 'Body', 'ParentId')
    .count()
    .where("count > 1")
)


In [65]:
#duplicates.show()
#duplicates.count()

### Converting DataTypes

In [66]:
# Print the answer schema before the date conversion (both date columns are still timestamps).
df_Answers_Handling_Nulls.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- ParentId: integer (nullable = true)
 |-- OwnerUserId: integer (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- LastActivityDate: timestamp (nullable = true)
 |-- Body: string (nullable = true)
 |-- Score: integer (nullable = true)
 |-- CommentCount: integer (nullable = true)



In [None]:
# Truncate both timestamp columns to calendar dates.
# No format pattern is passed: the columns are already TimestampType, and a pattern is only
# meaningful when parsing strings (in Spark's datetime patterns "DD" is day-of-year, not
# day-of-month, so "yyyy-MM-DD" would not describe these values anyway).
df_Answers_Date_only = (
    df_Answers_Handling_Nulls
    .withColumn("CreationDate", to_date(col("CreationDate")))
    .withColumn("LastActivityDate", to_date(col("LastActivityDate")))
)

In [68]:
# Confirm CreationDate and LastActivityDate are now date columns.
df_Answers_Date_only.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- ParentId: integer (nullable = true)
 |-- OwnerUserId: integer (nullable = true)
 |-- CreationDate: date (nullable = true)
 |-- LastActivityDate: date (nullable = true)
 |-- Body: string (nullable = true)
 |-- Score: integer (nullable = true)
 |-- CommentCount: integer (nullable = true)



### Handling Body (HTML to Text)

In [69]:
# Replace the HTML answer body with its plain-text content.
df_Answers_HTMLToText = df_Answers_Date_only.withColumn(
    "Body",
    html_to_text_udf(df_Answers_Date_only["Body"]),
)


In [70]:
# Normalise the plain-text body: lowercase it, drop every character that is not an ASCII
# letter, digit or whitespace, then collapse whitespace runs to a single space and trim.
# The whitespace collapse runs after the character filter so that removed punctuation
# (e.g. "foo - bar") cannot leave double spaces behind.
df_Answers_refined_text = df_Answers_HTMLToText.withColumn(
    "Body",
    trim(
        regexp_replace(
            regexp_replace(
                lower(col("Body")),
                "[^a-zA-Z0-9\\s]",
                ""
            ),
            "\\s+",
            " "
        )
    )
)

In [71]:
# Show the cleaned answer bodies untruncated to inspect the text normalisation.
df_Answers_refined_text.select('Body').show(truncate = False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [72]:
# Rename the generic Id column to AnswerId.
df_Answers_final = df_Answers_refined_text.withColumnRenamed(existing="Id", new="AnswerId")


In [73]:
# Write the cleaned answers as Parquet from a single partition, replacing any earlier output.
(
    df_Answers_final
    .coalesce(1)
    .write
    .parquet("SilverDataSet/Answers", mode="overwrite")
)

In [74]:
#spark.stop()