In [1]:
spark

In [2]:
sc

# Transformations:
### remove underscores before each column
### UserDisplayName --> drop column
### nulls in _UserId --> replace with -2
### Date Column --> change date format to YY-MM-DD
### any id --> string to prevent aggregations?
### store Score column as 32-int datatype to save memory as Score values are usually small

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, MapType

spark = SparkSession.builder \
    .appName("XML Processing") \
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.16.0") \
    .getOrCreate()

25/04/01 22:11:57 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
schema = StructType([
    StructField("_Id", StringType(), False),
    StructField("_PostId", StringType(), True),
    StructField("_UserId", StringType(), True),
    StructField("_UserDisplayName", StringType(), True),
    StructField("_Score", IntegerType(), True),
    StructField("_Text", StringType(), True),
    StructField("_CreationDate", TimestampType(), True),
    StructField("_ContentLicense", StringType(), True)
])

In [5]:
df_Comments = spark.read.format("xml") \
    .option("rowTag", "row") \
    .schema(schema) \
    .load(r"Dataset/Comments.xml")

In [6]:
df_Comments.printSchema()

root
 |-- _Id: string (nullable = false)
 |-- _PostId: string (nullable = true)
 |-- _UserId: string (nullable = true)
 |-- _UserDisplayName: string (nullable = true)
 |-- _Score: integer (nullable = true)
 |-- _Text: string (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _ContentLicense: string (nullable = true)



In [7]:
df_Comments.show()

[Stage 0:>                                                          (0 + 1) / 1]

+---+-------+-------+----------------+------+--------------------+--------------------+---------------+
|_Id|_PostId|_UserId|_UserDisplayName|_Score|               _Text|       _CreationDate|_ContentLicense|
+---+-------+-------+----------------+------+--------------------+--------------------+---------------+
|  1|      3|     13|            NULL|     7|Could be a poster...|2010-07-19 22:15:...|   CC BY-SA 2.5|
|  2|      5|     13|            NULL|     0|Yes, R is nice- b...|2010-07-19 22:16:...|   CC BY-SA 2.5|
|  3|      9|     13|            NULL|     1|Again- why?  How ...|2010-07-19 22:18:...|   CC BY-SA 2.5|
|  4|      5|     37|            NULL|    11|It's mature, well...|2010-07-19 22:19:...|   CC BY-SA 2.5|
|  6|     14|     23|            NULL|    10|why ask the quest...|2010-07-19 22:22:...|   CC BY-SA 2.5|
|  7|     18|     36|            NULL|     1|also the US censu...|2010-07-19 22:25:...|   CC BY-SA 2.5|
|  9|     16|     78|            NULL|     1|Andrew Gelman has..

                                                                                

In [8]:
df_Comments.summary()

DataFrame[summary: string, _Id: string, _PostId: string, _UserId: string, _UserDisplayName: string, _Score: string, _Text: string, _ContentLicense: string]

In [9]:
df_Comments.toPandas().isnull().sum() / df_Comments.count() * 100

                                                                                

_Id                  0.000000
_PostId              0.000000
_UserId              1.607733
_UserDisplayName    98.391386
_Score               0.000000
_Text                0.000000
_CreationDate        0.000000
_ContentLicense      0.000000
dtype: float64

In [10]:
df_Comments.columns

['_Id',
 '_PostId',
 '_UserId',
 '_UserDisplayName',
 '_Score',
 '_Text',
 '_CreationDate',
 '_ContentLicense']

### remove underscores before each column

In [11]:
df_Comments = df_Comments \
    .withColumnRenamed("_ContentLicense", "ContentLicense") \
    .withColumnRenamed("_CreationDate", "CreationDate") \
    .withColumnRenamed("_Id", "Id") \
    .withColumnRenamed("_PostId", "PostId") \
    .withColumnRenamed("_Score", "Score") \
    .withColumnRenamed("_Text", "Text") \
    .withColumnRenamed("_UserDisplayName", "UserDisplayName") \
    .withColumnRenamed("_UserId", "UserId")

### drop _UserDisplayName Column

In [12]:
df_Comments = df_Comments.drop('UserDisplayName')

### verify it is gone

In [13]:
df_Comments.columns

['Id', 'PostId', 'UserId', 'Score', 'Text', 'CreationDate', 'ContentLicense']

### change Date Format to YY-MM-DD

In [14]:
df_Comments = df_Comments.withColumn("CreationDate", date_format("CreationDate", "yy-MM-dd"))

In [15]:
df_Comments.show()

+---+------+------+-----+--------------------+------------+--------------+
| Id|PostId|UserId|Score|                Text|CreationDate|ContentLicense|
+---+------+------+-----+--------------------+------------+--------------+
|  1|     3|    13|    7|Could be a poster...|    10-07-19|  CC BY-SA 2.5|
|  2|     5|    13|    0|Yes, R is nice- b...|    10-07-19|  CC BY-SA 2.5|
|  3|     9|    13|    1|Again- why?  How ...|    10-07-19|  CC BY-SA 2.5|
|  4|     5|    37|   11|It's mature, well...|    10-07-19|  CC BY-SA 2.5|
|  6|    14|    23|   10|why ask the quest...|    10-07-19|  CC BY-SA 2.5|
|  7|    18|    36|    1|also the US censu...|    10-07-19|  CC BY-SA 2.5|
|  9|    16|    78|    1|Andrew Gelman has...|    10-07-19|  CC BY-SA 2.5|
| 10|    23|  NULL|    8|I am not sure I u...|    10-07-19|  CC BY-SA 2.5|
| 11|    43|     5|    5|There are many R ...|    10-07-19|  CC BY-SA 2.5|
| 12|    38|    54|    0|That's just an ex...|    10-07-19|  CC BY-SA 2.5|
| 13|    20|    24|    2|

### replace nulls in UserId with -2

In [16]:
df_Comments = df_Comments.fillna({"UserId": -2})

In [17]:
df_Comments.show()

+---+------+------+-----+--------------------+------------+--------------+
| Id|PostId|UserId|Score|                Text|CreationDate|ContentLicense|
+---+------+------+-----+--------------------+------------+--------------+
|  1|     3|    13|    7|Could be a poster...|    10-07-19|  CC BY-SA 2.5|
|  2|     5|    13|    0|Yes, R is nice- b...|    10-07-19|  CC BY-SA 2.5|
|  3|     9|    13|    1|Again- why?  How ...|    10-07-19|  CC BY-SA 2.5|
|  4|     5|    37|   11|It's mature, well...|    10-07-19|  CC BY-SA 2.5|
|  6|    14|    23|   10|why ask the quest...|    10-07-19|  CC BY-SA 2.5|
|  7|    18|    36|    1|also the US censu...|    10-07-19|  CC BY-SA 2.5|
|  9|    16|    78|    1|Andrew Gelman has...|    10-07-19|  CC BY-SA 2.5|
| 10|    23|    -2|    8|I am not sure I u...|    10-07-19|  CC BY-SA 2.5|
| 11|    43|     5|    5|There are many R ...|    10-07-19|  CC BY-SA 2.5|
| 12|    38|    54|    0|That's just an ex...|    10-07-19|  CC BY-SA 2.5|
| 13|    20|    24|    2|

### any id --> string to prevent aggregations

In [18]:
df_Comments = df_Comments.withColumn("Id", col("Id").cast("string")) \
                         .withColumn("UserId", col("UserId").cast("string")) \
                         .withColumn("PostId", col("PostId").cast("string"))

In [19]:
df_Comments.printSchema()

root
 |-- Id: string (nullable = false)
 |-- PostId: string (nullable = true)
 |-- UserId: string (nullable = false)
 |-- Score: integer (nullable = true)
 |-- Text: string (nullable = true)
 |-- CreationDate: string (nullable = true)
 |-- ContentLicense: string (nullable = true)



In [20]:
df_Comments.show()

+---+------+------+-----+--------------------+------------+--------------+
| Id|PostId|UserId|Score|                Text|CreationDate|ContentLicense|
+---+------+------+-----+--------------------+------------+--------------+
|  1|     3|    13|    7|Could be a poster...|    10-07-19|  CC BY-SA 2.5|
|  2|     5|    13|    0|Yes, R is nice- b...|    10-07-19|  CC BY-SA 2.5|
|  3|     9|    13|    1|Again- why?  How ...|    10-07-19|  CC BY-SA 2.5|
|  4|     5|    37|   11|It's mature, well...|    10-07-19|  CC BY-SA 2.5|
|  6|    14|    23|   10|why ask the quest...|    10-07-19|  CC BY-SA 2.5|
|  7|    18|    36|    1|also the US censu...|    10-07-19|  CC BY-SA 2.5|
|  9|    16|    78|    1|Andrew Gelman has...|    10-07-19|  CC BY-SA 2.5|
| 10|    23|    -2|    8|I am not sure I u...|    10-07-19|  CC BY-SA 2.5|
| 11|    43|     5|    5|There are many R ...|    10-07-19|  CC BY-SA 2.5|
| 12|    38|    54|    0|That's just an ex...|    10-07-19|  CC BY-SA 2.5|
| 13|    20|    24|    2|

In [21]:
df_Comments.toPandas().isnull().sum() / df_Comments.count() * 100

                                                                                

Id                0.0
PostId            0.0
UserId            0.0
Score             0.0
Text              0.0
CreationDate      0.0
ContentLicense    0.0
dtype: float64

In [23]:
df_Comments.coalesce(1).write.mode("overwrite").parquet("Dataset/SilverDataset/Comments")

                                                                                

In [24]:
df_Comments.write.mode("overwrite").parquet("Dataset/SilverDataset/CommentsPartitions")

25/04/01 22:16:05 WARN MemoryManager: Total allocation exceeds 95.00% (1,019,058,573 bytes) of heap memory
Scaling row group sizes to 94.91% for 8 writers
25/04/01 22:16:05 WARN MemoryManager: Total allocation exceeds 95.00% (1,019,058,573 bytes) of heap memory
Scaling row group sizes to 84.36% for 9 writers
25/04/01 22:16:15 WARN MemoryManager: Total allocation exceeds 95.00% (1,019,058,573 bytes) of heap memory
Scaling row group sizes to 94.91% for 8 writers
                                                                                

### Checking for non textual Comments --> nothing seems wrong

In [174]:
df_Comments.withColumn("Is_JSON", from_json(col("Text"), MapType(StringType(), StringType())).isNotNull()) \
    .select("Text", "Is_JSON") \
    .show(10, False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|Text                                                                                                                                                                                       |Is_JSON|
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+
|Could be a poster child for argumentative and subjective.  At the least, need to define 'valuable'.                                                                                        |false  |
|Yes, R is nice- but WHY is it 'valuable'.                                                                                                                                                  |false  |
|Again- wh

In [177]:
df_json = df_Comments.filter(from_json(col("Text"), MapType(StringType(), StringType())).isNotNull())
df_json.show(10,truncate=False)



+---+------+------+-----+----+------------+--------------+
|Id |PostId|UserId|Score|Text|CreationDate|ContentLicense|
+---+------+------+-----+----+------------+--------------+
+---+------+------+-----+----+------------+--------------+



                                                                                

In [181]:
df_csv = df_Comments.filter(
    col("Text").rlike(r'[^,]+,[^,]+')
)
df_xml.select(col("Text")).show(truncate=False)

[Stage 162:>                                                        (0 + 1) / 1]

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Text                                                                                                                                                                                                                                                                                                                                                                                                        

                                                                                

In [182]:
df_yaml = df_Comments.filter(
    col("Text").rlike(r'^\s*\w+:\s*\w+')
)
df_yaml.select(col("Text")).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Text                                                                                                                                                                                                                                                                                                                                                                                                           

In [183]:
df_ini = df_Comments.filter(
    col("Text").rlike(r'^\s*\[.*\]\s*$') & col("Text").rlike(r'^\s*\w+\s*=\s*\w+')
)
df_ini.select(col("Text")).show(truncate=False)

[Stage 166:>                                                        (0 + 4) / 4]

+----+
|Text|
+----+
+----+



                                                                                