In [1]:
spark

In [2]:
sc

# Transformations:
## remove underscores before each column
## _UserDisplayName --> drop column
## nulls in _UserId --> replace with -2
## Date Column --> change to DateTime
## any id --> string to prevent aggregations?

In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder \
    .appName("XML Processing") \
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.16.0") \
    .getOrCreate()

In [54]:
df_Comments = spark.read.format("xml") \
    .option("rowTag", "row") \
    .load(r"/media/maya-emad/DISK/Graduation Project-Data Engineering/Data/Comments.xml")

                                                                                

In [55]:
df_Comments.show()

+---------------+--------------------+---+-------+------+--------------------+----------------+-------+
|_ContentLicense|       _CreationDate|_Id|_PostId|_Score|               _Text|_UserDisplayName|_UserId|
+---------------+--------------------+---+-------+------+--------------------+----------------+-------+
|   CC BY-SA 2.5|2010-07-19 22:15:...|  1|      3|     7|Could be a poster...|            NULL|     13|
|   CC BY-SA 2.5|2010-07-19 22:16:...|  2|      5|     0|Yes, R is nice- b...|            NULL|     13|
|   CC BY-SA 2.5|2010-07-19 22:18:...|  3|      9|     1|Again- why?  How ...|            NULL|     13|
|   CC BY-SA 2.5|2010-07-19 22:19:...|  4|      5|    11|It's mature, well...|            NULL|     37|
|   CC BY-SA 2.5|2010-07-19 22:22:...|  6|     14|    10|why ask the quest...|            NULL|     23|
|   CC BY-SA 2.5|2010-07-19 22:25:...|  7|     18|     1|also the US censu...|            NULL|     36|
|   CC BY-SA 2.5|2010-07-19 22:30:...|  9|     16|     1|Andrew 

In [56]:
df_Comments.printSchema()

root
 |-- _ContentLicense: string (nullable = true)
 |-- _CreationDate: timestamp (nullable = true)
 |-- _Id: long (nullable = true)
 |-- _PostId: long (nullable = true)
 |-- _Score: long (nullable = true)
 |-- _Text: string (nullable = true)
 |-- _UserDisplayName: string (nullable = true)
 |-- _UserId: long (nullable = true)



In [57]:
df_Comments.summary()

DataFrame[summary: string, _ContentLicense: string, _Id: string, _PostId: string, _Score: string, _Text: string, _UserDisplayName: string, _UserId: string]

In [58]:
df_Comments.toPandas().isnull().sum() / df_Comments.count() * 100

                                                                                

_ContentLicense      0.000000
_CreationDate        0.000000
_Id                  0.000000
_PostId              0.000000
_Score               0.000000
_Text                0.000000
_UserDisplayName    98.391386
_UserId              1.607733
dtype: float64

In [59]:
df_Comments.columns

['_ContentLicense',
 '_CreationDate',
 '_Id',
 '_PostId',
 '_Score',
 '_Text',
 '_UserDisplayName',
 '_UserId']

In [74]:
# remove underscores before each column
df_Comments = df_Comments \
    .withColumnRenamed("_ContentLicense", "ContentLicense") \
    .withColumnRenamed("_CreationDate", "CreationDate") \
    .withColumnRenamed("_Id", "Id") \
    .withColumnRenamed("_PostId", "PostId") \
    .withColumnRenamed("_Score", "Score") \
    .withColumnRenamed("_Text", "Text") \
    .withColumnRenamed("_UserDisplayName", "UserDisplayName") \
    .withColumnRenamed("_UserId", "UserId")

In [70]:
# Drop _UserDisplayName Column
df_Comments = df_Comments.drop('_UserDisplayName')

In [71]:
# Verify it is gone
df_Comments.columns

['ContentLicense', 'CreationDate', 'Id', 'PostId', 'Score', 'Text', 'UserId']

In [77]:
df_Comments = df_Comments.withColumn("CreationDate", to_timestamp("CreationDate"))

In [79]:
df_Comments.show()

+--------------+--------------------+---+------+-----+--------------------+------+
|ContentLicense|        CreationDate| Id|PostId|Score|                Text|UserId|
+--------------+--------------------+---+------+-----+--------------------+------+
|  CC BY-SA 2.5|2010-07-19 22:15:...|  1|     3|    7|Could be a poster...|    13|
|  CC BY-SA 2.5|2010-07-19 22:16:...|  2|     5|    0|Yes, R is nice- b...|    13|
|  CC BY-SA 2.5|2010-07-19 22:18:...|  3|     9|    1|Again- why?  How ...|    13|
|  CC BY-SA 2.5|2010-07-19 22:19:...|  4|     5|   11|It's mature, well...|    37|
|  CC BY-SA 2.5|2010-07-19 22:22:...|  6|    14|   10|why ask the quest...|    23|
|  CC BY-SA 2.5|2010-07-19 22:25:...|  7|    18|    1|also the US censu...|    36|
|  CC BY-SA 2.5|2010-07-19 22:30:...|  9|    16|    1|Andrew Gelman has...|    78|
|  CC BY-SA 2.5|2010-07-19 22:31:...| 10|    23|    8|I am not sure I u...|  NULL|
|  CC BY-SA 2.5|2010-07-19 22:34:...| 11|    43|    5|There are many R ...|     5|
|  C

In [80]:
# replace nulls in UserId with -2
df_Comments = df_Comments.fillna({"UserId": -2})

In [81]:
df_Comments.show()

+--------------+--------------------+---+------+-----+--------------------+------+
|ContentLicense|        CreationDate| Id|PostId|Score|                Text|UserId|
+--------------+--------------------+---+------+-----+--------------------+------+
|  CC BY-SA 2.5|2010-07-19 22:15:...|  1|     3|    7|Could be a poster...|    13|
|  CC BY-SA 2.5|2010-07-19 22:16:...|  2|     5|    0|Yes, R is nice- b...|    13|
|  CC BY-SA 2.5|2010-07-19 22:18:...|  3|     9|    1|Again- why?  How ...|    13|
|  CC BY-SA 2.5|2010-07-19 22:19:...|  4|     5|   11|It's mature, well...|    37|
|  CC BY-SA 2.5|2010-07-19 22:22:...|  6|    14|   10|why ask the quest...|    23|
|  CC BY-SA 2.5|2010-07-19 22:25:...|  7|    18|    1|also the US censu...|    36|
|  CC BY-SA 2.5|2010-07-19 22:30:...|  9|    16|    1|Andrew Gelman has...|    78|
|  CC BY-SA 2.5|2010-07-19 22:31:...| 10|    23|    8|I am not sure I u...|    -2|
|  CC BY-SA 2.5|2010-07-19 22:34:...| 11|    43|    5|There are many R ...|     5|
|  C

In [82]:
# any id --> string to prevent aggregations
df_Comments = df_Comments.withColumn("Id", col("Id").cast("string")) \
                         .withColumn("UserId", col("UserId").cast("string")) \
                         .withColumn("PostId", col("PostId").cast("string"))

In [83]:
df_Comments.printSchema()

root
 |-- ContentLicense: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- Id: string (nullable = true)
 |-- PostId: string (nullable = true)
 |-- Score: long (nullable = true)
 |-- Text: string (nullable = true)
 |-- UserId: string (nullable = false)



In [87]:
df_Comments.select("Score").distinct().show()



+-----+
|Score|
+-----+
|   26|
|   29|
|   19|
|   54|
|    0|
|   22|
|    7|
|   34|
|   57|
|   43|
|   32|
|   31|
|  119|
|   39|
|  116|
|   25|
|    6|
|   68|
|   72|
|   58|
+-----+
only showing top 20 rows



                                                                                

In [92]:
# store Score Column as 32-int datatype to save memory as Score values are usually small
df_Comments = df_Comments.withColumn("Score", col("Score").cast('int'))

In [93]:
df_Comments.printSchema()

root
 |-- ContentLicense: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- Id: string (nullable = true)
 |-- PostId: string (nullable = true)
 |-- Score: integer (nullable = true)
 |-- Text: string (nullable = true)
 |-- UserId: string (nullable = false)



In [94]:
df_Comments.show()

+--------------+--------------------+---+------+-----+--------------------+------+
|ContentLicense|        CreationDate| Id|PostId|Score|                Text|UserId|
+--------------+--------------------+---+------+-----+--------------------+------+
|  CC BY-SA 2.5|2010-07-19 22:15:...|  1|     3|    7|Could be a poster...|    13|
|  CC BY-SA 2.5|2010-07-19 22:16:...|  2|     5|    0|Yes, R is nice- b...|    13|
|  CC BY-SA 2.5|2010-07-19 22:18:...|  3|     9|    1|Again- why?  How ...|    13|
|  CC BY-SA 2.5|2010-07-19 22:19:...|  4|     5|   11|It's mature, well...|    37|
|  CC BY-SA 2.5|2010-07-19 22:22:...|  6|    14|   10|why ask the quest...|    23|
|  CC BY-SA 2.5|2010-07-19 22:25:...|  7|    18|    1|also the US censu...|    36|
|  CC BY-SA 2.5|2010-07-19 22:30:...|  9|    16|    1|Andrew Gelman has...|    78|
|  CC BY-SA 2.5|2010-07-19 22:31:...| 10|    23|    8|I am not sure I u...|    -2|
|  CC BY-SA 2.5|2010-07-19 22:34:...| 11|    43|    5|There are many R ...|     5|
|  C

In [95]:
df_Comments.toPandas().isnull().sum() / df_Comments.count() * 100

                                                                                

ContentLicense    0.0
CreationDate      0.0
Id                0.0
PostId            0.0
Score             0.0
Text              0.0
UserId            0.0
dtype: float64