In [None]:
#import findspark

In [None]:
#findspark.init()

In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [4]:
import sys , os
!{sys.executable} -m pip install beautifulsoup4



In [5]:
# Point both the PySpark workers and the driver at the interpreter that is
# running this kernel.
os.environ.update(
    dict.fromkeys(("PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"), sys.executable)
)

In [6]:
# Hadoop settings for the local Spark installation.
# NOTE(review): absolute, machine-specific paths — adjust when running elsewhere.
os.environ.update(
    {
        "HADOOP_HOME": "C:/Spark/spark-3.5.5-bin-hadoop3",
        "HADOOP_OPTS": "-Djava.library.path=C:/Spark/spark-3.5.5-bin-hadoop3/bin",
    }
)

In [7]:
# Build (or reuse) the Spark session used by every cell below.
# The spark-xml package requested here is what makes the "xml" read format
# available when the users file is loaded.
spark = (
    SparkSession.builder
    .appName("Users_Spark_Cleansing")
    .config("spark.executor.memory", "3g")
    .config(
        "spark.jars.packages",
        "com.databricks:spark-xml_2.12:0.16.0,org.apache.parquet:parquet-hadoop:1.15.1",
    )
    # Use the kernel's interpreter for both the workers and the driver.
    .config("spark.pyspark.python", sys.executable)
    .config("spark.pyspark.driver.python", sys.executable)
    .config("spark.hadoop.io.native.lib.available", "false")
    .getOrCreate()
)

In [8]:
# Load the users file; "row" is the XML tag that delimits one record.
users_reader = spark.read.format("xml").option("rowTag", "row")
Users_df = users_reader.load("Dataset/Users.xml")

# Preview the loaded rows.
Users_df.show()


+--------------------+----------+--------------------+------------------+----------+---+--------------------+--------------------+-----------+--------+------+--------------------+
|            _AboutMe|_AccountId|       _CreationDate|      _DisplayName|_DownVotes|_Id|     _LastAccessDate|           _Location|_Reputation|_UpVotes|_Views|         _WebsiteUrl|
+--------------------+----------+--------------------+------------------+----------+---+--------------------+--------------------+-----------+--------+------+--------------------+
|<p>Hi, I'm not re...|        -1|2010-07-19 09:55:...|         Community|      9720| -1|2010-07-19 09:55:...|  on the server farm|          1|   17823|  3131|http://meta.stack...|
|<p>Dev #2 who hel...|         2|2010-07-19 17:01:...|      Geoff Dalgas|         0|  2|2019-02-08 00:01:...|       Corvallis, OR|        101|       3|    49|http://stackoverf...|
|<p>Former <a href...|         3|2010-07-19 18:34:...|      Jarrod Dixon|         0|  3|2019-02-07 1

In [9]:
# Percentage of null values in each column.
from pyspark.sql.functions import col, count, when

# DataFrame.count() is a Spark action: evaluate it once here instead of once
# per column inside the comprehension below.
total_rows = Users_df.count()

# when(...) without otherwise() yields null for non-matching rows, and count()
# skips nulls, so each expression counts only the rows where the column is null.
null_perc = Users_df.select(
    [
        (count(when(col(c).isNull(), c)) / total_rows * 100).alias(c)
        for c in Users_df.columns
    ]
)

In [10]:
# Display the per-column null percentages.
null_perc.show()

+-----------------+-------------------+-------------+------------+----------+---+---------------+----------------+-----------+--------+------+-----------------+
|         _AboutMe|         _AccountId|_CreationDate|_DisplayName|_DownVotes|_Id|_LastAccessDate|       _Location|_Reputation|_UpVotes|_Views|      _WebsiteUrl|
+-----------------+-------------------+-------------+------------+----------+---+---------------+----------------+-----------+--------+------+-----------------+
|79.84055556887088|0.00479351916209285|          0.0|         0.0|       0.0|0.0|            0.0|74.0907293339405|        0.0|     0.0|   0.0|87.49640486062843|
+-----------------+-------------------+-------------+------------+----------+---+---------------+----------------+-----------+--------+------+-----------------+



In [11]:
# Drop the three mostly-null columns (roughly 74-87% null in the percentages
# shown above) together with _AccountId.
# NOTE(review): _AccountId is almost never null — presumably dropped because it
# is not needed downstream; confirm.
# "_WebsiteUrl" is spelled exactly as in the loaded schema so the drop does not
# depend on Spark's case-insensitive column resolution (drop() silently ignores
# names it cannot resolve).
Users_df = Users_df.drop("_Location", "_AccountId", "_AboutMe", "_WebsiteUrl")
Users_df.show()


+--------------------+------------------+----------+---+--------------------+-----------+--------+------+
|       _CreationDate|      _DisplayName|_DownVotes|_Id|     _LastAccessDate|_Reputation|_UpVotes|_Views|
+--------------------+------------------+----------+---+--------------------+-----------+--------+------+
|2010-07-19 09:55:...|         Community|      9720| -1|2010-07-19 09:55:...|          1|   17823|  3131|
|2010-07-19 17:01:...|      Geoff Dalgas|         0|  2|2019-02-08 00:01:...|        101|       3|    49|
|2010-07-19 18:34:...|      Jarrod Dixon|         0|  3|2019-02-07 18:22:...|        101|      23|    50|
|2010-07-19 22:03:...|            Emmett|         0|  4|2016-11-24 21:37:...|        101|       0|    24|
|2010-07-19 22:03:...|             Shane|         5|  5|2022-12-07 21:30:...|      12151|     684|  2101|
|2010-07-19 22:04:...|            Harlan|         0|  6|2022-07-14 19:07:...|        842|      65|   217|
|2010-07-19 22:04:...|             Vince|     

In [12]:
# Rename the columns: strip the leading underscores from every column name,
# then give the Id column its final name.
stripped_names = [name.lstrip("_") for name in Users_df.columns]
Users_df = Users_df.toDF(*stripped_names).withColumnRenamed("Id", "UsersId_BK")



In [13]:
# Check whether there are duplicate rows: group on every column and keep only
# the groups that occur more than once.
rows_per_group = Users_df.groupBy(Users_df.columns).count()
duplicate_count = rows_per_group.filter(col("count") > 1)
duplicate_count.show()


+------------+-----------+---------+----------+--------------+----------+-------+-----+-----+
|CreationDate|DisplayName|DownVotes|UsersId_BK|LastAccessDate|Reputation|UpVotes|Views|count|
+------------+-----------+---------+----------+--------------+----------+-------+-----+-----+
+------------+-----------+---------+----------+--------------+----------+-------+-----+-----+



In [14]:
# Inspect the current column types.
Users_df.printSchema()


root
 |-- CreationDate: timestamp (nullable = true)
 |-- DisplayName: string (nullable = true)
 |-- DownVotes: long (nullable = true)
 |-- UsersId_BK: long (nullable = true)
 |-- LastAccessDate: timestamp (nullable = true)
 |-- Reputation: long (nullable = true)
 |-- UpVotes: long (nullable = true)
 |-- Views: long (nullable = true)



In [15]:
# Convert column data types.
from pyspark.sql.functions import to_date, col
from pyspark.sql.types import IntegerType

# Timestamp columns -> date.
date_columns = ["CreationDate", "LastAccessDate"]
for date_col in date_columns:
    Users_df = Users_df.withColumn(date_col, to_date(col(date_col), "yyyy-MM-dd"))

# Long columns -> integer.
# NOTE(review): this narrows long to int; assumes every value fits in
# 32 bits — confirm for the id column.
columns_to_convert = ["DownVotes", "UpVotes", "Views","Reputation","UsersId_BK"]
for col_name in columns_to_convert:
    as_integer = col(col_name).cast(IntegerType())
    Users_df = Users_df.withColumn(col_name, as_integer)

# Show the schema after the conversions.
Users_df.printSchema()


root
 |-- CreationDate: date (nullable = true)
 |-- DisplayName: string (nullable = true)
 |-- DownVotes: integer (nullable = true)
 |-- UsersId_BK: integer (nullable = true)
 |-- LastAccessDate: date (nullable = true)
 |-- Reputation: integer (nullable = true)
 |-- UpVotes: integer (nullable = true)
 |-- Views: integer (nullable = true)



In [16]:
# Persist the cleaned users table as Parquet, replacing any previous output.
(
    Users_df.write
    .mode("overwrite")
    .parquet("SilverDataSet/Users")
)