In [20]:
import findspark

In [21]:
# Runs before the `import pyspark` cell that follows.
# NOTE(review): per the findspark README, init() locates the local Spark
# installation and puts its pyspark on sys.path — confirm SPARK_HOME is set.
findspark.init()

In [22]:
import pyspark

In [23]:
from pyspark.sql import SparkSession

In [24]:
# Create (or reuse) the notebook's Spark session. The spark-xml coordinates
# are passed through spark.jars.packages; the "xml" reader used later in this
# notebook presumably comes from that package.
spark = (
    SparkSession.builder
    .appName("XML Reader")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.17.0")
    .getOrCreate()
)

In [25]:
# Read badges.xml with the "xml" data source, using "row" as the row tag,
# then preview the resulting DataFrame.
badges_df = (
    spark.read.format("xml")
    .option("rowTag", "row")
    .load("badges.xml")
)

badges_df.show()


+------+--------------------+---+-------+---------+-------+
|_Class|               _Date|_Id|  _Name|_TagBased|_UserId|
+------+--------------------+---+-------+---------+-------+
|     3|2010-07-19 22:39:...|  1|Teacher|    false|      5|
|     3|2010-07-19 22:39:...|  2|Teacher|    false|      6|
|     3|2010-07-19 22:39:...|  3|Teacher|    false|      8|
|     3|2010-07-19 22:39:...|  4|Teacher|    false|     23|
|     3|2010-07-19 22:39:...|  5|Teacher|    false|     36|
|     3|2010-07-19 22:39:...|  6|Teacher|    false|     37|
|     3|2010-07-19 22:39:...|  7|Teacher|    false|     50|
|     3|2010-07-19 22:39:...|  8|Teacher|    false|     55|
|     3|2010-07-19 22:39:...|  9|Student|    false|      5|
|     3|2010-07-19 22:39:...| 10|Student|    false|      8|
|     3| 2010-07-19 22:39:08| 11|Student|    false|     13|
|     3|2010-07-19 22:39:...| 12|Student|    false|     18|
|     3|2010-07-19 22:39:...| 13|Student|    false|     23|
|     3|2010-07-19 22:39:...| 14|Student

In [71]:
# Percentage of NULL values in each column of badges_df.
from pyspark.sql.functions import col, count, when

# DataFrame.count() is an action: every call runs a Spark job over the data.
# Evaluate it once here instead of once per column inside the comprehension.
total_rows = badges_df.count()

null_perc = badges_df.select([
    # when() without otherwise() is NULL for non-matching rows and count()
    # ignores NULLs, so the inner count is the number of NULL cells in column c.
    ((count(when(col(c).isNull(), c)) / total_rows) * 100).alias(c)
    for c in badges_df.columns
])

In [72]:
# Display the one-row frame of per-column NULL percentages held in null_perc.
null_perc.show()

+------+-----+---+-----+---------+-------+
|_Class|_Date|_Id|_Name|_TagBased|_UserId|
+------+-----+---+-----+---------+-------+
|   0.0|  0.0|0.0|  0.0|      0.0|    0.0|
+------+-----+---+-----+---------+-------+



In [26]:
# Derive the badge dimension from badges_df by dropping the _TagBased, _Date
# and _UserId columns; every other column is kept as-is.
Badge_Disc_Dim = badges_df.drop("_TagBased", "_Date", "_UserId")
Badge_Disc_Dim.show()


+------+---+-------+
|_Class|_Id|  _Name|
+------+---+-------+
|     3|  1|Teacher|
|     3|  2|Teacher|
|     3|  3|Teacher|
|     3|  4|Teacher|
|     3|  5|Teacher|
|     3|  6|Teacher|
|     3|  7|Teacher|
|     3|  8|Teacher|
|     3|  9|Student|
|     3| 10|Student|
|     3| 11|Student|
|     3| 12|Student|
|     3| 13|Student|
|     3| 14|Student|
|     3| 16|Student|
|     3| 17|Student|
|     3| 18|Student|
|     3| 19|Student|
|     3| 20| Editor|
|     3| 21| Editor|
+------+---+-------+
only showing top 20 rows



In [27]:
# Rename columns: strip the leading "_" from every column name in one
# positional pass, then give the Id column its business-key name.
Badge_Disc_Dim = Badge_Disc_Dim.toDF(
    *[name.lstrip("_") for name in Badge_Disc_Dim.columns]
)

Badge_Disc_Dim = Badge_Disc_Dim.withColumnRenamed("Id", "Badge_Disc_BK")
Badge_Disc_Dim.show()

+-----+-------------+-------+
|Class|Badge_Disc_BK|   Name|
+-----+-------------+-------+
|    3|            1|Teacher|
|    3|            2|Teacher|
|    3|            3|Teacher|
|    3|            4|Teacher|
|    3|            5|Teacher|
|    3|            6|Teacher|
|    3|            7|Teacher|
|    3|            8|Teacher|
|    3|            9|Student|
|    3|           10|Student|
|    3|           11|Student|
|    3|           12|Student|
|    3|           13|Student|
|    3|           14|Student|
|    3|           16|Student|
|    3|           17|Student|
|    3|           18|Student|
|    3|           19|Student|
|    3|           20| Editor|
|    3|           21| Editor|
+-----+-------------+-------+
only showing top 20 rows



In [10]:
from pyspark.sql.functions import col, date_format, to_timestamp, to_date

# Reduce _Date to a "yyyy-MM-dd" string.
#
# This cell rebinds badges_df, so it must give the same result when re-run.
# Parsing with to_timestamp(_Date, "yyyy-MM-dd HH:mm:ss") does not: after the
# first run _Date is already a "yyyy-MM-dd" string, which that pattern cannot
# parse, so a second run turns every date into NULL. cast("timestamp") leaves
# a timestamp column untouched and also accepts the already-formatted string.
badges_df = badges_df.withColumn(
    "_Date",
    date_format(col("_Date").cast("timestamp"), "yyyy-MM-dd"),
)
badges_df.show()



+------+----------+---+-------+---------+-------+
|_Class|     _Date|_Id|  _Name|_TagBased|_UserId|
+------+----------+---+-------+---------+-------+
|     3|2010-07-19|  1|Teacher|    false|      5|
|     3|2010-07-19|  2|Teacher|    false|      6|
|     3|2010-07-19|  3|Teacher|    false|      8|
|     3|2010-07-19|  4|Teacher|    false|     23|
|     3|2010-07-19|  5|Teacher|    false|     36|
|     3|2010-07-19|  6|Teacher|    false|     37|
|     3|2010-07-19|  7|Teacher|    false|     50|
|     3|2010-07-19|  8|Teacher|    false|     55|
|     3|2010-07-19|  9|Student|    false|      5|
|     3|2010-07-19| 10|Student|    false|      8|
|     3|2010-07-19| 11|Student|    false|     13|
|     3|2010-07-19| 12|Student|    false|     18|
|     3|2010-07-19| 13|Student|    false|     23|
|     3|2010-07-19| 14|Student|    false|     24|
|     3|2010-07-19| 16|Student|    false|     59|
|     3|2010-07-19| 17|Student|    false|     66|
|     3|2010-07-19| 18|Student|    false|     69|


In [11]:
from pyspark.sql.functions import col, to_date

# Rename the raw columns to their fact-table names and convert the
# "yyyy-MM-dd" date column to a date type, all in one chain.
badges_df = (
    badges_df
    .withColumnRenamed("_Date", "date")
    .withColumnRenamed("_Id", "Badge_Disc_fk")
    .withColumnRenamed("_UserId", "User_fk")
    .withColumnRenamed("_TagBased", "TagBased")
    .withColumn("date", to_date(col("date"), "yyyy-MM-dd"))
)

# The fact table keeps only the date, the two keys and the TagBased flag.
badges_fact = badges_df.select(["date", "Badge_Disc_fk", "TagBased", "User_fk"])

badges_fact.show()


+----------+-------------+--------+-------+
|      date|Badge_Disc_fk|TagBased|User_fk|
+----------+-------------+--------+-------+
|2010-07-19|            1|   false|      5|
|2010-07-19|            2|   false|      6|
|2010-07-19|            3|   false|      8|
|2010-07-19|            4|   false|     23|
|2010-07-19|            5|   false|     36|
|2010-07-19|            6|   false|     37|
|2010-07-19|            7|   false|     50|
|2010-07-19|            8|   false|     55|
|2010-07-19|            9|   false|      5|
|2010-07-19|           10|   false|      8|
|2010-07-19|           11|   false|     13|
|2010-07-19|           12|   false|     18|
|2010-07-19|           13|   false|     23|
|2010-07-19|           14|   false|     24|
|2010-07-19|           16|   false|     59|
|2010-07-19|           17|   false|     66|
|2010-07-19|           18|   false|     69|
|2010-07-19|           19|   false|     75|
|2010-07-19|           20|   false|     13|
|2010-07-19|           21|   fal

In [12]:
# Print the column names, types and nullability of badges_fact.
badges_fact.printSchema()


root
 |-- date: date (nullable = true)
 |-- Badge_Disc_fk: long (nullable = true)
 |-- TagBased: boolean (nullable = true)
 |-- User_fk: long (nullable = true)



In [17]:
# Persist the fact table as parquet under output_path (relative to the
# kernel's working directory). "overwrite" mode replaces whatever already
# exists at that path.
output_path = "Downloads"
badges_fact.write.mode("overwrite").parquet(output_path)


Py4JJavaError: An error occurred while calling o77.parquet.
: org.apache.hadoop.mapreduce.lib.input.InvalidInputException: Input path does not exist: file:/d:/a trainning iti/GP/stats.stackexchange.com/badges.xml
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:340)
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:279)
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getSplits(FileInputFormat.java:404)
	at org.apache.spark.rdd.NewHadoopRDD.getPartitions(NewHadoopRDD.scala:138)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:294)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:290)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:294)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:290)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:294)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:290)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:294)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:290)
	at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:49)
	at org.apache.spark.rdd.RDD.$anonfun$partitions$2(RDD.scala:294)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.rdd.RDD.partitions(RDD.scala:290)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.doExecuteWrite(WriteFiles.scala:77)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeWrite$1(SparkPlan.scala:235)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:246)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:243)
	at org.apache.spark.sql.execution.SparkPlan.executeWrite(SparkPlan.scala:231)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$4(FileFormatWriter.scala:305)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:271)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:802)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.io.IOException: Input path does not exist: file:/d:/a trainning iti/GP/stats.stackexchange.com/badges.xml
	at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:313)
	... 74 more


In [28]:


# Persist the badge dimension as a single parquet part file.
# The target was "SilverDataSet/Questions", which does not match this frame:
# with mode("overwrite") the badge dimension would replace any Questions
# dataset stored there. Write to a badge-specific directory instead.
# NOTE(review): an UnsatisfiedLinkError on NativeIO$Windows.access0 during
# this write usually points to missing Hadoop native binaries (winutils.exe /
# hadoop.dll under HADOOP_HOME) on Windows — confirm the local setup.
Badge_Disc_Dim.coalesce(1).write.mode("overwrite").parquet("SilverDataSet/Badge_Disc_Dim")


Py4JJavaError: An error occurred while calling o105.parquet.
: java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
	at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
	at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1249)
	at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1454)
	at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
	at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:334)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:404)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:377)
	at org.apache.parquet.hadoop.ParquetOutputCommitter.commitJob(ParquetOutputCommitter.java:48)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.commitJob(HadoopMapReduceCommitProtocol.scala:192)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$writeAndCommit$3(FileFormatWriter.scala:275)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at org.apache.spark.util.Utils$.timeTakenMs(Utils.scala:552)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:275)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:802)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
