In [80]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import *
from pyspark.sql.functions import avg, max, min, count


In [81]:
spark = SparkSession.builder.appName('Test1').getOrCreate()

In [82]:
spark

In [83]:
f1 = spark.read.csv('archive/features.csv', header=True,inferSchema=True)
s1 = spark.read.csv('archive/stores.csv', header=True,inferSchema=True)
te1 = spark.read.csv('archive/test.csv', header=True,inferSchema=True)
tr1 = spark.read.csv('archive/train.csv', header=True,inferSchema=True)

In [84]:
f1.show()

+-----+----------+-----------+----------+---------+---------+---------+---------+---------+-----------+------------+---------+
|Store|      Date|Temperature|Fuel_Price|MarkDown1|MarkDown2|MarkDown3|MarkDown4|MarkDown5|        CPI|Unemployment|IsHoliday|
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+-----------+------------+---------+
|    1|2010-02-05|      42.31|     2.572|       NA|       NA|       NA|       NA|       NA|211.0963582|       8.106|    false|
|    1|2010-02-12|      38.51|     2.548|       NA|       NA|       NA|       NA|       NA|211.2421698|       8.106|     true|
|    1|2010-02-19|      39.93|     2.514|       NA|       NA|       NA|       NA|       NA|211.2891429|       8.106|    false|
|    1|2010-02-26|      46.63|     2.561|       NA|       NA|       NA|       NA|       NA|211.3196429|       8.106|    false|
|    1|2010-03-05|       46.5|     2.625|       NA|       NA|       NA|       NA|       NA|211.3501429|       8

Below we are type casting the correct datatypes since even after infer_schema =True data types were considered wrong

In [85]:
#Typecasting data type since even after infer_schema =True data types were considered wrong

f1 = f1.withColumn("Store",col("Store").cast("Integer"))
f1 = f1.withColumn("Date",col("Date").cast("Date"))
f1 = f1.withColumn("Temperature",col("Temperature").cast("Float"))

f1 = f1.withColumn("MarkDown1",col("MarkDown1").cast("Float"))
f1 = f1.withColumn("MarkDown2",col("MarkDown2").cast("Float"))
f1 = f1.withColumn("MarkDown3",col("MarkDown3").cast("Float"))
f1 = f1.withColumn("MarkDown4",col("MarkDown4").cast("Float"))
f1 = f1.withColumn("MarkDown5",col("MarkDown5").cast("Float"))
f1 = f1.withColumn("CPI",col("CPI").cast("Float"))
f1 = f1.withColumn("Unemployment",col("Unemployment").cast("Float"))
f1 = f1.withColumn("IsHoliday",col("IsHoliday").cast("Boolean"))
f1.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Temperature: float (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- MarkDown1: float (nullable = true)
 |-- MarkDown2: float (nullable = true)
 |-- MarkDown3: float (nullable = true)
 |-- MarkDown4: float (nullable = true)
 |-- MarkDown5: float (nullable = true)
 |-- CPI: float (nullable = true)
 |-- Unemployment: float (nullable = true)
 |-- IsHoliday: boolean (nullable = true)



In [86]:
f1.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Temperature: float (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- MarkDown1: float (nullable = true)
 |-- MarkDown2: float (nullable = true)
 |-- MarkDown3: float (nullable = true)
 |-- MarkDown4: float (nullable = true)
 |-- MarkDown5: float (nullable = true)
 |-- CPI: float (nullable = true)
 |-- Unemployment: float (nullable = true)
 |-- IsHoliday: boolean (nullable = true)



In [87]:
#Handling Missing Data: Fill or remove missing values in MarkDown1-5 (often missing in real-world sales data).
f1.select('MarkDown1').distinct().show()

+---------+
|MarkDown1|
+---------+
|  6277.39|
| 10165.22|
|   2124.1|
|  4018.39|
|  2301.44|
| 46932.68|
|   516.47|
|  8081.05|
|  5659.29|
|  7866.57|
| 11685.51|
|  5000.58|
|  5857.72|
|  7011.68|
|   717.12|
| 28078.98|
| 21068.05|
|  7505.03|
| 11611.99|
|  8692.55|
+---------+
only showing top 20 rows



Lets find out the Number of null values in each column of Markdown1 - 5

In [88]:
# finding number of Null values in MarkDown1-5
print(f1.where((f1.MarkDown1.isNull())).count(),
f1.where((f1.MarkDown2.isNull())).count(),
f1.where((f1.MarkDown3.isNull())).count(),
f1.where((f1.MarkDown4.isNull())).count(),
f1.where((f1.MarkDown5.isNull())).count())

4158 5269 4577 4726 4140


In [89]:
# Finding Mean values in each column
m1_mean = f1.select(mean("MarkDown1")).collect()[0][0]

m2_mean = f1.select(mean("MarkDown2")).collect()[0][0]
m3_mean = f1.select(mean("MarkDown3")).collect()[0][0]
m4_mean = f1.select(mean("MarkDown4")).collect()[0][0]
m5_mean = f1.select(mean("MarkDown5")).collect()[0][0]
print(m1_mean)
print(m2_mean)
print(m3_mean)
print(m4_mean)

7032.371786093377
3384.176592808865
1760.100175767131
3292.9358917338986


In [90]:
f1=f1.fillna({"MarkDown1":m1_mean})
f1=f1.fillna({"MarkDown2":m2_mean})
f1=f1.fillna({"MarkDown3":m3_mean})
f1=f1.fillna({"MarkDown4":m4_mean})
f1=f1.fillna({"MarkDown5":m5_mean})

In [91]:
# Checking number of Null values after process in MarkDown1-5
print(f1.where((f1.MarkDown1.isNull())).count(),
f1.where((f1.MarkDown2.isNull())).count(),
f1.where((f1.MarkDown3.isNull())).count(),
f1.where((f1.MarkDown4.isNull())).count(),
f1.where((f1.MarkDown5.isNull())).count())

0 0 0 0 0


We have identified null values in Markdown1 - 5 columns and replaced it with mean value of specific columns 

Now Lets check if Date column is in the "yyyy-MM-dd"

In [92]:
f1.select("Date").show()

+----------+
|      Date|
+----------+
|2010-02-05|
|2010-02-12|
|2010-02-19|
|2010-02-26|
|2010-03-05|
|2010-03-12|
|2010-03-19|
|2010-03-26|
|2010-04-02|
|2010-04-09|
|2010-04-16|
|2010-04-23|
|2010-04-30|
|2010-05-07|
|2010-05-14|
|2010-05-21|
|2010-05-28|
|2010-06-04|
|2010-06-11|
|2010-06-18|
+----------+
only showing top 20 rows



Hence no format changing needed for date column

Now we will calculate a 4-Week Moving Average for the train.csv file to make forecasting easy in Dashboard graphs

In [93]:
tr1 = tr1.withColumn("Weekly_Sales",col("Weekly_Sales").cast("Float"))


In [94]:
tr1.printSchema()

root
 |-- Store: integer (nullable = true)
 |-- Dept: integer (nullable = true)
 |-- Date: date (nullable = true)
 |-- Weekly_Sales: float (nullable = true)
 |-- IsHoliday: boolean (nullable = true)



In [95]:
# Define a Window for 4-Week Moving Average (Including Current Row)
window_spec = Window.partitionBy("Store", "Dept").orderBy("Date").rowsBetween(-3, 0)

In [96]:
# Compute the Weekly Moving Average
tr1 = tr1.withColumn("4_Week_Moving_Avg", avg(col("Weekly_Sales")).over(window_spec))
tr1 = tr1.withColumn("4_Week_Moving_Avg",col("4_Week_Moving_Avg").cast("Float"))

In [97]:
tr1.show()

+-----+----+----------+------------+---------+-----------------+
|Store|Dept|      Date|Weekly_Sales|IsHoliday|4_Week_Moving_Avg|
+-----+----+----------+------------+---------+-----------------+
|    1|   2|2010-02-05|    50605.27|    false|         50605.27|
|    1|   2|2010-02-12|    44682.74|     true|        47644.004|
|    1|   2|2010-02-19|    47928.89|    false|        47738.965|
|    1|   2|2010-02-26|    44292.87|    false|         46877.44|
|    1|   2|2010-03-05|    48397.98|    false|         46325.62|
|    1|   2|2010-03-12|    43751.94|    false|         46092.92|
|    1|   2|2010-03-19|    43615.49|    false|         45014.57|
|    1|   2|2010-03-26|    41892.55|    false|        44414.492|
|    1|   2|2010-04-02|     47450.5|    false|         44177.62|
|    1|   2|2010-04-09|    46549.73|    false|        44877.066|
|    1|   2|2010-04-16|    45025.02|    false|         45229.45|
|    1|   2|2010-04-23|    44418.11|    false|         45860.84|
|    1|   2|2010-04-30|  

In [98]:
tr1.select(min("Weekly_Sales")).show()

+-----------------+
|min(Weekly_Sales)|
+-----------------+
|         -4988.94|
+-----------------+



Aggregating Total_Weekly_Sales, Avg_Weekly_Sales, Max_Weekly_Sales, Min_Weekly_Sales, Total_Transactions per Store to rank top performers as storewise_stats

In [None]:
#grouping Weekly_Sales per Store

storewise_stats = (
    tr1.groupBy("Store")
    .agg(
        sum("Weekly_Sales").alias("Total_Weekly_Sales"),
        avg("Weekly_Sales").alias("Avg_Weekly_Sales"),
        max("Weekly_Sales").alias("Max_Weekly_Sales"),
        min("Weekly_Sales").alias("Min_Weekly_Sales"),
        count("Weekly_Sales").alias("Total_Transactions")  # Counts number of sales records per store
    )
    .orderBy("Store")
    .withColumn("Total_Weekly_Sales", col("Total_Weekly_Sales").cast("decimal(20,2)"))
    .withColumn("Avg_Weekly_Sales", col("Avg_Weekly_Sales").cast("decimal(20,2)"))
    .withColumn("Max_Weekly_Sales", col("Max_Weekly_Sales").cast("decimal(20,2)"))
    .withColumn("Min_Weekly_Sales", col("Min_Weekly_Sales").cast("decimal(20,2)"))
)

storewise_stats.show()

+-----+------------------+----------------+----------------+----------------+------------------+
|Store|Total_Weekly_Sales|Avg_Weekly_Sales|Max_Weekly_Sales|Min_Weekly_Sales|Total_Transactions|
+-----+------------------+----------------+----------------+----------------+------------------+
|    1|      222402808.88|        21710.54|       203670.47|         -863.00|             10244|
|    2|      275382440.86|        26898.07|       285353.53|        -1098.00|             10238|
|    3|       57586735.05|         6373.03|       155897.94|        -1008.96|              9036|
|    4|      299543953.46|        29161.21|       385051.03|         -898.00|             10272|
|    5|       45475688.87|         5053.42|        93517.72|         -101.26|              8999|
|    6|      223756130.79|        21913.24|       342578.66|         -698.00|             10211|
|    7|       81598275.18|         8358.77|       222921.09|         -459.00|              9762|
|    8|      129951181.15|    

We are joining Stores.csv to storewise_stats for creating one more df for data analytics and BI reports called Transformedfile1

In [100]:
Transformedfile1 = storewise_stats.join(s1, on="Store", how="left")
Transformedfile1.show()

+-----+------------------+----------------+----------------+----------------+------------------+----+------+
|Store|Total_Weekly_Sales|Avg_Weekly_Sales|Max_Weekly_Sales|Min_Weekly_Sales|Total_Transactions|Type|  Size|
+-----+------------------+----------------+----------------+----------------+------------------+----+------+
|   12|      144287230.04|        14867.31|       360140.66|         -598.00|              9705|   B|112238|
|    1|      222402808.88|        21710.54|       203670.47|         -863.00|             10244|   A|151315|
|   13|      286517703.72|        27355.14|       292165.78|          -98.00|             10474|   A|219622|
|    6|      223756130.79|        21913.24|       342578.66|         -698.00|             10211|   A|202505|
|    3|       57586735.05|         6373.03|       155897.94|        -1008.96|              9036|   B| 37392|
|    5|       45475688.87|         5053.42|        93517.72|         -101.26|              8999|   B| 34875|
|   15|       89133

In [102]:
f1_tr1_join = f1.join(tr1, ["Store", "Date"], "inner")
f1_tr1_join.show()

+-----+----------+-----------+----------+---------+---------+---------+---------+---------+---------+------------+---------+----+------------+---------+-----------------+
|Store|      Date|Temperature|Fuel_Price|MarkDown1|MarkDown2|MarkDown3|MarkDown4|MarkDown5|      CPI|Unemployment|IsHoliday|Dept|Weekly_Sales|IsHoliday|4_Week_Moving_Avg|
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+---------+------------+---------+----+------------+---------+-----------------+
|    1|2010-02-05|      42.31|     2.572|7032.3716|3384.1765|1760.1002|3292.9358|4132.2163|211.09636|       8.106|    false|   2|    50605.27|    false|         50605.27|
|    1|2010-02-12|      38.51|     2.548|7032.3716|3384.1765|1760.1002|3292.9358|4132.2163|211.24217|       8.106|     true|   2|    44682.74|     true|        47644.004|
|    1|2010-02-19|      39.93|     2.514|7032.3716|3384.1765|1760.1002|3292.9358|4132.2163|211.28914|       8.106|    false|   2|    47928.89|   

In [1]:
#f1_tr1_join.schema.names
Transformedfile1.schema.names

NameError: name 'Transformedfile1' is not defined

Saving the transformed data frames as csv files: 

f1_tr1_join as joined_data.csv, 
Transformedfile1 as store_wise_stats.csv


In [109]:
# Transformedfile1.write.option("header","true").csv("transformed_data/store_wise_stats.csv",header=True)
# f1_tr1_join.write.option("header","true").csv("transformed_data/joined_data.csv",header=True)

In [112]:
# Write Transformedfile1 to CSV
Transformedfile1.write.option("header", "true").mode("overwrite").csv("transformed_data/store_wise_stats.csv")

# Write f1_tr1_join to CSV
f1_tr1_join.write.option("header", "true").mode("overwrite").csv("transformed_data/joined_data.csv")


Py4JJavaError: An error occurred while calling o956.csv.
: java.lang.RuntimeException: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.getWinUtilsPath(Shell.java:735)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:270)
	at org.apache.hadoop.util.Shell.getSetPermissionCommand(Shell.java:286)
	at org.apache.hadoop.fs.RawLocalFileSystem.setPermission(RawLocalFileSystem.java:978)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkOneDirWithMode(RawLocalFileSystem.java:660)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:700)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirsWithOptionalPermission(RawLocalFileSystem.java:699)
	at org.apache.hadoop.fs.RawLocalFileSystem.mkdirs(RawLocalFileSystem.java:672)
	at org.apache.hadoop.fs.ChecksumFileSystem.mkdirs(ChecksumFileSystem.java:788)
	at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.setupJob(FileOutputCommitter.java:356)
	at org.apache.spark.internal.io.HadoopMapReduceCommitProtocol.setupJob(HadoopMapReduceCommitProtocol.scala:188)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:269)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:392)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec$$Lambda$3211/2115267269.apply(Unknown Source)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:420)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:392)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1$$Lambda$4120/1055343933.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda$1705/204027375.apply(Unknown Source)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.execution.SQLExecution$$$Lambda$1695/337983567.apply(Unknown Source)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.TreeNode$$Lambda$1294/306501241.apply(Unknown Source)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:860)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.io.FileNotFoundException: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset. -see https://wiki.apache.org/hadoop/WindowsProblems
	at org.apache.hadoop.util.Shell.fileNotFoundException(Shell.java:547)
	at org.apache.hadoop.util.Shell.getHadoopHomeDir(Shell.java:568)
	at org.apache.hadoop.util.Shell.getQualifiedBin(Shell.java:591)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:688)
	at org.apache.hadoop.util.StringUtils.<clinit>(StringUtils.java:79)
	at org.apache.hadoop.conf.Configuration.getTimeDurationHelper(Configuration.java:1907)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1867)
	at org.apache.hadoop.conf.Configuration.getTimeDuration(Configuration.java:1840)
	at org.apache.hadoop.util.ShutdownHookManager.getShutdownTimeout(ShutdownHookManager.java:183)
	at org.apache.hadoop.util.ShutdownHookManager$HookEntry.<init>(ShutdownHookManager.java:207)
	at org.apache.hadoop.util.ShutdownHookManager.addShutdownHook(ShutdownHookManager.java:304)
	at org.apache.spark.util.SparkShutdownHookManager.install(ShutdownHookManager.scala:181)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks$lzycompute(ShutdownHookManager.scala:50)
	at org.apache.spark.util.ShutdownHookManager$.shutdownHooks(ShutdownHookManager.scala:48)
	at org.apache.spark.util.ShutdownHookManager$.addShutdownHook(ShutdownHookManager.scala:153)
	at org.apache.spark.util.ShutdownHookManager$.<init>(ShutdownHookManager.scala:58)
	at org.apache.spark.util.ShutdownHookManager$.<clinit>(ShutdownHookManager.scala)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:242)
	at org.apache.spark.util.SparkFileUtils.createTempDir(SparkFileUtils.scala:103)
	at org.apache.spark.util.SparkFileUtils.createTempDir$(SparkFileUtils.scala:102)
	at org.apache.spark.util.Utils$.createTempDir(Utils.scala:94)
	at org.apache.spark.deploy.SparkSubmit.prepareSubmitEnvironment(SparkSubmit.scala:377)
	at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:969)
	at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:199)
	at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:222)
	at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:91)
	at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:1125)
	at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:1134)
	at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.io.FileNotFoundException: HADOOP_HOME and hadoop.home.dir are unset.
	at org.apache.hadoop.util.Shell.checkHadoopHomeInner(Shell.java:467)
	at org.apache.hadoop.util.Shell.checkHadoopHome(Shell.java:438)
	at org.apache.hadoop.util.Shell.<clinit>(Shell.java:515)
	... 25 more
