In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext

In [2]:
SparkContext.setSystemProperty('spark.executor.memory','4g')
conf = SparkConf()
conf.set('spark.executor.instances',20)
sc = SparkContext('yarn', 'kdd99', conf=conf)
hc=HiveContext(sc)

In [3]:
hc.sql("USE itv000684_kdd99data")

In [4]:
kdd = hc.table("kdd99")

In [5]:
(train_data, test_data) = kdd.randomSplit([0.7,0.3], seed=42)

In [6]:
train_data.cache()

protocol_type,service,flag,is_anomaly,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
icmp,eco_i,SF,ipsweep.,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,1.0,0.0,1.0,0.67,0.0,0.0,0.0,0.0
icmp,eco_i,SF,ipsweep.,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,15.0,1.0,0.0,1.0,0.53,0.0,0.0,0.0,0.0
icmp,eco_i,SF,ipsweep.,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,0.52,0.0,0.0,0.0,0.0
icmp,eco_i,SF,ipsweep.,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,43.0,1.0,0.0,1.0,0.51,0.0,0.0,0.0,0.0
icmp,eco_i,SF,ipsweep.,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,53.0,1.0,0.0,1.0,0.51,0.0,0.0,0.0,0.0
icmp,eco_i,SF,ipsweep.,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,97.0,1.0,0.0,1.0,0.51,0.0,0.0,0.0,0.0
icmp,eco_i,SF,ipsweep.,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,127.0,1.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0
icmp,eco_i,SF,ipsweep.,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,131.0,1.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0
icmp,eco_i,SF,ipsweep.,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,233.0,1.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0
icmp,eco_i,SF,ipsweep.,0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,255.0,1.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0


In [7]:
services = train_data.withColumnRenamed('service','srvc').select('srvc').distinct()

In [8]:
## filter and remove any rows with a service not trained upon

In [9]:
test_data = test_data.join(services, test_data.service == services.srvc)
#test_data.cache()

In [10]:
print("training set has " + str(train_data.count()) + " instances")

training set has 3429322 instances


In [11]:
print("test set has " + str(test_data.count()) + " instances")

test set has 1469108 instances


In [12]:
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

In [13]:
index1 = StringIndexer(inputCol="protocol_type", outputCol="protocol-cat")
index2 = StringIndexer(inputCol="service", outputCol="service-cat")
index3 = StringIndexer(inputCol="flag", outputCol="flag-cat")
index4 = StringIndexer(inputCol="is_anomaly", outputCol="label")
onehotencode = OneHotEncoder(inputCol="service-cat", outputCol="service-onehotencode")

feat_columns = [col for col in kdd.columns +
               ['protocol-cat','service-onehotencode','flag-cat','label']
               if col not in ['protocol_type','service','flag','is_anomaly']]
vectorAssembler = VectorAssembler(inputCols = feat_columns, outputCol = 'features')

In [14]:
randomjungle = RandomForestClassifier(numTrees=500, maxDepth=6, maxBins=80,seed=42)
pipeline = Pipeline(stages=[index1,index2,index3,index4,onehotencode, vectorAssembler, randomjungle])

In [20]:
themodel = pipeline.fit(train_data)

In [18]:
themodel.save('/user/itv000684/kdd/model.model')

In [15]:
from pyspark.ml import PipelineModel
model = PipelineModel.load('/user/itv000684/kdd/model.model')

In [16]:
results = model.transform(test_data).select("label","prediction").cache()

In [17]:
import pandas as pd

def eval_metrics(lap):
    labels = lap.select("label").distinct().toPandas()['label'].tolist()
    tpos = [lap.filter(lap.label == x).filter(lap.prediction == x).count() for x in labels]
    fpos = [lap.filter(lap.label == x).filter(lap.prediction != x).count() for x in labels]
    fneg = [lap.filter(lap.label != x).filter(lap.prediction != x).count() for x in labels]
    precision = zip(labels,[float(tp)/(tp+fp+1e-50) for (tp,fp) in zip(tpos,fpos)])
    recall = zip(labels, [float(tp)/(tp+fn+1e-50) for (tp,fn) in zip(tpos,fneg)])
    return(precision,recall)

In [20]:
(precision, recall) = eval_metrics(results)
ordered_labels = model.stages[3]._call_java("labels")
df = pd.DataFrame([(x, test_data.filter(test_data.is_anomaly == x).count(),y[1],z[1]) for x,y,z in zip(ordered_labels, sorted(precision, key = lambda x: x[0]), sorted(recall, key=lambda x: x[0]))], columns = ['type','count','precision','recall'])

Py4JJavaError: An error occurred while calling o1656.count.
: org.apache.spark.sql.catalyst.errors.package$TreeNodeException: execute, tree:
Exchange SinglePartition, true, [id=#7735]
+- *(3) HashAggregate(keys=[], functions=[partial_count(1)], output=[count#17831L])
   +- *(3) Project
      +- *(3) BroadcastHashJoin [service#3], [srvc#1752], Inner, BuildLeft
         :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, string, true])), [id=#7721]
         :  +- *(1) Project [service#3]
         :     +- *(1) Filter ((isnotnull(is_anomaly#5) AND (is_anomaly#5 = teardrop.)) AND isnotnull(service#3))
         :        +- *(1) Sample 0.7, 1.0, false, 42
         :           +- *(1) Project [service#3, is_anomaly#5]
         :              +- *(1) Sort [protocol_type#2 ASC NULLS FIRST, service#3 ASC NULLS FIRST, flag#4 ASC NULLS FIRST, is_anomaly#5 ASC NULLS FIRST, duration#6 ASC NULLS FIRST, src_bytes#7 ASC NULLS FIRST, dst_bytes#8 ASC NULLS FIRST, wrong_fragment#9 ASC NULLS FIRST, urgent#10 ASC NULLS FIRST, hot#11 ASC NULLS FIRST, num_failed_logins#12 ASC NULLS FIRST, num_compromised#13 ASC NULLS FIRST, root_shell#14 ASC NULLS FIRST, su_attempted#15 ASC NULLS FIRST, num_root#16 ASC NULLS FIRST, num_file_creations#17 ASC NULLS FIRST, num_shells#18 ASC NULLS FIRST, num_access_files#19 ASC NULLS FIRST, num_outbound_cmds#20 ASC NULLS FIRST, count#21 ASC NULLS FIRST, srv_count#22 ASC NULLS FIRST, serror_rate#23 ASC NULLS FIRST, srv_serror_rate#24 ASC NULLS FIRST, rerror_rate#25 ASC NULLS FIRST, ... 14 more fields], false, 0
         :                 +- *(1) ColumnarToRow
         :                    +- FileScan orc itv000684_kdd99data.kdd99[protocol_type#2,service#3,flag#4,is_anomaly#5,duration#6,src_bytes#7,dst_bytes#8,wrong_fragment#9,urgent#10,hot#11,num_failed_logins#12,num_compromised#13,root_shell#14,su_attempted#15,num_root#16,num_file_creations#17,num_shells#18,num_access_files#19,num_outbound_cmds#20,count#21,srv_count#22,serror_rate#23,srv_serror_rate#24,rerror_rate#25,... 14 more fields] Batched: true, DataFilters: [], Format: ORC, Location: InMemoryFileIndex[hdfs://m01.itversity.com:9000/user/itv000684/warehouse/itv000684_kdd99data.db/k..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<protocol_type:string,service:string,flag:string,is_anomaly:string,duration:int,src_bytes:i...
         +- *(3) HashAggregate(keys=[srvc#1752], functions=[], output=[srvc#1752])
            +- Exchange hashpartitioning(srvc#1752, 200), true, [id=#7728]
               +- *(2) HashAggregate(keys=[srvc#1752], functions=[], output=[srvc#1752])
                  +- *(2) Project [service#3 AS srvc#1752]
                     +- *(2) Filter isnotnull(service#3)
                        +- InMemoryTableScan [service#3], [isnotnull(service#3)]
                              +- InMemoryRelation [protocol_type#2, service#3, flag#4, is_anomaly#5, duration#6, src_bytes#7, dst_bytes#8, wrong_fragment#9, urgent#10, hot#11, num_failed_logins#12, num_compromised#13, root_shell#14, su_attempted#15, num_root#16, num_file_creations#17, num_shells#18, num_access_files#19, num_outbound_cmds#20, count#21, srv_count#22, serror_rate#23, srv_serror_rate#24, rerror_rate#25, ... 14 more fields], StorageLevel(disk, memory, deserialized, 1 replicas)
                                    +- *(1) Sample 0.0, 0.7, false, 42
                                       +- *(1) Sort [protocol_type#2 ASC NULLS FIRST, service#3 ASC NULLS FIRST, flag#4 ASC NULLS FIRST, is_anomaly#5 ASC NULLS FIRST, duration#6 ASC NULLS FIRST, src_bytes#7 ASC NULLS FIRST, dst_bytes#8 ASC NULLS FIRST, wrong_fragment#9 ASC NULLS FIRST, urgent#10 ASC NULLS FIRST, hot#11 ASC NULLS FIRST, num_failed_logins#12 ASC NULLS FIRST, num_compromised#13 ASC NULLS FIRST, root_shell#14 ASC NULLS FIRST, su_attempted#15 ASC NULLS FIRST, num_root#16 ASC NULLS FIRST, num_file_creations#17 ASC NULLS FIRST, num_shells#18 ASC NULLS FIRST, num_access_files#19 ASC NULLS FIRST, num_outbound_cmds#20 ASC NULLS FIRST, count#21 ASC NULLS FIRST, srv_count#22 ASC NULLS FIRST, serror_rate#23 ASC NULLS FIRST, srv_serror_rate#24 ASC NULLS FIRST, rerror_rate#25 ASC NULLS FIRST, ... 14 more fields], false, 0
                                          +- *(1) ColumnarToRow
                                             +- FileScan orc itv000684_kdd99data.kdd99[protocol_type#2,service#3,flag#4,is_anomaly#5,duration#6,src_bytes#7,dst_bytes#8,wrong_fragment#9,urgent#10,hot#11,num_failed_logins#12,num_compromised#13,root_shell#14,su_attempted#15,num_root#16,num_file_creations#17,num_shells#18,num_access_files#19,num_outbound_cmds#20,count#21,srv_count#22,serror_rate#23,srv_serror_rate#24,rerror_rate#25,... 14 more fields] Batched: true, DataFilters: [], Format: ORC, Location: InMemoryFileIndex[hdfs://m01.itversity.com:9000/user/itv000684/warehouse/itv000684_kdd99data.db/k..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<protocol_type:string,service:string,flag:string,is_anomaly:string,duration:int,src_bytes:i...

	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:56)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.doExecute(ShuffleExchangeExec.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.inputRDDs(HashAggregateExec.scala:162)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:720)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:316)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:382)
	at org.apache.spark.sql.Dataset.$anonfun$count$1(Dataset.scala:2981)
	at org.apache.spark.sql.Dataset.$anonfun$count$1$adapted(Dataset.scala:2980)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3618)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:764)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3616)
	at org.apache.spark.sql.Dataset.count(Dataset.scala:2980)
	at sun.reflect.GeneratedMethodAccessor103.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.util.concurrent.ExecutionException: org.apache.spark.SparkException: Job 257 cancelled because SparkContext was shut down
	at java.util.concurrent.FutureTask.report(FutureTask.java:122)
	at java.util.concurrent.FutureTask.get(FutureTask.java:206)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:195)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:515)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeBroadcast$1(SparkPlan.scala:188)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:184)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:116)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenInner(BroadcastHashJoinExec.scala:210)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:100)
	at org.apache.spark.sql.execution.CodegenSupport.constructDoConsumeFunction(WholeStageCodegenExec.scala:221)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:192)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:149)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.consume(HashAggregateExec.scala:48)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.generateResultFunction(HashAggregateExec.scala:626)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduceWithKeys(HashAggregateExec.scala:762)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduce(HashAggregateExec.scala:169)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.produce(HashAggregateExec.scala:48)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:95)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:39)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduceWithoutKeys(HashAggregateExec.scala:243)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.doProduce(HashAggregateExec.scala:167)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.aggregate.HashAggregateExec.produce(HashAggregateExec.scala:48)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:632)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:692)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:171)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD$lzycompute(ShuffleExchangeExec.scala:106)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.inputRDD(ShuffleExchangeExec.scala:106)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency$lzycompute(ShuffleExchangeExec.scala:139)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.shuffleDependency(ShuffleExchangeExec.scala:137)
	at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec.$anonfun$doExecute$1(ShuffleExchangeExec.scala:154)
	at org.apache.spark.sql.catalyst.errors.package$.attachTree(package.scala:52)
	... 39 more
Caused by: org.apache.spark.SparkException: Job 257 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1(DAGScheduler.scala:979)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$cleanUpAfterSchedulerStop$1$adapted(DAGScheduler.scala:977)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:79)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:977)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:2257)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2170)
	at org.apache.spark.SparkContext.$anonfun$stop$12(SparkContext.scala:1973)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1357)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:1973)
	at org.apache.spark.scheduler.cluster.YarnClientSchedulerBackend$MonitorThread.run(YarnClientSchedulerBackend.scala:122)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:775)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2120)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2139)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2164)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1004)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:388)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1003)
	at org.apache.spark.sql.execution.SparkPlan.executeCollectIterator(SparkPlan.scala:392)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.$anonfun$relationFuture$1(BroadcastExchangeExec.scala:120)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withThreadLocalCaptured$1(SQLExecution.scala:182)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
df