In [1]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BDAS_Datass transformation').getOrCreate()

In [2]:
from pyspark.sql.types import *
schema = StructType([
               StructField('year',IntegerType(),True),
               StructField('month',IntegerType(),True),
               StructField('day',IntegerType(),True),
               StructField('hour',IntegerType(),True),
               StructField('PM25',FloatType(),True),
               StructField('SO2',FloatType(),True),
               StructField('NO2',FloatType(),True),
               StructField('CO',FloatType(),True),
               StructField('O3',FloatType(),True),
               StructField('TEMP',FloatType(),True),
               StructField('PRES',FloatType(),True),
               StructField('DewPointTempeature',FloatType(),True),
               StructField('RAIN',FloatType(),True),
               StructField('WindDirection',StringType(),True),
               StructField('WindSpend',FloatType(),True),
               StructField('station',StringType(),True),
               StructField('NQR',StringType(),True),
               StructField('Season',StringType(),True),
                ])

In [3]:
file = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("../final_dataset/finishedpreprocessing.csv")

In [18]:
print(type(file))

<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
df.show()

In [5]:
df = df.drop(*["PM25","day"])
df.show()

+----+-----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+------------+------+
|year|month|hour| SO2|  NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|RAIN|WindDirection|WindSpend|      station|         NQR|Season|
+----+-----+----+----+-----+------+----+----+------+------------------+----+-------------+---------+-------------+------------+------+
|2015|    1|   0|10.0| 16.0| 400.0|54.0|-1.0|1027.0|             -23.0| 0.0|           NW|      0.9|Wanshouxigong|        Good|Winter|
|2015|    1|   1|11.0| 17.0| 400.0|53.0| 0.0|1025.0|             -22.9| 0.0|           NW|      2.7|Wanshouxigong|        Good|Winter|
|2015|    1|   2|10.0| 15.0| 400.0|55.0| 0.0|1027.0|             -22.9| 0.0|           NW|      2.4|Wanshouxigong|        Good|Winter|
|2015|    1|   3|13.0| 13.0| 400.0|57.0| 0.0|1028.0|             -24.4| 0.0|           NW|      2.4|Wanshouxigong|        Good|Winter|
|2015|    1|   4|15.0| 12.0| 400.0|58.0| 0.0|1030.0|   

In [6]:
from pyspark.ml.feature import StringIndexer
column = ['WindDirection','station','NQR','Season']
for i in column:
    indexer = StringIndexer(inputCol =i, outputCol= i+"Index" )
    model = indexer.fit(df)
    indexed = model.transform(df)
    df = indexed
new_df = df.drop(*column)

In [None]:
new_df.show()

In [7]:
from pyspark.ml.feature import RFormula
formula = RFormula(
    formula="NQRIndex ~ .",
    featuresCol="features",
    labelCol="label")

output = formula.fit(new_df).transform(new_df)
vectorformat = output.select("features", "label")

In [None]:
vectorformat.show(truncate = False)

In [8]:
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

selector = ChiSqSelector(numTopFeatures=16, featuresCol="features",
                         outputCol="selectedFeatures", labelCol="label")

result = selector.fit(vectorformat).transform(vectorformat)

print("ChiSqSelector output with top %d features selected" % selector.getNumTopFeatures())


ChiSqSelector output with top 16 features selected


In [9]:
result.select('features','label','selectedFeatures').show(20,truncate = False)

+-----------------------------------------------------------------------------------------------------------+-----+-----------------------------------------------------------------------------------------------------------+
|features                                                                                                   |label|selectedFeatures                                                                                           |
+-----------------------------------------------------------------------------------------------------------+-----+-----------------------------------------------------------------------------------------------------------+
|[2015.0,1.0,0.0,10.0,16.0,400.0,54.0,-1.0,1027.0,-23.0,0.0,0.8999999761581421,1.0,4.0,0.0]                 |0.0  |[2015.0,1.0,0.0,10.0,16.0,400.0,54.0,-1.0,1027.0,-23.0,0.0,0.8999999761581421,1.0,4.0,0.0]                 |
|[2015.0,1.0,1.0,11.0,17.0,400.0,53.0,0.0,1025.0,-22.899999618530273,0.0,2.700000047683716,1.0,4.0,0.0] 

In [10]:
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(numTrees=int(9), maxDepth=int(5), labelCol='label', seed=11)
model = rf.fit(vectorformat)

In [11]:
ff=model.featureImportances
importancesList=[float(col) for col in  ff]
colList = new_df.columns
result=dict(zip(colList,importancesList))
print(result)

{'NQRIndex': 0.0, 'year': 0.00013154716809603672, 'DewPointTempeature': 0.0884079368264622, 'stationIndex': 0.001036175290576831, 'NO2': 0.1810933064930174, 'CO': 0.39062896097298794, 'WindDirectionIndex': 0.02110121663588012, 'month': 0.009670148938959573, 'TEMP': 0.024795955307260142, 'hour': 0.0005757787124201702, 'PRES': 0.02374150268508552, 'O3': 0.046052694563309626, 'RAIN': 0.0, 'WindSpend': 0.002673793209110729, 'SO2': 0.2100909831968337}


In [None]:
def sort_by_value(d): 
    items=d.items() 
    backitems=[[v[1],v[0]] for v in items] 
    backitems.sort() 
    return [ backitems[i][1] for i in range(0,len(backitems))]

In [None]:
dic = sort_by_value(result)
dic.reverse()
print(dic)

In [12]:
after_feature = new_df.drop(*['hour','RAIN','year'])
after_feature.show()
print("Number of attribute: ",len(after_feature.columns))

+-----+----+-----+------+----+----+------+------------------+---------+------------------+------------+--------+-----------+
|month| SO2|  NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|WindSpend|WindDirectionIndex|stationIndex|NQRIndex|SeasonIndex|
+-----+----+-----+------+----+----+------+------------------+---------+------------------+------------+--------+-----------+
|    1|10.0| 16.0| 400.0|54.0|-1.0|1027.0|             -23.0|      0.9|               1.0|         4.0|     0.0|        0.0|
|    1|11.0| 17.0| 400.0|53.0| 0.0|1025.0|             -22.9|      2.7|               1.0|         4.0|     0.0|        0.0|
|    1|10.0| 15.0| 400.0|55.0| 0.0|1027.0|             -22.9|      2.4|               1.0|         4.0|     0.0|        0.0|
|    1|13.0| 13.0| 400.0|57.0| 0.0|1028.0|             -24.4|      2.4|               1.0|         4.0|     0.0|        0.0|
|    1|15.0| 12.0| 400.0|58.0| 0.0|1030.0|             -24.4|      2.4|               1.0|         4.0|     0.0|        0.0|


In [None]:
formula = RFormula(
    formula="NQRIndex ~ .",
    featuresCol="features",
    labelCol="label")

output = formula.fit(after_feature).transform(after_feature)
vectorformat2 = output.select("features", "label")

In [13]:
from pyspark.sql.functions import col, explode, array, lit
df_0 = after_feature.filter(col("NQRIndex") == 0)
df_1 = after_feature.filter(col("NQRIndex") == 1)
df_2 = after_feature.filter(col("NQRIndex") == 2)
df_3 = after_feature.filter(col("NQRIndex") == 3)
df_4 = after_feature.filter(col("NQRIndex") == 4)
df_5 = after_feature.filter(col("NQRIndex") == 5)

print(df_0.count(),df_1.count(),df_2.count(),df_3.count(),df_4.count(),df_5.count())

108279 49511 38691 12271 3658 1791


In [14]:
count_list = [df_1,df_2,df_3,df_4,df_5]
combined_df = df_0
for element in count_list:
    ratio = int(df_0.count()/element.count())
    a = range(ratio)
    oversampled_df = element.withColumn("dummy", explode(array([lit(x) for x in a]))).drop('dummy')
    combined_df = combined_df.unionAll(oversampled_df)

In [20]:
df_0 = combined_df.filter(col("NQRIndex") == 0)
df_1 = combined_df.filter(col("NQRIndex") == 1)
df_2 = combined_df.filter(col("NQRIndex") == 2)
df_3 = combined_df.filter(col("NQRIndex") == 3)
df_4 = combined_df.filter(col("NQRIndex") == 4)
df_5 = combined_df.filter(col("NQRIndex") == 5)

print(df_0.count(),df_1.count(),df_2.count(),df_3.count(),df_4.count(),df_5.count())

108279 99022 77382 98168 106082 107460


In [19]:
file = combined_df
file.write.csv("../final_dataset/view2.csv",header = True)

In [15]:
print(combined_df.count())

596393


In [22]:
combined_df.printSchema()
combined_df.show()

root
 |-- month: integer (nullable = true)
 |-- SO2: float (nullable = true)
 |-- NO2: float (nullable = true)
 |-- CO: float (nullable = true)
 |-- O3: float (nullable = true)
 |-- TEMP: float (nullable = true)
 |-- PRES: float (nullable = true)
 |-- DewPointTempeature: float (nullable = true)
 |-- WindSpend: float (nullable = true)
 |-- WindDirectionIndex: double (nullable = true)
 |-- stationIndex: double (nullable = true)
 |-- NQRIndex: double (nullable = true)
 |-- SeasonIndex: double (nullable = true)

+-----+----+----+------+----+----+------+------------------+---------+------------------+------------+--------+-----------+
|month| SO2| NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|WindSpend|WindDirectionIndex|stationIndex|NQRIndex|SeasonIndex|
+-----+----+----+------+----+----+------+------------------+---------+------------------+------------+--------+-----------+
|    1|10.0|16.0| 400.0|54.0|-1.0|1027.0|             -23.0|      0.9|               1.0|         4.0|     0.0|   

In [23]:
combined_df.count()

596393

In [24]:
formula = RFormula(
    formula="NQRIndex ~ .",
    featuresCol="features",
    labelCol="label")

output = formula.fit(combined_df).transform(combined_df)
vectorformat3 = output.select("features", "label")

In [25]:
vectorformat3.show()
vectorformat3.count()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,10.0,16.0,40...|  0.0|
|[1.0,11.0,17.0,40...|  0.0|
|[1.0,10.0,15.0,40...|  0.0|
|[1.0,13.0,13.0,40...|  0.0|
|[1.0,15.0,12.0,40...|  0.0|
|[1.0,13.0,15.0,50...|  0.0|
|[1.0,10.0,30.0,50...|  0.0|
|[1.0,10.0,41.0,60...|  0.0|
|[1.0,10.0,55.0,60...|  0.0|
|[1.0,11.0,56.0,60...|  0.0|
|[1.0,15.0,56.0,90...|  0.0|
|[1.0,26.0,64.0,13...|  0.0|
|[1.0,42.0,76.0,17...|  0.0|
|[1.0,29.0,59.0,13...|  0.0|
|[1.0,18.0,33.0,90...|  0.0|
|[1.0,19.0,39.0,10...|  0.0|
|[1.0,23.0,58.0,13...|  0.0|
|[1.0,28.0,91.0,15...|  0.0|
|[1.0,23.0,86.0,15...|  0.0|
|[1.0,17.0,62.0,10...|  0.0|
+--------------------+-----+
only showing top 20 rows



596393

In [26]:
(trainingData, testData) = vectorformat3.randomSplit([0.7, 0.3])

In [28]:
print(trainingData.count(),testData.count())

417708 178685


In [36]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time

In [37]:
start_time = time.time()
dt = DecisionTreeClassifier()
model_dt = dt.fit(trainingData)
prediction_dt = model_dt.transform(testData)
prediction_dt.show()
print("RunTime %s seconds ---" % (time.time() - start_time))

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[1.0,1.0,9.0,200....|  0.0|[23595.0,783.0,11...|[0.96125641652407...|       0.0|
|[1.0,1.0,9.0,300....|  0.0|[23595.0,783.0,11...|[0.96125641652407...|       0.0|
|[1.0,1.0,11.0,300...|  0.0|[23595.0,783.0,11...|[0.96125641652407...|       0.0|
|[1.0,1.0,12.0,200...|  0.0|[23595.0,783.0,11...|[0.96125641652407...|       0.0|
|[1.0,1.0,14.0,400...|  0.0|[23595.0,783.0,11...|[0.96125641652407...|       0.0|
|[1.0,1.0,17.0,600...|  0.0|[6180.0,1293.0,78...|[0.81789306511381...|       0.0|
|[1.0,1.0,25.0,300...|  0.0|[23595.0,783.0,11...|[0.96125641652407...|       0.0|
|[1.0,1.0,34.0,100...|  0.0|[23595.0,783.0,11...|[0.96125641652407...|       0.0|
|[1.0,1.0,60.0,100...|  0.0|[23595.0,783.0,11...|[0.96125641652407...|       0.0|
|[1.0,2.0,2.0,10

In [32]:
prediction_dt.select("prediction", "label", "features").show(5)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|[1.0,1.0,9.0,200....|
|       0.0|  0.0|[1.0,1.0,9.0,300....|
|       0.0|  0.0|[1.0,1.0,11.0,300...|
|       0.0|  0.0|[1.0,1.0,12.0,200...|
|       0.0|  0.0|[1.0,1.0,14.0,400...|
+----------+-----+--------------------+
only showing top 5 rows



In [38]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction_dt)
print("Aaccuracy = ", accuracy*100)

Aaccuracy =  60.42253127011221


In [39]:
start_time = time.time()
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=5)
model_rf = rf.fit(trainingData)
prediction_rf = model_rf.transform(testData)
prediction_rf.show()
print("RunTime %s seconds ---" % (time.time() - start_time))

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[1.0,1.0,9.0,200....|  0.0|[4.60572978113011...|[0.92114595622602...|       0.0|
|[1.0,1.0,9.0,300....|  0.0|[4.70904629588762...|[0.94180925917752...|       0.0|
|[1.0,1.0,11.0,300...|  0.0|[4.60572978113011...|[0.92114595622602...|       0.0|
|[1.0,1.0,12.0,200...|  0.0|[4.60572978113011...|[0.92114595622602...|       0.0|
|[1.0,1.0,14.0,400...|  0.0|[4.70904629588762...|[0.94180925917752...|       0.0|
|[1.0,1.0,17.0,600...|  0.0|[4.40919061875809...|[0.88183812375161...|       0.0|
|[1.0,1.0,25.0,300...|  0.0|[4.66686858095273...|[0.93337371619054...|       0.0|
|[1.0,1.0,34.0,100...|  0.0|[4.59439108060429...|[0.91887821612085...|       0.0|
|[1.0,1.0,60.0,100...|  0.0|[3.53272635697201...|[0.70654527139440...|       0.0|
|[1.0,2.0,2.0,10

In [40]:
accuracy = evaluator.evaluate(prediction_rf)
print("Aaccuracy = ", accuracy*100)

Aaccuracy =  59.93284271203515


In [42]:

from pyspark.ml.classification import GBTClassifier

In [44]:
start_time = time.time()
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)
model = gbt.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show(5)
print("RunTime %s seconds ---" % (time.time() - start_time))

Py4JJavaError: An error occurred while calling o811.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 189.0 failed 1 times, most recent failure: Lost task 2.0 in stage 189.0 (TID 637, localhost, executor driver): java.lang.IllegalArgumentException: requirement failed: GBTClassifier was given dataset with invalid label 2.0.  Labels must be in {0,1}; note that GBTClassifier currently only supports binary classification.
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$1.apply(GBTClassifier.scala:154)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$1.apply(GBTClassifier.scala:152)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1005)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:996)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:936)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:996)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:700)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1925)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1938)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1951)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1965)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1158)
	at org.apache.spark.ml.tree.impl.DecisionTreeMetadata$.buildMetadata(DecisionTreeMetadata.scala:116)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:105)
	at org.apache.spark.ml.regression.DecisionTreeRegressor.train(DecisionTreeRegressor.scala:125)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.boost(GradientBoostedTrees.scala:291)
	at org.apache.spark.ml.tree.impl.GradientBoostedTrees$.run(GradientBoostedTrees.scala:53)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:167)
	at org.apache.spark.ml.classification.GBTClassifier.train(GBTClassifier.scala:60)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:96)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.IllegalArgumentException: requirement failed: GBTClassifier was given dataset with invalid label 2.0.  Labels must be in {0,1}; note that GBTClassifier currently only supports binary classification.
	at scala.Predef$.require(Predef.scala:224)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$1.apply(GBTClassifier.scala:154)
	at org.apache.spark.ml.classification.GBTClassifier$$anonfun$1.apply(GBTClassifier.scala:152)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1005)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:996)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:936)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:996)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:700)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [53]:
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder


In [65]:
pipeline = Pipeline(stages=[rf])
paramGrid = ParamGridBuilder().addGrid(rf.numTrees, [5]).build()
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)
cvmodel = crossval.fit(vectorformat3)
result = cvmodel.bestModel.transform(df)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37312)
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37312)
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37312)
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-65-0d26aebfcbc7>", line 7, in <modu

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37312)
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-65-0d26aebfcbc7>", line 7, in <modu

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37312)
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-65-0d26aebfcbc7>", line 7, in <modu

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37312)
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-65-0d26aebfcbc7>", line 7, in <modu

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37312)
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-65-0d26aebfcbc7>", line 7, in <modu

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:37312)
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 827, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 963, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-65-0d26aebfcbc7>", line 7, in <modu

Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:37312)

ValueError: `data` should be an RDD of LabeledPoint