In [2]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('test1').getOrCreate()



In [3]:
df = spark.read.csv('data of air quality/finaldataset.csv', inferSchema=True, header=True)

In [4]:
df.show()
df.printSchema()
df=df.drop("Date")

+--------------------+-----+----+----+-----+-----+----+----+-----+-------+-------+---+----------+-------------------+---------+
|                Date| PM25|  NO| NO2|  NOx|  NH3|  CO| SO2|   O3|Benzene|Toluene|AQI|AQI_Bucket|ChangeStrategyOrNot|CityIndex|
+--------------------+-----+----+----+-----+-----+----+----+-----+-------+-------+---+----------+-------------------+---------+
|2020-03-12 00:00:...|31.21| 7.2|1.27|10.65|25.63|0.56|4.22| 2.81|   0.01|   0.08| 52|         2|                  0|       15|
|2020-03-13 00:00:...|38.39|7.19|0.91|10.37|29.16|0.57|4.46| 0.18|    0.0|    0.0| 60|         2|                  0|       15|
|2020-03-14 00:00:...|43.23|7.14|1.07|10.48|28.95|0.57|4.53| 0.41|    0.0|    0.0| 62|         2|                  0|       15|
|2020-03-15 00:00:...|33.82|7.09|0.36| 9.73|28.41|0.48|4.63|  0.3|    0.0|    0.0| 70|         2|                  0|       15|
|2020-03-16 00:00:...|27.14|5.63|2.32| 8.09|23.98| 0.5|4.71|13.02|   0.13|   0.68| 54|         2|       

In [5]:
from pyspark.ml.feature import RFormula

formula = RFormula(
    formula = 'ChangeStrategyOrNot ~ .',
    featuresCol = 'features',
    labelCol = 'label'
    )

resultofvec = formula.fit(df).transform(df)
dfvectorformat = resultofvec.select('features','label')
dfvectorformat.show(truncate = False)

+---------------------------------------------------------------------+-----+
|features                                                             |label|
+---------------------------------------------------------------------+-----+
|[31.21,7.2,1.27,10.65,25.63,0.56,4.22,2.81,0.01,0.08,52.0,2.0,15.0]  |0.0  |
|[38.39,7.19,0.91,10.37,29.16,0.57,4.46,0.18,0.0,0.0,60.0,2.0,15.0]   |0.0  |
|[43.23,7.14,1.07,10.48,28.95,0.57,4.53,0.41,0.0,0.0,62.0,2.0,15.0]   |0.0  |
|[33.82,7.09,0.36,9.73,28.41,0.48,4.63,0.3,0.0,0.0,70.0,2.0,15.0]     |0.0  |
|[27.14,5.63,2.32,8.09,23.98,0.5,4.71,13.02,0.13,0.68,54.0,2.0,15.0]  |0.0  |
|[27.32,3.07,2.14,3.41,24.57,0.48,4.84,6.03,0.25,1.34,40.0,1.0,15.0]  |0.0  |
|[31.76,3.0,1.48,5.24,23.42,0.47,5.04,8.76,0.24,1.19,51.0,2.0,15.0]   |0.0  |
|[43.8,2.97,1.31,4.97,23.41,0.48,5.3,9.96,0.26,1.11,63.0,2.0,15.0]    |0.0  |
|[35.48,3.01,0.83,4.64,24.85,0.49,5.32,6.43,0.27,1.15,69.0,2.0,15.0]  |0.0  |
|[51.27,3.01,0.88,4.62,24.44,0.49,5.63,8.62,49.66,50.11,70.0,2.0

In [6]:
#Feature Selection
from pyspark.ml.classification import RandomForestClassifier

In [7]:
model = RandomForestClassifier(numTrees = int(9), maxDepth = int(5), labelCol = 'label', seed = 11)
rfmodel = model.fit(dfvectorformat)

In [8]:
importance = rfmodel.featureImportances
column_list = df.columns
column_list.remove("ChangeStrategyOrNot")
print(column_list)


['PM25', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'AQI', 'AQI_Bucket', 'CityIndex']


In [9]:
importancesList = [float(col) for col in importance ]

final = dict(zip(column_list, importancesList))
print(final)

{'O3': 0.0012312572791757417, 'CO': 0.00015199902047995084, 'NOx': 0.26403267447796913, 'Benzene': 0.0, 'Toluene': 0.0, 'NO2': 0.0, 'CityIndex': 0.050996252281869776, 'SO2': 0.006771384045941805, 'NO': 0.0001492801460635482, 'PM25': 0.08825140064994251, 'NH3': 0.0020205165017734314, 'AQI': 0.04108780924144026, 'AQI_Bucket': 0.5453074263553438}


In [10]:
#4.2 Project data
from pyspark.sql.functions import col, explode, array, lit


In [11]:
minor_df = dfvectorformat.filter(col("label") == 1)
major_df = dfvectorformat.filter(col("label") == 0)
ratio = int(major_df.count()/minor_df.count())
print(major_df.count(),minor_df.count())

4654 2020


In [12]:
a = range(ratio)
oversampled_df = minor_df.withColumn("dummy", explode(array([lit(x) for x in a]))).drop('dummy')
combined_df = major_df.unionAll(oversampled_df)
combined_df.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[31.21,7.2,1.27,1...|  0.0|
|[38.39,7.19,0.91,...|  0.0|
|[43.23,7.14,1.07,...|  0.0|
|[33.82,7.09,0.36,...|  0.0|
|[27.14,5.63,2.32,...|  0.0|
|[27.32,3.07,2.14,...|  0.0|
|[31.76,3.0,1.48,5...|  0.0|
|[43.8,2.97,1.31,4...|  0.0|
|[35.48,3.01,0.83,...|  0.0|
|[51.27,3.01,0.88,...|  0.0|
|[33.24,2.93,0.13,...|  0.0|
|[35.34,2.98,0.16,...|  0.0|
|[47.34,2.96,0.14,...|  0.0|
|[53.64,2.96,0.17,...|  0.0|
|[54.9,2.96,0.13,3...|  0.0|
|[44.64,2.94,0.11,...|  0.0|
|[51.33,2.92,0.35,...|  0.0|
|[34.54,2.93,0.13,...|  0.0|
|[39.47,2.93,0.24,...|  0.0|
|[27.27,2.95,0.25,...|  0.0|
+--------------------+-----+
only showing top 20 rows



In [13]:
minor_df = combined_df.filter(col("label") == 1)
major_df = combined_df.filter(col("label") == 0)

print(major_df.count(),minor_df.count())

4654 4040


In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [15]:
finaldf = df.drop("AQI", 'AQI_Bucket')
finaldf.printSchema()

root
 |-- PM25: double (nullable = true)
 |-- NO: double (nullable = true)
 |-- NO2: double (nullable = true)
 |-- NOx: double (nullable = true)
 |-- NH3: double (nullable = true)
 |-- CO: double (nullable = true)
 |-- SO2: double (nullable = true)
 |-- O3: double (nullable = true)
 |-- Benzene: double (nullable = true)
 |-- Toluene: double (nullable = true)
 |-- ChangeStrategyOrNot: integer (nullable = true)
 |-- CityIndex: integer (nullable = true)



In [16]:
from pyspark.ml.feature import RFormula

f = RFormula(
    formula = 'ChangeStrategyOrNot ~ .',
    featuresCol = 'features',
    labelCol = 'label'
    )

vec = f.fit(finaldf).transform(finaldf)
df2vec = vec.select('features','label')
df2vec.show(truncate = False)

+------------------------------------------------------------+-----+
|features                                                    |label|
+------------------------------------------------------------+-----+
|[31.21,7.2,1.27,10.65,25.63,0.56,4.22,2.81,0.01,0.08,15.0]  |0.0  |
|[38.39,7.19,0.91,10.37,29.16,0.57,4.46,0.18,0.0,0.0,15.0]   |0.0  |
|[43.23,7.14,1.07,10.48,28.95,0.57,4.53,0.41,0.0,0.0,15.0]   |0.0  |
|[33.82,7.09,0.36,9.73,28.41,0.48,4.63,0.3,0.0,0.0,15.0]     |0.0  |
|[27.14,5.63,2.32,8.09,23.98,0.5,4.71,13.02,0.13,0.68,15.0]  |0.0  |
|[27.32,3.07,2.14,3.41,24.57,0.48,4.84,6.03,0.25,1.34,15.0]  |0.0  |
|[31.76,3.0,1.48,5.24,23.42,0.47,5.04,8.76,0.24,1.19,15.0]   |0.0  |
|[43.8,2.97,1.31,4.97,23.41,0.48,5.3,9.96,0.26,1.11,15.0]    |0.0  |
|[35.48,3.01,0.83,4.64,24.85,0.49,5.32,6.43,0.27,1.15,15.0]  |0.0  |
|[51.27,3.01,0.88,4.62,24.44,0.49,5.63,8.62,49.66,50.11,15.0]|0.0  |
|[33.24,2.93,0.13,3.65,18.55,0.37,5.41,5.64,13.07,13.69,15.0]|0.0  |
|[35.34,2.98,0.16,3.71,20.24,0.32,

In [17]:
(trainingData, testData) = df2vec.randomSplit([0.7, 0.3])

In [18]:
#6.3.1 decision tree
dt = DecisionTreeClassifier()

In [19]:

# Train model. 
model_dt = dt.fit(trainingData)

In [20]:
#test
prediction_dt = model_dt.transform(testData)

In [21]:
#binary classifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'label')

In [22]:
print("DTC")
print(my_binary_eval.evaluate(prediction_dt))

DTC
0.9872420745069393


In [23]:
#evaluation
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction_dt)
print("Test Error = %g" % (1.0 - accuracy))
print('Accuracy =', accuracy*100)

Test Error = 0.0105316
Accuracy = 98.9468405215647


In [24]:
#10-cross
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
grid = ParamGridBuilder().addGrid(dt.maxDepth,[25]).build()
cs_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
crossval = CrossValidator(estimator = dt,
                          estimatorParamMaps = grid,
                          evaluator = cs_evaluator,
                          numFolds =10 )
cvModel = crossval.fit(trainingData)

In [25]:
result = cvModel.transform(testData)

In [26]:
print(cs_evaluator.evaluate(result))

0.9899699097291875


In [27]:
#6.3.2 Random Forrest
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20)
model_rf = rf.fit(trainingData)
prediction_rf = model_rf.transform(testData)

In [28]:
#binary classifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
my_binary_eval = BinaryClassificationEvaluator(labelCol = 'label')

In [29]:
print("DTC")
print(my_binary_eval.evaluate(prediction_rf))

DTC
0.9989685902118332


In [30]:
#evaluation
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracyrf = evaluator.evaluate(prediction_rf)
print("Test Error = %g" % (1.0 - accuracyrf))
print('Accuracy =', accuracyrf*100)

Test Error = 0.00702106
Accuracy = 99.29789368104312


In [31]:
#10-cross
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
grid = ParamGridBuilder().addGrid(rf.maxDepth,[25,10]).addGrid(rf.numTrees,[5,10]).build()
rf_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
crossval = CrossValidator(estimator = rf,
                          estimatorParamMaps = grid,
                          evaluator = rf_evaluator,
                          numFolds =10 )
rfModel = crossval.fit(trainingData)

In [32]:
rfresult = rfModel.transform(testData)

In [33]:
print(rf_evaluator.evaluate(rfresult))

0.9929789368104313


In [34]:
#importance evaluation for random forrest
importance = model_rf.featureImportances
column_list = finaldf.columns
column_list.remove("ChangeStrategyOrNot")
importancesList = [float(col) for col in importance ]

final = dict(zip(column_list, importancesList))
print(final)

{'CityIndex': 0.031518063066468864, 'O3': 0.012312122060959631, 'CO': 0.011963262573339175, 'SO2': 0.006204718552057114, 'NO': 0.06276796663963156, 'NOx': 0.2085396878155751, 'Benzene': 0.0012991069427307762, 'PM25': 0.6011814049446115, 'NH3': 0.002251441441279128, 'NO2': 0.059622133961276226, 'Toluene': 0.0023400920020711603}


In [35]:
#importance evaluation for decision tree
importance = model_dt.featureImportances
column_list = finaldf.columns
column_list.remove("ChangeStrategyOrNot")
importancesList = [float(col) for col in importance ]

final = dict(zip(column_list, importancesList))
print(final)

{'CityIndex': 0.02811704059072851, 'O3': 0.001971750491355737, 'CO': 0.005343342807943468, 'SO2': 0.0013499743426237593, 'NO': 0.0017874348623360741, 'NOx': 0.004290210558966963, 'Benzene': 0.0010300472088038417, 'PM25': 0.9527184489163569, 'NH3': 0.0, 'NO2': 0.0007010043504359482, 'Toluene': 0.0026907458704488654}


In [36]:
#print(model_dt)

In [37]:
#print(model_dt.toDebugString)

In [38]:
#6.3.3 Gradient Boosted Tree

from pyspark.ml.classification import GBTClassifier


# Train a GBT model.
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=10)

# Train model.  This also runs the indexers.
model = gbt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)



In [39]:
#10-cross GBT
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
gbdgrid = ParamGridBuilder().addGrid(gbt.maxDepth,[25,10]).build()
gbt_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
crossval = CrossValidator(estimator = gbt,
                          estimatorParamMaps = gbdgrid,
                          evaluator = gbt_evaluator,
                          numFolds =10 )
gbtModel = crossval.fit(trainingData)

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1035, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 883, in send_command
    response = connection.send_command(command)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1040, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/int

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45619)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-39-19f61f414004>", line 9, in <module>
    gbtModel = crossval.fit(trainingData)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wra

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45619)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-39-19f61f414004>", line 9, in <module>
    gbtModel = crossval.fit(trainingData)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wra

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45619)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-39-19f61f414004>", line 9, in <module>
    gbtModel = crossval.fit(trainingData)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wra

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45619)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-39-19f61f414004>", line 9, in <module>
    gbtModel = crossval.fit(trainingData)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wra

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45619)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-39-19f61f414004>", line 9, in <module>
    gbtModel = crossval.fit(trainingData)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wra

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45619)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-39-19f61f414004>", line 9, in <module>
    gbtModel = crossval.fit(trainingData)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wra

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:45619)
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-39-19f61f414004>", line 9, in <module>
    gbtModel = crossval.fit(trainingData)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/tuning.py", line 236, in _fit
    model = est.fit(train, epm[j])
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/base.py", line 62, in fit
    return self.copy(params)._fit(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/ubuntu/spark-2.1.1-bin-hadoop2.7/python/pyspark/ml/wra

Py4JError: An error occurred while calling o6659.fit

In [None]:
gbtresult = gbtModel.transform(testData)

In [None]:
print(gbt_evaluator.evaluate(gbtresult))

In [None]:
#evaluate
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))
print('Accuracy',accuracy )


In [None]:
importance = model.featureImportances
column_list = finaldf.columns
column_list.remove("ChangeStrategyOrNot")
importancesList = [float(col) for col in importance ]

final = dict(zip(column_list, importancesList))
print(final)