In [3]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BDAS').getOrCreate()

In [4]:
from pyspark.sql.types import *
schema = StructType([
               StructField('month',IntegerType(),True),
               StructField('SO2',FloatType(),True),
               StructField('NO2',FloatType(),True),
               StructField('CO',FloatType(),True),
               StructField('O3',FloatType(),True),
               StructField('TEMP',FloatType(),True),
               StructField('PRES',FloatType(),True),
               StructField('DewPointTempeature',FloatType(),True),
               StructField('WindSpend',FloatType(),True),
               StructField('WindDirectionIndex',DoubleType(),True),
               StructField('stationIndex',DoubleType(),True),
               StructField('NQRIndex',DoubleType(),True),
               StructField('SeasonIndex',DoubleType(),True),
                ])

In [5]:
file = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("../final_dataset/after_feature.csv")

In [6]:
column_list = file.columns
last = column_list[-1]
second_last = column_list[-2]
column_list[-1]=second_last
column_list[-2] = last
file = file.select(*column_list)
file.show()

+-----+----+-----+------+----+----+------+------------------+---------+------------------+------------+-----------+--------+
|month| SO2|  NO2|    CO|  O3|TEMP|  PRES|DewPointTempeature|WindSpend|WindDirectionIndex|stationIndex|SeasonIndex|NQRIndex|
+-----+----+-----+------+----+----+------+------------------+---------+------------------+------------+-----------+--------+
|    1|10.0| 16.0| 400.0|54.0|-1.0|1027.0|             -23.0|      0.9|               1.0|         4.0|        0.0|     0.0|
|    1|11.0| 17.0| 400.0|53.0| 0.0|1025.0|             -22.9|      2.7|               1.0|         4.0|        0.0|     0.0|
|    1|10.0| 15.0| 400.0|55.0| 0.0|1027.0|             -22.9|      2.4|               1.0|         4.0|        0.0|     0.0|
|    1|13.0| 13.0| 400.0|57.0| 0.0|1028.0|             -24.4|      2.4|               1.0|         4.0|        0.0|     0.0|
|    1|15.0| 12.0| 400.0|58.0| 0.0|1030.0|             -24.4|      2.4|               1.0|         4.0|        0.0|     0.0|


In [7]:
from pyspark.ml.feature import RFormula
formula = RFormula(
    formula="NQRIndex ~ .",
    featuresCol="features",
    labelCol="label")

output = formula.fit(file).transform(file)
vectorformat2 = output.select("features", "label")

In [8]:
vectorformat2.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,10.0,16.0,40...|  0.0|
|[1.0,11.0,17.0,40...|  0.0|
|[1.0,10.0,15.0,40...|  0.0|
|[1.0,13.0,13.0,40...|  0.0|
|[1.0,15.0,12.0,40...|  0.0|
|[1.0,13.0,15.0,50...|  0.0|
|[1.0,10.0,30.0,50...|  0.0|
|[1.0,10.0,41.0,60...|  0.0|
|[1.0,10.0,55.0,60...|  0.0|
|[1.0,11.0,56.0,60...|  0.0|
|[1.0,15.0,56.0,90...|  0.0|
|[1.0,26.0,64.0,13...|  0.0|
|[1.0,42.0,76.0,17...|  0.0|
|[1.0,37.0,70.0,15...|  1.0|
|[1.0,29.0,59.0,13...|  0.0|
|[1.0,18.0,33.0,90...|  0.0|
|[1.0,19.0,39.0,10...|  0.0|
|[1.0,23.0,58.0,13...|  0.0|
|[1.0,35.0,89.0,18...|  1.0|
|[1.0,43.0,106.0,2...|  1.0|
+--------------------+-----+
only showing top 20 rows



In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import (RandomForestClassifier, GBTClassifier, DecisionTreeClassifier)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.util import MLUtils
import time

In [10]:
(trainingData, testData) = vectorformat2.randomSplit([0.7, 0.3])

In [11]:
start_time = time.time()
rf = RandomForestClassifier(labelCol="label", featuresCol="features",maxDepth=15)
model_rf = rf.fit(trainingData)
prediction_rf = model_rf.transform(testData)
prediction_rf.show()
print("RunTime %s seconds ---" % (time.time() - start_time))

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[1.0,1.0,8.0,200....|  0.0|[19.9773576729089...|[0.99886788364544...|       0.0|
|[1.0,1.0,12.0,200...|  0.0|[19.8467773747752...|[0.99233886873876...|       0.0|
|[1.0,1.0,25.0,300...|  0.0|[19.9461311375264...|[0.99730655687632...|       0.0|
|[1.0,1.0,62.0,100...|  0.0|[14.5857256100710...|[0.72928628050355...|       0.0|
|[1.0,2.0,2.0,100....|  0.0|[19.9534910945276...|[0.99767455472638...|       0.0|
|[1.0,2.0,2.0,100....|  0.0|[19.9954166472843...|[0.99977083236421...|       0.0|
|[1.0,2.0,2.0,100....|  0.0|[19.9863218983185...|[0.99931609491592...|       0.0|
|[1.0,2.0,2.0,100....|  0.0|[19.9954166472843...|[0.99977083236421...|       0.0|
|[1.0,2.0,2.0,100....|  0.0|[19.9940697979656...|[0.99970348989828...|       0.0|
|[1.0,2.0,2.0,10

In [12]:
print(model_rf)

RandomForestClassificationModel (uid=rfc_53222bad3ebb) with 20 trees


In [13]:
evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
evaluatorf1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
accuracy = evaluator_accuracy.evaluate(prediction_rf)
f1_score = evaluatorf1.evaluate(prediction_rf)
print("Aaccuracy = ", accuracy*100)
print("F1 score = ", f1_score*100)
print("Test Error = %g" % (1.0 - accuracy))

Aaccuracy =  81.28332300061997
F1 score =  81.11866016641761
Test Error = 0.187167


In [15]:
print(model_rf.toDebugString)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [14]:
ff = model_rf.featureImportances
importancesList=[float(col) for col in  ff]
colList = file.columns
result=dict(zip(colList,importancesList))
print(result)

{'O3': 0.0668266836867723, 'DewPointTempeature': 0.10704676056369415, 'PRES': 0.05616735722740879, 'WindSpend': 0.04127716161236746, 'SO2': 0.12042320123552756, 'stationIndex': 0.031337918707861595, 'WindDirectionIndex': 0.030110974780392485, 'SeasonIndex': 0.0, 'TEMP': 0.0506734871577071, 'CO': 0.30375581048892475, 'month': 0.054599338931692956, 'NO2': 0.13778130560765084}
