In [1]:
# Section must be included at the beginning of each new notebook. Remember to change the app name. 
# If you're using VirtualBox, change the below to '/home/user/spark-2.1.1-bin-hadoop2.7'
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('zli786-Iteration4').getOrCreate()

In [2]:
# Input the dataset
df_1 = spark.read.options(header='True', inferSchema='True').csv("./dataset1.csv")

In [3]:
# Input the second dataset
df_2 = spark.read.options(header='True', inferSchema='True').csv("./dataset2.csv")
df_2.printSchema()

root
 |-- No: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- PM2.5: double (nullable = true)
 |-- PM10: double (nullable = true)
 |-- SO2: double (nullable = true)
 |-- NO2: double (nullable = true)
 |-- CO: integer (nullable = true)
 |-- O3: double (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- PRES: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- RAIN: double (nullable = true)
 |-- wd: string (nullable = true)
 |-- WSPM: double (nullable = true)
 |-- station: string (nullable = true)



In [4]:
df_1.count()

5065

In [5]:
df_2.count()

29999

In [6]:
df = df_2.union(df_1)
df.count()

35064

In [7]:
# show the visualise of the dataset
df.show()

+---+----+-----+---+----+-----+-----+----+----+---+----+----+------+-----+----+---+----+---------+
| No|year|month|day|hour|PM2.5| PM10| SO2| NO2| CO|  O3|TEMP|  PRES| DEWP|RAIN| wd|WSPM|  station|
+---+----+-----+---+----+-----+-----+----+----+---+----+----+------+-----+----+---+----+---------+
|  1|2013|    3|  1|   0|  3.0|  6.0|13.0| 7.0|300|85.0|-2.3|1020.8|-19.7| 0.0|  E| 0.5|Changping|
|  2|2013|    3|  1|   1|  3.0|  3.0| 6.0| 6.0|300|85.0|-2.5|1021.3|-19.0| 0.0|ENE| 0.7|Changping|
|  3|2013|    3|  1|   2|  3.0|  3.0|22.0|13.0|400|74.0|-3.0|1021.3|-19.9| 0.0|ENE| 0.2|Changping|
|  4|2013|    3|  1|   3|  3.0|  6.0|12.0| 8.0|300|81.0|-3.6|1021.8|-19.1| 0.0|NNE| 1.0|Changping|
|  5|2013|    3|  1|   4|  3.0|  3.0|14.0| 8.0|300|81.0|-3.5|1022.3|-19.4| 0.0|  N| 2.1|Changping|
|  6|2013|    3|  1|   5|  3.0|  3.0|10.0|17.0|400|71.0|-4.5|1022.6|-19.5| 0.0|NNW| 1.7|Changping|
|  7|2013|    3|  1|   6|  4.0|  6.0|12.0|22.0|500|65.0|-4.5|1023.4|-19.5| 0.0|NNW| 1.8|Changping|
|  8|2013|

In [8]:
# print the columns name of the dataset
df.columns

['No',
 'year',
 'month',
 'day',
 'hour',
 'PM2.5',
 'PM10',
 'SO2',
 'NO2',
 'CO',
 'O3',
 'TEMP',
 'PRES',
 'DEWP',
 'RAIN',
 'wd',
 'WSPM',
 'station']

In [9]:
# Print the data type in the dataset
df.printSchema()

root
 |-- No: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- PM2.5: double (nullable = true)
 |-- PM10: double (nullable = true)
 |-- SO2: double (nullable = true)
 |-- NO2: double (nullable = true)
 |-- CO: integer (nullable = true)
 |-- O3: double (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- PRES: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- RAIN: double (nullable = true)
 |-- wd: string (nullable = true)
 |-- WSPM: double (nullable = true)
 |-- station: string (nullable = true)



In [10]:
# Count the size of the dataset
df.count()

35064

In [11]:
# Rename the dataset's target column, from PM2.5 to Target
df1 = df.withColumnRenamed("PM2.5","Target")
df1.columns

['No',
 'year',
 'month',
 'day',
 'hour',
 'Target',
 'PM10',
 'SO2',
 'NO2',
 'CO',
 'O3',
 'TEMP',
 'PRES',
 'DEWP',
 'RAIN',
 'wd',
 'WSPM',
 'station']

In [12]:
# Generated the descriptive statistics of the dataset
# To perform better layout, separate the columns into three parts
# Time attributes 
df1.describe([
    'year',
    'month',
    'day',
    'hour',
]).show()

+-------+------------------+-----------------+------------------+-----------------+
|summary|              year|            month|               day|             hour|
+-------+------------------+-----------------+------------------+-----------------+
|  count|             35064|            35064|             35064|            35064|
|   mean| 2014.662559890486|6.522929500342231|15.729637234770705|             11.5|
| stddev|1.1772134318241192| 3.44875236004786| 8.800217529431587|6.922285262427998|
|    min|              2013|                1|                 1|                0|
|    max|              2017|               12|                31|               23|
+-------+------------------+-----------------+------------------+-----------------+



In [13]:
# The concentration of Inorganic air pollutants, and PM
df1.describe([
    'Target',
    'PM10',
    'SO2',
    'NO2',
    'CO',
    'O3',
]).show()

+-------+-----------------+-----------------+------------------+------------------+------------------+------------------+
|summary|           Target|             PM10|               SO2|               NO2|                CO|                O3|
+-------+-----------------+-----------------+------------------+------------------+------------------+------------------+
|  count|            34290|            34482|             34436|             34397|             33543|             34460|
|   mean|71.09974336541265|94.65787077315701|14.958905587176204| 44.18208550745705|1152.3013445428255|57.940002617527554|
| stddev| 72.3269261250207|83.44173842092758| 20.97533141570151|29.519796285531175| 1103.056282149164|54.316674392640884|
|    min|              2.0|              2.0|            0.2856|            1.8477|               100|            0.2142|
|    max|            882.0|            999.0|             310.0|             226.0|             10000|             429.0|
+-------+---------------

In [14]:
# The Influencing factors in the dataset
df1.describe([
    'TEMP',
    'PRES',
    'DEWP',
    'RAIN',
    'wd',
    'WSPM',
    'station'
]).show()

+-------+------------------+------------------+------------------+------------------+-----+------------------+---------+
|summary|              TEMP|              PRES|              DEWP|              RAIN|   wd|              WSPM|  station|
+-------+------------------+------------------+------------------+------------------+-----+------------------+---------+
|  count|             35011|             35014|             35011|             35013|34924|             35021|    35064|
|   mean|13.686111287926389|1007.7602777935974|1.5054954157264748|0.0603661497158197| null|1.8538362696667694|     null|
| stddev|11.365312950567448| 10.22566353049497|13.822098888069743|0.7528993068240725| null|1.3098083299251684|     null|
|    min|             -16.6|             982.4|             -35.1|               0.0|    E|               0.0|Changping|
|    max|              41.4|            1036.5|              27.2|              52.1|  WSW|              10.0|Changping|
+-------+------------------+----

In [15]:
# Remove the null value
df2 = df1.na.drop()
df2.count()

32681

In [16]:
df2.printSchema()
df2.describe([
    'Target',
    'PM10',
    'SO2',
    'NO2',
    'CO',
    'O3',
]).show()

root
 |-- No: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- Target: double (nullable = true)
 |-- PM10: double (nullable = true)
 |-- SO2: double (nullable = true)
 |-- NO2: double (nullable = true)
 |-- CO: integer (nullable = true)
 |-- O3: double (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- PRES: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- RAIN: double (nullable = true)
 |-- wd: string (nullable = true)
 |-- WSPM: double (nullable = true)
 |-- station: string (nullable = true)

+-------+-----------------+-----------------+------------------+-----------------+------------------+-----------------+
|summary|           Target|             PM10|               SO2|              NO2|                CO|               O3|
+-------+-----------------+-----------------+------------------+-----------------+------------------+----

In [17]:
df2_filter = df2.drop('No','year', 'day', 'PM10',
                      'PRES','wd','station')
df2_filter.columns

['month',
 'hour',
 'Target',
 'SO2',
 'NO2',
 'CO',
 'O3',
 'TEMP',
 'DEWP',
 'RAIN',
 'WSPM']

In [18]:
# Import VectorAssembler and Vectors to vectorization the data
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [19]:
# The input columns are the feature column names, and the output column is the feature.
assembler = VectorAssembler(
inputCols=["month", "hour", "SO2", 'NO2', "CO", 
           "O3", "TEMP", "DEWP", "RAIN", "WSPM"],
outputCol="Features")

In [20]:
# Now that we've created the assembler variable, let's actually transform the data.
df3 = assembler.transform(df2_filter)

In [21]:
# Using print schema, the features output column has been added. 
df3.printSchema()
# The "features" column is a dense vector that combines the various features as expected.
df3.head(1)

root
 |-- month: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- Target: double (nullable = true)
 |-- SO2: double (nullable = true)
 |-- NO2: double (nullable = true)
 |-- CO: integer (nullable = true)
 |-- O3: double (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- RAIN: double (nullable = true)
 |-- WSPM: double (nullable = true)
 |-- Features: vector (nullable = true)



[Row(month=3, hour=0, Target=3.0, SO2=13.0, NO2=7.0, CO=300, O3=85.0, TEMP=-2.3, DEWP=-19.7, RAIN=0.0, WSPM=0.5, Features=DenseVector([3.0, 0.0, 13.0, 7.0, 300.0, 85.0, -2.3, -19.7, 0.0, 0.5]))]

In [22]:
# Let's select two columns (the feature and predictor).
# This is now in the appropriate format to be processed by Spark.
dataset = df3.select("Features","Target")
dataset.show()

+--------------------+------+
|            Features|Target|
+--------------------+------+
|[3.0,0.0,13.0,7.0...|   3.0|
|[3.0,1.0,6.0,6.0,...|   3.0|
|[3.0,2.0,22.0,13....|   3.0|
|[3.0,3.0,12.0,8.0...|   3.0|
|[3.0,4.0,14.0,8.0...|   3.0|
|[3.0,5.0,10.0,17....|   3.0|
|[3.0,6.0,12.0,22....|   4.0|
|[3.0,7.0,25.0,39....|   3.0|
|[3.0,8.0,13.0,42....|   9.0|
|[3.0,9.0,5.0,18.0...|  11.0|
|[3.0,10.0,3.0,10....|   9.0|
|[3.0,11.0,4.0,9.0...|   3.0|
|[3.0,12.0,4.0,8.0...|   3.0|
|[3.0,13.0,5.0,9.0...|   3.0|
|[3.0,14.0,5.0,6.0...|   9.0|
|[3.0,15.0,6.0,7.0...|   3.0|
|[3.0,16.0,5.0,9.0...|   3.0|
|[3.0,17.0,19.0,12...|   6.0|
|[3.0,18.0,40.0,22...|   4.0|
|[3.0,19.0,37.0,22...|   7.0|
+--------------------+------+
only showing top 20 rows



In [23]:
# To avoid overfit, do a randomised 70/30 split. 
train_data,test_data = dataset.randomSplit([0.7,0.3])

In [24]:
# The descriptive statistics of training data.
train_data.describe().show()

# The descriptive statistics of testing data.
test_data.describe().show()

+-------+-----------------+
|summary|           Target|
+-------+-----------------+
|  count|            23048|
|   mean| 70.2375216938563|
| stddev|71.17829361527279|
|    min|              3.0|
|    max|            581.0|
+-------+-----------------+

+-------+-----------------+
|summary|           Target|
+-------+-----------------+
|  count|             9633|
|   mean|70.49131111803177|
| stddev|70.42330404614813|
|    min|              3.0|
|    max|            662.0|
+-------+-----------------+



In [25]:
# Import the Linear Regression model
from pyspark.ml.regression import LinearRegression

In [26]:
# Initialize the Linear Regression model
lr = LinearRegression(labelCol='Target',featuresCol='Features')
# Fit the model
lr_model = lr.fit(train_data)

In [27]:
# Print the coefficients and intercept for Linear Regression.
print("Intercept:",lr_model.intercept)
print("Coefficients:",lr_model.coefficients)
display("Coefficients:",lr_model.coefficients)

Intercept: -10.738517782818937
Coefficients: [-1.1568311191555523,-0.20028479384310988,0.3836006506746051,0.930698185377942,0.030858662060314027,0.3067638818096114,-1.3056688024742853,1.93683197727284,-1.6729262264879232,3.2619202144037582]


'Coefficients:'

DenseVector([-1.1568, -0.2003, 0.3836, 0.9307, 0.0309, 0.3068, -1.3057, 1.9368, -1.6729, 3.2619])

In [28]:
# To evaluate the model against the test data.
test_results = lr_model.evaluate(test_data)
print ('Model: Linear Regression')
# Evaluation metrics 
# R2
print("R2: {}".format(test_results.r2))
# Mean Absolute Error
print("MAE: {}".format(test_results.meanAbsoluteError))
# Mean Squared Error
print("MSE: {}".format(test_results.meanSquaredError))
# root of Mean Squared Error
print("RSME: {}".format(test_results.rootMeanSquaredError))
# Explained Variance
print("Explained variance: {}".format(test_results.explainedVariance))
# This shows the difference between the predicted value and the test data.
test_results.predictions.show()

Model: Linear Regression
R2: 0.7038064757649373
MAE: 26.084266332326752
MSE: 1468.8020390872548
RSME: 38.324953217026305
Explained variance: 3568.0285331556306
+--------------------+------+-------------------+
|            Features|Target|         prediction|
+--------------------+------+-------------------+
|[1.0,0.0,3.0,14.0...|  14.0|  4.688619087199685|
|[1.0,0.0,5.0,11.0...|   8.0|  8.505467159391147|
|[1.0,0.0,6.0,22.0...|  12.0|   8.68382282692786|
|[1.0,0.0,6.0,26.0...|  15.0|  19.40342179277104|
|[1.0,0.0,7.0,36.0...|  24.0| 53.699074648820165|
|[1.0,0.0,8.0,112....| 255.0|  276.1904477085751|
|[1.0,0.0,9.0,31.0...|  15.0| 27.222367119211675|
|[1.0,0.0,10.0,14....|  20.0|  0.489921981407905|
|[1.0,0.0,10.0,65....|  87.0| 110.60889154383187|
|[1.0,0.0,11.0,36....|  27.0|  39.24364133938381|
|[1.0,0.0,12.0,101...| 115.0| 158.14344735640574|
|[1.0,0.0,14.0,17....|  16.0|-7.0152186308313835|
|[1.0,0.0,14.0,58....|  59.0|  75.18610841030814|
|[1.0,0.0,16.0,32....|  14.0| 19.1296850

In [29]:
# Import the Random Forest Regressor model
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [30]:
featureIndexer = VectorIndexer(inputCol="Features",
                               outputCol="indexedFeatures",maxCategories=4).fit(dataset)
dataset1=featureIndexer.transform(dataset)
dataset1.show()

+--------------------+------+--------------------+
|            Features|Target|     indexedFeatures|
+--------------------+------+--------------------+
|[3.0,0.0,13.0,7.0...|   3.0|[3.0,0.0,13.0,7.0...|
|[3.0,1.0,6.0,6.0,...|   3.0|[3.0,1.0,6.0,6.0,...|
|[3.0,2.0,22.0,13....|   3.0|[3.0,2.0,22.0,13....|
|[3.0,3.0,12.0,8.0...|   3.0|[3.0,3.0,12.0,8.0...|
|[3.0,4.0,14.0,8.0...|   3.0|[3.0,4.0,14.0,8.0...|
|[3.0,5.0,10.0,17....|   3.0|[3.0,5.0,10.0,17....|
|[3.0,6.0,12.0,22....|   4.0|[3.0,6.0,12.0,22....|
|[3.0,7.0,25.0,39....|   3.0|[3.0,7.0,25.0,39....|
|[3.0,8.0,13.0,42....|   9.0|[3.0,8.0,13.0,42....|
|[3.0,9.0,5.0,18.0...|  11.0|[3.0,9.0,5.0,18.0...|
|[3.0,10.0,3.0,10....|   9.0|[3.0,10.0,3.0,10....|
|[3.0,11.0,4.0,9.0...|   3.0|[3.0,11.0,4.0,9.0...|
|[3.0,12.0,4.0,8.0...|   3.0|[3.0,12.0,4.0,8.0...|
|[3.0,13.0,5.0,9.0...|   3.0|[3.0,13.0,5.0,9.0...|
|[3.0,14.0,5.0,6.0...|   9.0|[3.0,14.0,5.0,6.0...|
|[3.0,15.0,6.0,7.0...|   3.0|[3.0,15.0,6.0,7.0...|
|[3.0,16.0,5.0,9.0...|   3.0|[3

In [31]:
(trainingData,testData)=dataset1.randomSplit([0.7,0.3])
trainingData.describe().show()
testData.describe().show()

+-------+-----------------+
|summary|           Target|
+-------+-----------------+
|  count|            22816|
|   mean| 69.9772966339411|
| stddev|70.83501777465986|
|    min|              3.0|
|    max|            662.0|
+-------+-----------------+

+-------+-----------------+
|summary|           Target|
+-------+-----------------+
|  count|             9865|
|   mean|71.08719716168271|
| stddev|71.23128852707381|
|    min|              3.0|
|    max|            581.0|
+-------+-----------------+



In [32]:
# Initialize Random Forest model
rf = RandomForestRegressor(featuresCol="indexedFeatures",labelCol="Target")
# Fit the model
rf_model = rf.fit(trainingData)
rf_predictions = rf_model.transform(testData)
rf_predictions.select('prediction','Features').show()

+------------------+--------------------+
|        prediction|            Features|
+------------------+--------------------+
|15.492531522527804|[1.0,0.0,2.0,17.0...|
| 16.07256616288987|[1.0,0.0,3.0,14.0...|
|15.492531522527804|[1.0,0.0,4.0,15.0...|
|17.108843474027875|[1.0,0.0,4.0,31.0...|
|18.823733195680532|[1.0,0.0,5.0,28.0...|
|119.17584857232058|[1.0,0.0,6.0,72.0...|
| 62.05518282637926|[1.0,0.0,7.0,36.0...|
|141.91294733802906|[1.0,0.0,9.0,88.0...|
|16.875159011285266|[1.0,0.0,10.0,14....|
| 23.58997579726961|[1.0,0.0,11.0,36....|
|16.980861630075903|[1.0,0.0,14.0,17....|
| 25.00017950155124|[1.0,0.0,17.0,28....|
|16.926347661468302|[1.0,0.0,18.0,11....|
|  86.0863976648339|[1.0,0.0,19.0,65....|
|215.93030016040734|[1.0,0.0,21.0,107...|
| 93.01413925007589|[1.0,0.0,26.0,81....|
| 95.87508891909306|[1.0,0.0,27.0,77....|
|142.63396947264255|[1.0,0.0,34.0,100...|
|20.591870515432092|[1.0,0.0,41.0,19....|
| 47.30009240649021|[1.0,0.0,41.0,32....|
+------------------+--------------

In [33]:
evaluator = RegressionEvaluator(labelCol="Target",predictionCol="prediction")
rf_rmse = evaluator.evaluate(rf_predictions, {evaluator.metricName: "rmse"})
print("Random Forest Model's RMSE is %f"%rf_rmse)
# r^2 metric
rf_r2 = evaluator.evaluate(rf_predictions, {evaluator.metricName: "r2"})
print("Random Forest Model's R2 is %f"%rf_r2)
# mean absolute error 
rf_mae = evaluator.evaluate(rf_predictions, {evaluator.metricName: "mae"})
print("Random Forest Model's MAE is %f"%rf_mae)
# mean squared error
rf_mse = evaluator.evaluate(rf_predictions, {evaluator.metricName: "mse"})
print("Random Forest Model's MSE is %f"%rf_mse)

Random Forest Model's RMSE is 36.938606
Random Forest Model's R2 is 0.731055
Random Forest Model's MAE is 23.647403
Random Forest Model's MSE is 1364.460602


In [34]:
featureImportances = rf_model.featureImportances
display(featureImportances)

SparseVector(10, {0: 0.0122, 1: 0.0018, 2: 0.0516, 3: 0.2622, 4: 0.5438, 5: 0.0583, 6: 0.0165, 7: 0.0435, 9: 0.0102})

In [35]:
treeModel = rf_model
# summary only
print(treeModel)

RandomForestRegressionModel (uid=rfr_e7874167af0d) with 20 trees


In [36]:
# Import Decision Tree model
from pyspark.ml.regression import DecisionTreeRegressor

In [37]:
# Initialize Decision Tree model
dt = DecisionTreeRegressor(labelCol="Target", featuresCol="indexedFeatures")
# Fit the model
model_dt = dt.fit(trainingData)
dt_predictions = model_dt.transform(testData)
dt_predictions.select('prediction','Features').show()

+------------------+--------------------+
|        prediction|            Features|
+------------------+--------------------+
|14.105080027835768|[1.0,0.0,2.0,17.0...|
|14.105080027835768|[1.0,0.0,3.0,14.0...|
|14.105080027835768|[1.0,0.0,4.0,15.0...|
|14.105080027835768|[1.0,0.0,4.0,31.0...|
|16.242380261248186|[1.0,0.0,5.0,28.0...|
|169.97552742616034|[1.0,0.0,6.0,72.0...|
| 69.36691410392365|[1.0,0.0,7.0,36.0...|
|169.97552742616034|[1.0,0.0,9.0,88.0...|
|14.105080027835768|[1.0,0.0,10.0,14....|
|16.242380261248186|[1.0,0.0,11.0,36....|
|14.105080027835768|[1.0,0.0,14.0,17....|
|33.736559139784944|[1.0,0.0,17.0,28....|
|14.105080027835768|[1.0,0.0,18.0,11....|
| 95.90480167014614|[1.0,0.0,19.0,65....|
| 229.4227642276423|[1.0,0.0,21.0,107...|
| 69.36691410392365|[1.0,0.0,26.0,81....|
| 95.90480167014614|[1.0,0.0,27.0,77....|
| 186.9644128113879|[1.0,0.0,34.0,100...|
|14.105080027835768|[1.0,0.0,41.0,19....|
|33.736559139784944|[1.0,0.0,41.0,32....|
+------------------+--------------

In [38]:
evaluator = RegressionEvaluator(labelCol="Target",predictionCol="prediction")
dt_rmse = evaluator.evaluate(dt_predictions, {evaluator.metricName: "rmse"})
print("Decision Tree Model's RMSE is %f"%dt_rmse)
# r^2 metric
dt_r2 = evaluator.evaluate(dt_predictions, {evaluator.metricName: "r2"})
print("Decision Tree Model's R2 is %f"%dt_r2)
# mean absolute error 
dt_mae = evaluator.evaluate(dt_predictions, {evaluator.metricName: "mae"})
print("Decision Tree Model's MAE is %f"%dt_mae)
# mean squared error
dt_mse = evaluator.evaluate(dt_predictions, {evaluator.metricName: "mse"})
print("Decision Tree Model's MSE is %f"%dt_mse)

Decision Tree Model's RMSE is 38.501183
Decision Tree Model's R2 is 0.707820
Decision Tree Model's MAE is 24.633771
Decision Tree Model's MSE is 1482.341105


In [39]:
# show the feature importance in decision tree model
featureImportances = model_dt.featureImportances
display(featureImportances)

SparseVector(10, {0: 0.018, 2: 0.0058, 3: 0.1381, 4: 0.7709, 5: 0.0063, 6: 0.0004, 7: 0.0605})

In [40]:
# summary only
display(model_dt)

DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4d2ca4fae82ffce81d2a) of depth 5 with 63 nodes

In [41]:
r2_result = spark.createDataFrame(
            [("LR", 0.721782, 37.5316, 25.8532),
             ("RF", 0.739855, 36.2630, 23.5430),
             ("DT", 0.715949, 38.6912, 24.7507)],
            ["Model","R2 metric","Root Mean Squared Error", "Mean Absolute Error"])
r2_result.show()

+-----+---------+-----------------------+-------------------+
|Model|R2 metric|Root Mean Squared Error|Mean Absolute Error|
+-----+---------+-----------------------+-------------------+
|   LR| 0.721782|                37.5316|            25.8532|
|   RF| 0.739855|                 36.263|             23.543|
|   DT| 0.715949|                38.6912|            24.7507|
+-----+---------+-----------------------+-------------------+



In [42]:
rf_featureImportance = spark.createDataFrame(
                        [('month',0.0096),
                         ('hour',0.0012),
                         ('SO2',0.0575),
                         ('NO2',0.2559),
                         ('CO',0.5711),
                         ('O3',0.0294),
                         ('TEMP',0.0115),
                         ('DEWP',0.0538),
                         ('RAIN',0.0003),
                         ('WSPM',0.0096)],
                        ["Features","RF's Feature Importance"])
rf_featureImportance.show()

+--------+-----------------------+
|Features|RF's Feature Importance|
+--------+-----------------------+
|   month|                 0.0096|
|    hour|                 0.0012|
|     SO2|                 0.0575|
|     NO2|                 0.2559|
|      CO|                 0.5711|
|      O3|                 0.0294|
|    TEMP|                 0.0115|
|    DEWP|                 0.0538|
|    RAIN|                 3.0E-4|
|    WSPM|                 0.0096|
+--------+-----------------------+



In [43]:
dt_featureImportance = spark.createDataFrame(
                        [('month',0.0212),
                         ('hour',0.0000),
                         ('SO2',0.0038),
                         ('NO2',0.1414),
                         ('CO',0.7707),
                         ('O3',0.0192),
                         ('TEMP',0.0002),
                         ('DEWP',0.0435),
                         ('RAIN',0.0000),
                         ('WSPM',0.0000)],
                        ["Features","DT's Feature Importance"])
dt_featureImportance.show()

+--------+-----------------------+
|Features|DT's Feature Importance|
+--------+-----------------------+
|   month|                 0.0212|
|    hour|                    0.0|
|     SO2|                 0.0038|
|     NO2|                 0.1414|
|      CO|                 0.7707|
|      O3|                 0.0192|
|    TEMP|                 2.0E-4|
|    DEWP|                 0.0435|
|    RAIN|                    0.0|
|    WSPM|                    0.0|
+--------+-----------------------+



(trainingData1,testData1)=dataset1.randomSplit([0.8,0.2])
trainingData1.describe().show()
testData1.describe().show()

In [50]:
# Initialize Random Forest model
rf1 = RandomForestRegressor(featuresCol="Features",labelCol="Target")
# Fit the model
rf1_model = rf1.fit(trainingData1)
rf1_predictions = rf1_model.transform(testData1)
evaluator = RegressionEvaluator(labelCol="Target",predictionCol="prediction")
rf_rmse = evaluator.evaluate(rf1_predictions, {evaluator.metricName: "rmse"})
print("Random Forest Model's RMSE is %f"%rf_rmse)
# r^2 metric
rf_r2 = evaluator.evaluate(rf1_predictions, {evaluator.metricName: "r2"})
print("Random Forest Model's R2 is %f"%rf_r2)
# mean absolute error 
rf_mae = evaluator.evaluate(rf1_predictions, {evaluator.metricName: "mae"})
print("Random Forest Model's MAE is %f"%rf_mae)
# mean squared error
rf_mse = evaluator.evaluate(rf1_predictions, {evaluator.metricName: "mse"})
print("Random Forest Model's MSE is %f"%rf_mse)
featureImportances = rf1_model.featureImportances
display(featureImportances)

Random Forest Model's RMSE is 36.314099
Random Forest Model's R2 is 0.736437
Random Forest Model's MAE is 23.359890
Random Forest Model's MSE is 1318.713790


SparseVector(10, {0: 0.0087, 1: 0.0017, 2: 0.0633, 3: 0.2595, 4: 0.5477, 5: 0.0496, 6: 0.0136, 7: 0.0444, 8: 0.0004, 9: 0.0111})

In [51]:
rf_featureImportance = spark.createDataFrame(
                        [('month',0.0087),
                         ('hour',0.0017),
                         ('SO2',0.0633),
                         ('NO2',0.2595),
                         ('CO',0.5477),
                         ('O3',0.0496),
                         ('TEMP',0.0136),
                         ('DEWP',0.0444),
                         ('RAIN',0.0004),
                         ('WSPM',0.0111)],
                        ["Features","RF's Feature Importance"])
rf_featureImportance.show()

+--------+-----------------------+
|Features|RF's Feature Importance|
+--------+-----------------------+
|   month|                 0.0087|
|    hour|                 0.0017|
|     SO2|                 0.0633|
|     NO2|                 0.2595|
|      CO|                 0.5477|
|      O3|                 0.0496|
|    TEMP|                 0.0136|
|    DEWP|                 0.0444|
|    RAIN|                 4.0E-4|
|    WSPM|                 0.0111|
+--------+-----------------------+

