In [1]:
from pyspark.sql import SparkSession
import numpy as np
spark = SparkSession.builder.master("local[2]")\
                            .appName("COM6012 Assignment Question——1")\
                            .config("spark.driver.memory", "8g")\
                            .config("spark.executor.cores", 8)\
                            .getOrCreate()

In [2]:
rawdata = spark.read.csv('.\Dataset\HIGGS.csv.gz')

In [3]:
feature_names = ['lepton pT', 'lepton eta', 'lepton phi', 
                 'missing energy magnitude', 'missing energy phi', 
                 'jet 1 pt', 'jet 1 eta', 'jet 1 phi', 'jet 1 b-tag', 
                 'jet 2 pt', 'jet 2 eta', 'jet 2 phi', 'jet 2 b-tag', 
                 'jet 3 pt', 'jet 3 eta', 'jet 3 phi', 'jet 3 b-tag', 
                 'jet 4 pt', 'jet 4 eta', 'jet 4 phi', 'jet 4 b-tag', 
                 'm_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']

In [4]:
schemaNames = rawdata.schema.names

In [5]:
ncolumns = len(rawdata.columns)
rawdata = rawdata.withColumnRenamed(schemaNames[0],'labels')
for i in range(ncolumns-1):
     rawdata = rawdata.withColumnRenamed(schemaNames[i+1], feature_names[i])
schemaNames = rawdata.schema.names

In [6]:
from pyspark.sql.types import DoubleType
for i in range(ncolumns):
    rawdata = rawdata.withColumn(schemaNames[i], rawdata[schemaNames[i]].cast(DoubleType()))

In [7]:
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols = schemaNames[1:ncolumns], outputCol = 'features') 
raw_plus_vector = assembler.transform(rawdata)
data = raw_plus_vector.select('features','labels')

In [8]:
small_data = data.sample(False, 0.0005,47)

In [9]:
small_data.printSchema()
small_data.cache()

root
 |-- features: vector (nullable = true)
 |-- labels: double (nullable = true)



DataFrame[features: vector, labels: double]

In [11]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

rf = RandomForestClassifier(featuresCol="features",labelCol="labels")

gbt = GBTClassifier(featuresCol="features",labelCol="labels")

from pyspark.ml import Pipeline

pipeline_rf = Pipeline(stages=[rf])
pipeline_gbt = Pipeline(stages=[gbt])

In [13]:
from pyspark.ml.tuning import ParamGridBuilder
import numpy as np


#parameter grid for random forest 
paramGrid_rf = ParamGridBuilder() \
    .addGrid(rf.numTrees, [3, 5, 7]) \
    .addGrid(rf.maxDepth, [3, 5, 7]) \
    .addGrid(rf.maxBins, [3, 5, 7])\
    .build()

#parameter grid for gradient boost 
paramGrid_gbt = ParamGridBuilder() \
    .addGrid(gbt.maxIter, [2,4,6]) \
    .addGrid(gbt.maxDepth, [3, 5, 7]) \
    .addGrid(gbt.maxBins, [3, 5, 7])\
    .build()

In [14]:
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import time

evaluator = MulticlassClassificationEvaluator(labelCol="labels", predictionCol="prediction", metricName="accuracy")

crossval_rf = CrossValidator(estimator=pipeline_rf,
                          estimatorParamMaps=paramGrid_rf,
                          evaluator=evaluator,
                          numFolds=3)

crossval_gbt = CrossValidator(estimator=pipeline_gbt,
                          estimatorParamMaps=paramGrid_gbt,
                          evaluator=evaluator,
                          numFolds=3)

(trainingData, testData) = small_data.randomSplit([0.8, 0.2],47)

start = time.time()
rfModel = crossval_rf.fit(trainingData)
rf_predictions = rfModel.transform(testData)
end = time.time()
print('Random Forest execution time:',end-start)

start = time.time()
gbtModel = crossval_gbt.fit(trainingData)
gbt_predictions = gbtModel.transform(testData)
end = time.time()
print('GBT execution time:',end-start)


Random Forest execution time: 307.8934078216553
GBT execution time: 111.27849268913269


In [15]:
evaluator = MulticlassClassificationEvaluator(labelCol="labels", predictionCol="prediction", metricName="accuracy")

rf_accuracy_score = evaluator.evaluate(rf_predictions)

gbt_accuracy_score = evaluator.evaluate(gbt_predictions)

evaluator = BinaryClassificationEvaluator(labelCol="labels",rawPredictionCol='rawPrediction', metricName="areaUnderROC")

rf_auc_score = evaluator.evaluate(rf_predictions)

gbt_auc_score = evaluator.evaluate(gbt_predictions)

print('-----Random Forest Score-----')
print('accuracy score:',rf_accuracy_score)
print('AUC score:',rf_auc_score)
print('                             ')
print('--------GBT Score------------')
print('accuracy score:',gbt_accuracy_score)
print('AUC score:',gbt_auc_score)


bestPipeline_rf, bestPipeline_gbt = rfModel.bestModel, gbtModel.bestModel

bestModel_rf, bestModel_gbt = bestPipeline_rf.stages[0], bestPipeline_gbt.stages[0]


-----Random Forest Score-----
accuracy score: 0.6513513513513514
AUC score: 0.7089434678314426
                             
--------GBT Score------------
accuracy score: 0.6342342342342342
AUC score: 0.6934946833267692


In [16]:
print('-------Random Forest best model-----------')
print('numTrees - ', bestModel_rf.getNumTrees)
print('maxDepth - ', bestModel_rf.getOrDefault('maxDepth'))
print('maxBins - ', bestModel_rf.getOrDefault('maxBins'))
print('------------------------------------------')


print('---------------GBT best model-------------')
print('maxIter - ', bestModel_gbt.getOrDefault('maxIter'))
print('maxDepth - ', bestModel_gbt.getOrDefault('maxDepth'))
print('maxBins - ', bestModel_gbt.getOrDefault('maxBins'))
print('------------------------------------------')

-------Random Forest best model-----------
numTrees -  7
maxDepth -  7
maxBins -  7
------------------------------------------
---------------GBT best model-------------
maxIter -  6
maxDepth -  3
maxBins -  5
------------------------------------------


In [17]:
#training with whole data with best parameter
print('----------------------------------')
print('Start to train with best parameter...')
print('----------------------------------')
best_rf = RandomForestClassifier(featuresCol="features",labelCol="labels",
                            numTrees = bestModel_rf.getNumTrees,
                           maxDepth = bestModel_rf.getOrDefault('maxDepth'),
                           maxBins = bestModel_rf.getOrDefault('maxBins'))

best_gbt = GBTClassifier(featuresCol="features",labelCol="labels",
                        maxIter=bestModel_gbt.getOrDefault('maxIter'),
                        maxDepth = bestModel_gbt.getOrDefault('maxDepth'),
                        maxBins = bestModel_gbt.getOrDefault('maxBins'))

new_data = data.sample(False, 0.005,47)
(trainingData, testData) = new_data.randomSplit([0.8, 0.2],47)

trainingData.cache()
testData.cache()


start = time.time()
best_rfModel = best_rf.fit(trainingData)
best_rf_predictions = best_rfModel.transform(testData)
end = time.time()
print('Random Forest execution time:',end-start)

start = time.time()
best_gbtModel = best_gbt.fit(trainingData)
best_gbt_predictions = best_gbtModel.transform(testData)
end = time.time()
print('GBT execution time:',end-start)

----------------------------------
Start to train with best parameter...
----------------------------------
Random Forest execution time: 212.2647671699524
GBT execution time: 2.8081161975860596


In [18]:
evaluator = MulticlassClassificationEvaluator(labelCol="labels", predictionCol="prediction", metricName="accuracy")

rf_accuracy_score = evaluator.evaluate(best_rf_predictions)

gbt_accuracy_score = evaluator.evaluate(best_gbt_predictions)

evaluator = BinaryClassificationEvaluator(labelCol="labels",rawPredictionCol='rawPrediction', metricName="areaUnderROC")

rf_auc_score = evaluator.evaluate(best_rf_predictions)

gbt_auc_score = evaluator.evaluate(best_gbt_predictions)

print('-----Random Forest Score-----')
print('accuracy score:',rf_accuracy_score)
print('AUC score:',rf_auc_score)
print('                             ')
print('--------GBT Score------------')
print('accuracy score:',gbt_accuracy_score)
print('AUC score:',gbt_auc_score)

-----Random Forest Score-----
accuracy score: 0.685397378311486
AUC score: 0.7474682491901027
                             
--------GBT Score------------
accuracy score: 0.6545971216426804
AUC score: 0.7161060715676998


In [19]:
import pandas as pd

importances = best_rfModel.featureImportances
df_relevance = pd.DataFrame(importances.toArray())
df_relevance.columns = ['relevance']
df_relevance.index = feature_names
print('-------The most relevant Feature in random forest model---------')
print(df_relevance.sort_values(by=['relevance'], ascending=False).head(3))

-------The most relevant Feature in random forest model---------
        relevance
m_bb     0.324462
m_wwbb   0.140082
m_wbb    0.119549


In [20]:
importances = best_gbtModel.featureImportances
df_relevance = pd.DataFrame(importances.toArray())
df_relevance.columns = ['relevance']
df_relevance.index = feature_names
print('-------The most relevant Feature in GBT model--------------------')
print(df_relevance.sort_values(by=['relevance'], ascending=False).head(3))

-------The most relevant Feature in GBT model--------------------
          relevance
m_wwbb     0.196856
jet 1 pt   0.178697
m_jjj      0.173763
