# Start a Spark session 

In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (application name 'RF').
spark = (
    SparkSession.builder
    .appName('RF')
    .getOrCreate()
)

# Load data from HDFS

In [2]:
# Read the tab-delimited training file from HDFS; the first row is treated as the header.
row_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("delimiter", "\t")
    .load("hdfs://mycluster/user/dtree/data/train.tsv")
)

# Data preparation

In [3]:
# Define the cleaning function and register it as a DataFrame UDF in one step
from pyspark.sql.functions import udf


@udf
def replace_question(x):
    """Return "0" when the value is the "?" placeholder, otherwise the value unchanged."""
    if x == "?":
        return "0"
    return x

In [4]:
from pyspark.sql.functions import col 

# Keep 'url' and 'alchemy_category' untouched. For every column from index 4
# onward, replace "?" with "0" and cast to double, keeping the original name.
df = row_df.select(
    'url',
    'alchemy_category',
    *[
        replace_question(col(name)).cast("double").alias(name)
        for name in row_df.columns[4:]
    ]
)

# Split data into training set & test set

In [5]:
# Split the data 70% / 30% into a training set and a test set.
# A fixed seed makes the split, and every metric computed from it,
# repeatable when the notebook is re-run on the same input.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [6]:
%%html
<img src="spark_pipeline.PNG" width="450" height="200">

# Build a pipeline

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Stage 1: map the string category to a numeric index.
# The instance gets its own name so the imported StringIndexer class is not shadowed.
category_indexer = StringIndexer(
    inputCol='alchemy_category',
    outputCol='alchemy_category_Index')

# Stage 2: one-hot encode the category index (dropLast=False keeps every category slot).
encoder = OneHotEncoderEstimator(
    dropLast=False,
    inputCols=["alchemy_category_Index"],
    outputCols=["alchemy_category_IndexVec"])

# Stage 3: assemble the feature vector from the encoded category plus every
# column from index 4 up to, but excluding, the last one
# (presumably the last column is the label -- confirm against the data schema).
assemblerInputs = ['alchemy_category_IndexVec'] + row_df.columns[4:-1]
assembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol="features")

# Fixed seed so the trained forest is repeatable between runs.
rf = RandomForestClassifier(labelCol="label", featuresCol="features",
                            numTrees=15, seed=42)

# Parameter grid: 2 (impurity) * 3 (maxDepth) * 3 (maxBins) = 18 candidate models.
paramGrid = ParamGridBuilder()\
    .addGrid(rf.impurity, ["gini", "entropy"])\
    .addGrid(rf.maxDepth, [5, 10, 15])\
    .addGrid(rf.maxBins, [10, 15, 20])\
    .build()

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC")

# numFolds=3 -> in each fold, two-thirds of the training set is used for
# fitting and the remaining one-third for validation.
# Each of the 18 candidates is validated three times because numFolds=3,
# so cross-validation performs 18 * 3 = 54 fits in total.
# Fixed seed so the fold assignment is repeatable between runs.
cv = CrossValidator(estimator=rf, evaluator=evaluator,
                    estimatorParamMaps=paramGrid, numFolds=3, seed=42)

# Stage 4 is the cross-validated random forest.
cvrf_pipeline = Pipeline(stages=[category_indexer, encoder, assembler, cv])

# Create a pipeline model

In [10]:
# .fit -> train every stage of the pipeline on the training set.
# The pipeline object is named `cvrf_pipeline`; no `cv_pipeline` is defined in this notebook.
cvrf_pipelineModel = cvrf_pipeline.fit(train_df)

In [12]:
# Check what kind of object fitting the pipeline returned
type(cvrf_pipelineModel)

pyspark.ml.pipeline.PipelineModel

In [13]:
# List the stages of the pipeline definition (the estimator, not the fitted model).
# The last stage is the CrossValidator, which picks the best model during fit.
cvrf_pipeline.getStages()

[StringIndexer_4596a97e4b17e04ddab1,
 OneHotEncoderEstimator_45e5a8024363f84beb40,
 VectorAssembler_43d984074f85909f245c,
 CrossValidator_434da1a36fde8b6f72fd]

# Check the best model 

In [21]:
# The last fitted stage is the CrossValidatorModel wrapper, not the forest itself.
# Its `bestModel` attribute holds the RandomForestClassificationModel that
# cross-validation selected.
RFModel = cvrf_pipelineModel.stages[-1].bestModel

In [22]:
# Check the type of the extracted model object
type(RFModel)

pyspark.ml.classification.RandomForestClassificationModel

In [23]:
# Inspect the full parameter map (parameter -> value) of RFModel
RFModel.extractParamMap()

{Param(parent='RandomForestClassifier_482ba72f553de98eb810', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.'): False,
 Param(parent='RandomForestClassifier_482ba72f553de98eb810', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext'): 10,
 Param(parent='RandomForestClassifier_482ba72f553de98eb810', name='featureSubsetStrategy', doc='The number of features to consider for splits at each tree node. Supported options: auto, all, onethird, sqrt, log2, (0.0-1.0], [1-n].'): 'auto',
 Param(parent='RandomForestClassifier_482ba72f553de98eb810', name='featuresCol', doc='features column name'): 'features',
 Param(parent='Random

# Evaluate the model with AUC

In [24]:
# Run the fitted pipeline on the held-out test set; transform() outputs a new
# DataFrame with the predicted labels appended as columns
predictions = cvrf_pipelineModel.transform(test_df)
# Score the predictions with the evaluator defined earlier (metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
# Display the resulting AUC value
auc

0.7570790188377101

# Save and load the model

In [30]:
# Persist the fitted pipeline model to HDFS.
# write().overwrite() replaces any model already stored at this path, so the
# cell can be re-run without failing because the path exists.
cvrf_pipelineModel.write().overwrite().save("hdfs://mycluster/user/oracle/rf/rf_model")

In [29]:
# Confirm the type of the object that was saved
type(cvrf_pipelineModel)

pyspark.ml.pipeline.PipelineModel

In [32]:
from pyspark.ml import PipelineModel
# Load the pipeline model back from the same HDFS path it was saved to
reloaded_cv_model= PipelineModel.load("hdfs://mycluster/user/oracle/rf/rf_model")

In [33]:
# Display the reloaded model object
reloaded_cv_model

PipelineModel_4635a3d49d3e6b4546c9