# Start a SparkSession

In [1]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this notebook (application name 'DTC').
# getOrCreate() hands back an already-running session instead of starting a new one.
spark = (
    SparkSession.builder
    .appName('DTC')
    .getOrCreate()
)

# RDD & DataFrame

In [None]:
# Alternative loading path: let the DataFrame reader parse the file directly.
# covtype.data is comma-separated and has no header row (the RDD path below
# splits on "," and counts every line as a record), so the reader must not
# consume the first record as column names and must split on commas.
row_df = spark.read.format("csv")\
         .option("header", "false")\
         .option("delimiter", ",")\
         .load("hdfs://mycluster/user/oracle/dtc/dtc_data/covtype.data")

In [None]:
# Reading through spark.read.format(...).load(...) yields a DataFrame directly,
# with no intermediate RDD step; display the type of the result to confirm.
type(row_df)

In [2]:
# Load the file as an RDD of text lines. The SparkContext is taken from the
# SparkSession created above rather than from a kernel-injected `sc`, which is
# not defined anywhere in this notebook and is absent on a plain Python kernel.
rawData = spark.sparkContext.textFile("hdfs://mycluster/user/oracle/dtc/dtc_data/covtype.data")

In [3]:
# textFile() returns an RDD whose elements are the lines of the file;
# display the type of the result to confirm.
type(rawData)

pyspark.rdd.RDD

# Transform RDD into dataframe

In [4]:
# Turn each raw text line into a list of string fields by splitting on commas.
lines = rawData.map(lambda record: record.split(","))

# Total number of records (581012 in the recorded run).
record_count = lines.count()
print(record_count)

# Number of fields per record, measured on the first record (55 in the recorded run).
fieldnum = len(lines.first())
print(fieldnum)

581012
55


In [5]:
from pyspark.sql.types import StringType, StructField, StructType
# Build one StructField per column, named f0 .. f<fieldnum-1>.
# StructField(column_name, column_type, nullable): the third argument True
# marks the column as nullable, i.e. it MAY contain nulls.
# Every column is declared as StringType here.
# StructType wraps the list of fields into a schema object.
schema = StructType(
                    [StructField("f"+str(i), StringType(), True)
                     for i in range(fieldnum)]
                    )

In [None]:
# Display the StructType built above to check the generated field names and types.
schema

In [6]:
# Attach the all-string schema to the RDD of split records to get a DataFrame.
df = spark.createDataFrame(data=lines, schema=schema)

In [None]:
# Print the DataFrame's column names and types as built from `schema`.
df.printSchema()

In [7]:
from pyspark.sql.functions import col

# Re-project every column as a double while keeping its original name,
# replacing the StringType columns with DoubleType ones.
df = df.select(*(col(name).cast("double").alias(name) for name in df.columns))

In [8]:
# Print the schema again to confirm every column is now of type double.
df.printSchema()

root
 |-- f0: double (nullable = true)
 |-- f1: double (nullable = true)
 |-- f2: double (nullable = true)
 |-- f3: double (nullable = true)
 |-- f4: double (nullable = true)
 |-- f5: double (nullable = true)
 |-- f6: double (nullable = true)
 |-- f7: double (nullable = true)
 |-- f8: double (nullable = true)
 |-- f9: double (nullable = true)
 |-- f10: double (nullable = true)
 |-- f11: double (nullable = true)
 |-- f12: double (nullable = true)
 |-- f13: double (nullable = true)
 |-- f14: double (nullable = true)
 |-- f15: double (nullable = true)
 |-- f16: double (nullable = true)
 |-- f17: double (nullable = true)
 |-- f18: double (nullable = true)
 |-- f19: double (nullable = true)
 |-- f20: double (nullable = true)
 |-- f21: double (nullable = true)
 |-- f22: double (nullable = true)
 |-- f23: double (nullable = true)
 |-- f24: double (nullable = true)
 |-- f25: double (nullable = true)
 |-- f26: double (nullable = true)
 |-- f27: double (nullable = true)
 |-- f28: double (nullabl

In [9]:
# Show the first row only, to sanity-check the parsed values.
df.show(1)

+------+----+---+-----+---+-----+-----+-----+-----+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|    f0|  f1| f2|   f3| f4|   f5|   f6|   f7|   f8|    f9|f10|f11|f12|f13|f14|f15|f16|f17|f18|f19|f20|f21|f22|f23|f24|f25|f26|f27|f28|f29|f30|f31|f32|f33|f34|f35|f36|f37|f38|f39|f40|f41|f42|f43|f44|f45|f46|f47|f48|f49|f50|f51|f52|f53|f54|
+------+----+---+-----+---+-----+-----+-----+-----+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|2596.0|51.0|3.0|258.0|0.0|510.0|221.0|232.0|148.0|6279.0|1.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|1.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|5.0|
+------+----+---+-----+---+-----+-----+-----

In [10]:
# Use the first 54 columns (f0 .. f53) as the model's input features;
# the remaining column, f54, is the one turned into the label below.
featuresCol=df.columns[:54]

In [11]:
# Derive the "label" column from f54, shifted down by one so the class ids
# fall in the 0-6 range, then remove the original f54 column.
label_column = df["f54"] - 1
df = df.withColumn("label", label_column).drop("f54")

In [None]:
# Show the first row to confirm that "label" replaced f54.
df.show(1)

# Split data into training set & test set

In [12]:
# Randomly split the data 70% / 30% into training and test sets.
# The fixed seed makes the split (and every metric computed from it)
# repeatable across runs.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [13]:
# Row counts of the training and test sets, to check the split proportions.
train_df.count(), test_df.count()

(406288, 174724)

# Build a pipeline

In [14]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Combine the feature columns into the single vector column expected by the classifier.
assembler = VectorAssembler(
            inputCols=featuresCol,
            outputCol="features")

dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")

# Hyper-parameter grid: 2 (impurity) * 3 (depth) * 3 (bins) = 18 candidate models.
paramGrid = ParamGridBuilder()\
    .addGrid(dt.impurity, ["gini","entropy"])\
    .addGrid(dt.maxDepth, [10, 15, 25])\
    .addGrid(dt.maxBins, [30,40,50])\
    .build()

# The label has seven classes (0-6), so model selection must use a multiclass
# metric. A binary evaluator (areaUnderROC) is only defined for two-class
# labels and would rank the candidates on a meaningless score.
evaluator = MulticlassClassificationEvaluator(
                predictionCol="prediction",
                labelCol="label",
                metricName="accuracy")

# numFolds=3 -> in each round two thirds of the training set are used for
# fitting and the remaining third for validation.
# Each candidate is validated three times because numFolds=3,
# so the total number of fits is 18*3=54.
# The seed fixes the fold assignment so the selected model is repeatable.
cv=CrossValidator(estimator=dt, evaluator=evaluator,
                  estimatorParamMaps=paramGrid, numFolds=3, seed=42)

# Stage 0 assembles the feature vector, stage 1 runs the cross-validated search.
cv_pipeline = Pipeline(stages=[assembler, cv])

In [15]:
# List the pipeline's stages to confirm their order.
cv_pipeline.getStages()

[VectorAssembler_43079bb5ca450aca7aaa, CrossValidator_40c4bb881d1897f77d01]

# Build a pipeline model

In [None]:
# Fit the whole pipeline on the training set. This runs every stage in order,
# including the cross-validated parameter search, so it can take a long time.
cv_pipelineModel=cv_pipeline.fit(train_df)

In [None]:
# Stage index 1 is the fitted cross-validation stage (index 0 is the assembler);
# bestModel is the model it selected from the parameter grid.
cv_pipelineModel.stages[1].bestModel

# Make prediction

In [None]:
# Apply the fitted pipeline to the held-out test set to obtain predictions.
predictions=cv_pipelineModel.transform(test_df)


In [None]:
# Fetch the first 10 rows of the model output columns alongside the true label.
predictions.select( 'rawPrediction','probability', 'label', 'prediction').take(10)

In [None]:
# Give the first six raw columns readable names for display.
# The names follow the attribute order of the Covertype dataset:
# elevation, aspect, slope, horizontal distance to hydrology,
# vertical distance to hydrology, horizontal distance to roadways.
# (f0 is an elevation, not a latitude; f3/f4 are horizontal/vertical in that
# order; f5 is a road distance, the hillshade columns only start at f6.)
result = predictions.withColumnRenamed("f0", "elevation")\
                    .withColumnRenamed("f1", "aspect")\
                    .withColumnRenamed("f2", "slope")\
                    .withColumnRenamed("f3", "horizontal_distance_to_hydrology")\
                    .withColumnRenamed("f4", "vertical_distance_to_hydrology")\
                    .withColumnRenamed("f5", "horizontal_distance_to_roadways")
result.select("elevation", "aspect", "slope",
              "horizontal_distance_to_hydrology",
              "vertical_distance_to_hydrology",
              "horizontal_distance_to_roadways",
              "label", "prediction").show(10)

In [None]:
# Display the type of the object returned by transform().
type(predictions)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compute test-set accuracy with a dedicated multiclass evaluator so the value
# stored in `accuracy` really is the fraction of correctly classified rows.
# The label has seven classes (0-6), for which a binary areaUnderROC metric
# is not meaningful.
accuracy = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
).evaluate(predictions)
accuracy

# Save and load pipeline model

In [None]:
# Persist the fitted pipeline model. Going through write().overwrite() lets
# the notebook be re-run: a plain save() raises if the target path already exists.
cv_pipelineModel.write().overwrite().save("hdfs://mycluster/user/oracle/dtc/dtc_model")

In [None]:
from pyspark.ml import Pipeline, PipelineModel

# The object saved above is a fitted PipelineModel, so it has to be loaded
# with PipelineModel.load; Pipeline.load is for unfitted Pipeline definitions.
reloaded_cv_model= PipelineModel.load("hdfs://mycluster/user/oracle/dtc/dtc_model")

In [None]:
# Display the reloaded object to confirm the load succeeded.
reloaded_cv_model