In [42]:
import os
os.environ['SPARK_HOME']="/usr/hdp/current/spark2-client"

In [43]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder \
        .master("yarn") \
        .appName("ssds2_kb") \
        .getOrCreate()
sc = spark.sparkContext
sc

In [44]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType

colNames = ["Elevation", "Aspect", "Slope",
"Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
"Horizontal_Distance_To_Roadways",
"Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm",
"Horizontal_Distance_To_Fire_Points"]
for i in range(4):
    colNames += ["Wilderness_Area_"+str(i),]
for i in range(40):
    colNames += ["Soil_Type_"+str(i),]
colNames += ["Cover_Type",]

In [45]:
# Create schema
schema = StructType()
for name in colNames:
    if name == "Cover_Type":
        schema.add(StructField(name, DoubleType(), True))
    else:
        schema.add(StructField(name, IntegerType(), True))

# Load CSV with schema
data = spark.read.csv("covtype.data", header=False, schema=schema)

In [46]:
# train/test data split
(trainData, testData) = data.randomSplit([0.9, 0.1])
# remove target label
inputCols = trainData.drop('Cover_Type').columns

In [47]:
# load library
from pyspark.ml.feature import VectorAssembler

# asemble features to vector
assembler = VectorAssembler(
    inputCols=inputCols,
    outputCol="featureVector")
assembledTrainData = assembler.transform(trainData)
assembledTestData = assembler.transform(testData)

## Logistic Regression

In [48]:
%%time
# import library
from pyspark.ml.classification import LogisticRegression

# create logistic regression model
logi = LogisticRegression(labelCol="Cover_Type", featuresCol="featureVector", family="multinomial", predictionCol="prediction")

# train model with train data
model = logi.fit(assembledTrainData)

CPU times: user 94.4 ms, sys: 23.6 ms, total: 118 ms
Wall time: 51 s


In [49]:
%%time
# load library
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# create evaluator
evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction")
# transform test data using the fitted model
predictions = model.transform(assembledTestData)

print(evaluator.setMetricName("accuracy").evaluate(predictions))
print(evaluator.setMetricName("f1").evaluate(predictions))

0.7124149220298096
0.7000750643021649
CPU times: user 55.4 ms, sys: 11.6 ms, total: 67 ms
Wall time: 24.2 s


## Decision Tree

In [50]:
%%time
# load library
from pyspark.ml.classification import DecisionTreeClassifier

#create model
classifier = DecisionTreeClassifier(labelCol="Cover_Type", featuresCol="featureVector", predictionCol="prediction")

# train model with train data
model = classifier.fit(assembledTrainData)

CPU times: user 68.5 ms, sys: 23.3 ms, total: 91.8 ms
Wall time: 29.6 s


In [51]:
%%time
# transform test data using the fitted model
predictions = model.transform(assembledTestData)

print(evaluator.setMetricName("accuracy").evaluate(predictions))
print(evaluator.setMetricName("f1").evaluate(predictions))

0.7003015421728267
0.6828278611040922
CPU times: user 56.4 ms, sys: 13 ms, total: 69.4 ms
Wall time: 22.6 s


## RandomForest

In [52]:
%%time
# load library
from pyspark.ml.classification import RandomForestClassifier

#create model
rf = RandomForestClassifier(labelCol="Cover_Type", featuresCol="featureVector", predictionCol="prediction")

# train model with train data
model = rf.fit(assembledTrainData)

CPU times: user 80.6 ms, sys: 24.8 ms, total: 105 ms
Wall time: 35.1 s


In [53]:
%%time
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="Cover_Type", predictionCol="prediction")

# transform test data using the fitted model
predictions = model.transform(assembledTestData)

print(evaluator.setMetricName("accuracy").evaluate(predictions))
print(evaluator.setMetricName("f1").evaluate(predictions))

0.6722667355905919
0.6415333743553542
CPU times: user 68.6 ms, sys: 18.8 ms, total: 87.4 ms
Wall time: 26.3 s


## SVM

In [54]:
assembledTrainDataBinary = assembledTrainData.where("Cover_Type = 1 or Cover_Type = 2")
assembledTestDataBinary = assembledTestData.where("Cover_Type = 1 or Cover_Type = 2")

In [55]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf

def convertTo01(value):
    if value == 1.0: label=0.0
    else: label=1.0
    return float(label)

convertTo01_udf = udf(convertTo01, DoubleType())
assembledTrainDataBinary = assembledTrainDataBinary.withColumn("label", convertTo01_udf(assembledTrainDataBinary["Cover_Type"]))
assembledTestDataBinary = assembledTestDataBinary.withColumn("label", convertTo01_udf(assembledTestDataBinary["Cover_Type"]))

In [56]:
%%time
# load library
from pyspark.ml.classification import LinearSVC
#create model
svm = LinearSVC(labelCol="label", featuresCol="featureVector")
# train model with train data
model = svm.fit(assembledTrainDataBinary)

CPU times: user 2.62 s, sys: 979 ms, total: 3.6 s
Wall time: 30min 54s


In [57]:
model.coefficients

DenseVector([-0.0005, -0.0, 0.0105, 0.0001, 0.0003, -0.0, 0.0035, 0.0047, 0.0013, 0.0, 0.0766, 0.0245, 0.047, 0.3767, 0.0, 0.5592, 0.4037, 0.1198, 0.0, 0.1807, 0.2886, 0.1669, 0.0477, 0.2457, 0.1947, 0.1894, 0.1652, 0.0, 0.0, 0.1517, 0.1368, 0.0321, -1.7001, 0.0874, -1.8943, -1.7477, -1.7179, -1.5856, 0.302, 0.1809, -1.8032, 0.3283, 0.1522, 0.126, 0.0931, 0.1258, 0.0909, 0.1855, -1.8385, 0.1485, 0.0, -1.7567, -1.859, -1.7692])

In [58]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# transform test data using the fitted model
predictions = model.transform(assembledTestDataBinary)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

print(evaluator.setMetricName("accuracy").evaluate(predictions))
print(evaluator.setMetricName("f1").evaluate(predictions))

0.6906042711535351
0.6767494856185956


In [59]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
predictions = model.transform(assembledTestDataBinary)
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="prediction")
print(evaluator.evaluate(predictions)) # AUROC

0.6632417690693648
