In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
# Load the training set from CSV via the spark-csv data source.
# The options request that the first row be used as the header and that
# column types be inferred from the data.
# NOTE(review): `sqlContext` is not defined in this notebook; presumably it is
# injected by the notebook runtime -- confirm.
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('/FileStore/tables/giafz0vd1480560564317/trainBD.csv')
)


In [2]:
# Fit the label indexer on the full DataFrame. It reads "Cover_Type" and
# writes the indexed label to "indexedLabel".
labelIndexer = StringIndexer(
    inputCol="Cover_Type",
    outputCol="indexedLabel",
).fit(df)

# Combine the predictor columns into a single vector column, "features",
# which is the column the classifiers below are configured to read.
assembler = VectorAssembler(
    inputCols=[
        "Elevation",
        "Slope",
        "Aspect",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
        "Soil",
        "Wilderness",
    ],
    outputCol="features",
)

In [3]:
# Fixed seed so that the train/test split and the forest's internal random
# sampling are repeatable from run to run (given the same input data).
SEED = 42

# Hold out 20% of the rows for testing; the remaining 80% are used for training.
(trainingData, testData) = df.randomSplit([0.8, 0.2], seed=SEED)

# Random forest reading the indexed label and the assembled feature vector.
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", seed=SEED)

# Chain label indexing, feature assembly and the classifier in one Pipeline.
pipeline = Pipeline(stages=[labelIndexer, assembler, rf])

# Train the model on the training split.
model = pipeline.fit(trainingData)

# Make predictions on the held-out split.
predictions = model.transform(testData)

# Evaluate the model.
# NOTE(review): the "precision" metric is printed as accuracy below. Newer
# Spark releases name this metric "accuracy" instead -- confirm against the
# cluster's Spark version before changing metricName.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % (accuracy))

# Second evaluator reporting the F1 score on the same predictions.
evaluator2 = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
f1s = evaluator2.evaluate(predictions)
print("F1 score = %g " % (f1s))

# Author's recorded result from an earlier, unseeded run: ~69%
# (which of the two metrics this refers to is not stated).

In [10]:
from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree reading the indexed label and the assembled feature vector.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features")

# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, assembler, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Evaluate the predictions on the held-out split.
# NOTE(review): the "precision" metric is printed as accuracy below. Newer
# Spark releases name this metric "accuracy" instead -- confirm against the
# cluster's Spark version before changing metricName.
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g " % (accuracy))

# Second evaluator reporting the F1 score on the same predictions.
evaluator2 = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="f1")
dt_f1 = evaluator2.evaluate(predictions)
# The value is an F1 score, not recall. The old name is kept only as an alias
# in case other cells still read `recall`.
recall = dt_f1
print("F1 score = %g " % (dt_f1))

# Author's recorded result: ~66% (which of the two metrics is not stated).