# Analyze Forest Coverage

### Only for jupyter notebook (remove this cell in databrics)

In [1]:
import findspark
findspark.init()

import pyspark
sc = pyspark.SparkContext()

In [3]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LogisticRegression

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

#### Load the data into an RDD

In [11]:
#read txt file, gzipped files will be auto-unzipped
myDataRDD = sc.textFile("covtype.data.gz")
myReadmeRDD = sc.textFile("covtype.info")

print (myDataRDD.count())
myDataRDD.take(5)

581012


['2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5',
 '2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5',
 '2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2',
 '2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2',
 '2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5']

#### Readme File

In [11]:
for l in myReadmeRDD.collect():
  print l
  

SyntaxError: Missing parentheses in call to 'print' (<ipython-input-11-cc82b96e3a83>, line 2)

In [9]:
myCleanData = myDataRDD.map(lambda l: [long(i) for i in l.replace(" ","").split(',')])
myCleanData.count()

In [10]:
# Lets create the schema
QuantFields = [ "Elevation", "Aspect" , "Slope", "HD_To_Hydrlgy", "VD_To_Hydrlgy", "HD_To_Rdwys", "Hllshd_9am", "Hllshd_Noon", "Hllshd_3pm", "HD_To_Fire_Pnts"]
WildernessArea = ["WA"+str(i) for i in range(1,5)]
SoilType = ["S"+str(i) for i in range(1,41)]
schema = QuantFields + WildernessArea + SoilType + ["Cover_Type"]
len(schema)

In [11]:
#create the dataframe, and re-label columns
myDF = myCleanData.toDF()
oldCol = myDF.schema.names

for i in range(0,len(schema)):
  myDF = myDF.withColumnRenamed(oldCol[i],schema[i]) 
  
myDF.take(1)

In [12]:
# Quantitative Fields
quantSmry = myDF.select(QuantFields).describe()
quantSmry.show()

In [13]:
# Wilderness Area

display(myDF.groupBy(WildernessArea).count().orderBy("count", ascending = False))

In [14]:
# SoilType

soilGB = myDF.groupBy(SoilType)

display(soilGB.count().orderBy("count", ascending = False))

In [15]:
# Assemble the data we want to train
assembler =  VectorAssembler(inputCols = schema[:-1], outputCol = "features")
data = assembler.transform(myDF).select(["features", schema[-1]])
data.show(3)

In [16]:
(trainingData, testData) = data.randomSplit([0.7,0.3])
trainingData.count()

In [17]:
rf = RandomForestClassifier()\
          .setFeaturesCol("features")\
          .setLabelCol("Cover_Type")\
          .setMaxBins(50)\
          .setMaxDepth(10)\
          .setNumTrees(20)
          
model = rf.fit(trainingData)

In [18]:
def getAccuracy(x):
   prediction = x.transform(testData)
   accuracy = MulticlassClassificationEvaluator(labelCol="Cover_Type").evaluate(prediction)
   return accuracy
 

rfAccuracy = getAccuracy(model) * 100
print "Accuracy = %.2f%%" % rfAccuracy

In [19]:
dtc =  DecisionTreeClassifier(probabilityCol="probability", maxDepth=10, maxBins=50) \
          .setFeaturesCol("features")\
          .setLabelCol("Cover_Type")

In [20]:
dtcModel = dtc.fit(trainingData)

In [21]:
dtcAccuracy = getAccuracy(dtcModel) * 100
print "Accuracy = %.2f%%" % dtcAccuracy

In [22]:
log = LogisticRegression(maxIter=10)\
                        .setFeaturesCol("features")\
                        .setLabelCol("Cover_Type")

logModel = log.fit(trainingData)
logAccuracy = getAccuracy(logModel) * 100
print "Accuracy = %.2f%%" % logAccuracy


In [23]:

print "+----------Final Comparision of Different Classifiers ------------"
print "Random Forest Classifier Accuracy = %.2f%%" % rfAccuracy
print "Decision Tree Classifier Accuracy = %.2f%%" % dtcAccuracy
print "Logistic Regression Accuracy      = %.2f%%" % logAccuracy
print "+-----------------------------------------------------------------"