In [None]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
!java -version
!pip install pyspark


In [86]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier, DecisionTreeClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Read file
spark = SparkSession.builder.getOrCreate()
leafDF = spark.read.option("inferSchema","true").csv("leaf.csv")
leafDF = leafDF.withColumnRenamed("_c0","class")

# Make index for label
indexer = StringIndexer(inputCol="class",outputCol="label", stringOrderType="alphabetDesc")
leafDF = indexer.fit(leafDF).transform(leafDF)

# Create features column from columns with VectorAssembler
vec = VectorAssembler(inputCols=leafDF.columns[2:-1],outputCol="features")
leafDF = vec.transform(leafDF)

leafDF = leafDF.select("features","label")
leafDF.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                                    |label|
+----------------------------------------------------------------------------------------------------------------------------+-----+
|[0.72694,1.4742,0.32396,0.98535,1.0,0.83592,0.0046566,0.0039465,0.04779,0.12795,0.016108,0.0052323,2.7477E-4,1.1756]        |29.0 |
|[0.74173,1.5257,0.36116,0.98152,0.99825,0.79867,0.0052423,0.0050016,0.02416,0.090476,0.0081195,0.002708,7.4846E-5,0.69659]  |29.0 |
|[0.76722,1.5725,0.38998,0.97755,1.0,0.80812,0.0074573,0.010121,0.011897,0.057445,0.0032891,9.2068E-4,3.7886E-5,0.44348]     |29.0 |
|[0.73797,1.4597,0.35376,0.97566,1.0,0.81697,0.0068768,0.0086068,0.01595,0.065491,0.0042707,0.0011544,6.6272E-5,0.58785]     |29.0 |
|[0.82301,1.7707,0.44462,0.97698,1.0,0.75493,0.007428,0.010042,0.0079

In [87]:
# train test split
trainDF,testDF= leafDF.randomSplit([0.8,0.2])

# using random classifier
rf = RandomForestClassifier()
eva = MulticlassClassificationEvaluator(metricName="accuracy")

myParams = ParamGridBuilder().addGrid(rf.maxDepth,[10,15,20])\
                                .addGrid(rf.numTrees,[5,10,20])\
                                .addGrid(rf.impurity,["gini","entropy"])\
                                .build()

validator = CrossValidator(estimator=rf,
                                 evaluator=eva,
                                 numFolds=5,
                                 parallelism=8,
                                 estimatorParamMaps=myParams
                                )

# Create Model
modelRF = validator.fit(trainDF)
print("Best Parameters")

# Best Parameters for Random Forest Classifier 
print("Max Depth : ", modelRF.bestModel.getMaxDepth())
print("Num Trees : ", modelRF.bestModel.getNumTrees)
print("Impurity  : ", modelRF.bestModel.getImpurity())

# Test Accuracy
resultDF = modelRF.transform(testDF)

# Success Rate 
eva = MulticlassClassificationEvaluator(metricName="accuracy")
successRate = eva.evaluate(resultDF)
print("Success Rate : ",successRate)

Best Parameters
Max Depth :  15
Num Trees :  20
Impurity  :  gini
Success Rate :  0.7160493827160493


In [88]:
leafDF = leafDF.select("features","label")

# train test split
trainDF,testDF= leafDF.randomSplit([0.8,0.2])

# Using Decision Tree
dt = DecisionTreeClassifier()
eva = MulticlassClassificationEvaluator(metricName="accuracy")

myParams = ParamGridBuilder().addGrid(dt.maxDepth,[5,15,20])\
                                .addGrid(dt.maxBins,[32,64])\
                                .addGrid(dt.impurity,["gini","entropy"])\
                                .build()

validator = CrossValidator(estimator=dt,
                                 evaluator=eva,
                                 numFolds=5,
                                 parallelism=8,
                                 estimatorParamMaps=myParams
                                )

# Create Model
modelDT = validator.fit(trainDF)
print("Best Parameters for Decision Tree")

# Best Parameters for Decision Tree Classifier
print("Max Depth : ", modelDT.bestModel.getMaxDepth())
print("Max Bins : ", modelDT.bestModel.getMaxBins())
print("Impurity  : ", modelDT.bestModel.getImpurity())


Best Parameters for Decision Tree
Max Depth :  15
Max Bins :  64
Impurity  :  entropy


In [89]:
# Test Accuracy
resultDF = modelDT.transform(testDF)

# Success Rate 
eva = MulticlassClassificationEvaluator(metricName="accuracy")
successRate = eva.evaluate(resultDF)
print("Success Rate : ",successRate)

Success Rate :  0.5588235294117647


In [90]:
leafDF = leafDF.select("features","label")

# train test split
trainDF,testDF= leafDF.randomSplit([0.8,0.2])


# Using Logistic Regression
lr = LogisticRegression()
eva = MulticlassClassificationEvaluator(metricName="accuracy")

myParams = ParamGridBuilder().addGrid(lr.regParam,[0.01,0.1,1.0])\
                                .addGrid(lr.elasticNetParam,[0.0, 0.5, 1.0])\
                                .addGrid(lr.maxIter,[10, 100, 1000])\
                                .build()

validator = CrossValidator(estimator=lr,
                                 evaluator=eva,
                                 numFolds=5,
                                 parallelism=8,
                                 estimatorParamMaps=myParams
                                )

# Create Model
modelLR = validator.fit(trainDF)

print("Best Parameters for Logistic Regression")

# Best Parameters for Logistic Regression
print("Reg Param : ", modelLR.bestModel.getRegParam())
print("Elastic Net Param : ", modelLR.bestModel.getElasticNetParam())
print("Max Iter : ", modelLR.bestModel.getMaxIter())

Best Parameters for Logistic Regression
Reg Param :  0.01
Elastic Net Param :  0.0
Max Iter :  100


In [92]:
# Test Accuracy 
resultDF = modelLR.transform(testDF)

# Success Rate 
eva = MulticlassClassificationEvaluator(metricName="accuracy")
successRate = eva.evaluate(resultDF)
print("Success Rate : ",successRate)

Success Rate :  0.7142857142857143
