# **1. Install spark**

In [389]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop2.7.tgz
!tar xf spark-3.0.3-bin-hadoop2.7.tgz
!pip install -q findspark

In [390]:
import os

# Tell Spark where the JDK and the unpacked Spark distribution live.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.0.3-bin-hadoop2.7",
    }
)

In [391]:
import findspark

# findspark.init() is called before the pyspark import on purpose: it is what
# makes the pyspark package of the distribution under SPARK_HOME importable.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session ("local[*]" = use all available cores); getOrCreate()
# returns the already-running session when the cell is executed again.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# **2. Load dataset**

In [392]:
'''
import the pipeline, feature indexer and evaluation classes
'''
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [393]:
# Load the data set (LIBSVM format) into a DataFrame.
# Upload the data file to Google Drive first, then adjust the path below so
# that it points at the uploaded file.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
data = (
    spark.read
    .format("libsvm")
    .load("/content/drive/MyDrive/Colab Notebooks/sample_multiclass_classification_data.txt")
)

In [None]:
# Print the first feature vector in full (truncate=False disables column truncation).
data.select("features").show(n=1, truncate=False)

In [None]:
# Column names and their types, displayed as the cell output.
data.dtypes

In [396]:
# Label indexer: maps the "label" column to an ML column of label indices
# ("indexedLabel"). It is fitted on the complete data set.
labelIndexer = StringIndexer(
    inputCol="label",
    outputCol="indexedLabel",
).fit(data)

In [397]:
# Feature indexer: indexes the categorical features inside the "features"
# vector column and writes the result to "indexedFeatures".
# maxCategories=4: a feature with more than 4 distinct values is treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

In [398]:
# Split the data set into training (70 %) and testing (30 %) subsets.
# The seed is fixed so that the split — and therefore every metric reported
# further down — is reproducible; without it randomSplit picks a fresh random
# seed on each run and the answers to the questions change between runs.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

# **2. Decision Tree**
Run the code below and answer question 1.

reference:

model:
https://spark.apache.org/docs/latest/mllib-decision-tree.html

evaluation:
https://spark.apache.org/docs/latest/mllib-evaluation-metrics.html#multiclass-classification

## **Model**

In [399]:
from pyspark.ml.classification import DecisionTreeClassifier

In [436]:
# Decision tree of depth at most 2, trained on the indexed feature/label columns.
dt = DecisionTreeClassifier(
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
    maxDepth=2,
)

In [437]:
# Chain the two fitted indexers with the decision tree estimator.
dt_pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        dt,
    ]
)

In [440]:
# Fit the whole pipeline on the training split only.
dt_model = dt_pipeline.fit(trainingData)

In [441]:
# Apply the fitted pipeline to the held-out test split.
dt_predictions = dt_model.transform(testData)

In [None]:
# The classifier is the last of the three pipeline stages.
print(dt_model.stages[-1])

In [None]:
# Preview the first five rows of the prediction DataFrame.
dt_predictions.show(n=5)

## **Model Evaluation**
Complete the code for the F1 score and recall parts, run it, and then answer question 1.

Accuracy

In [None]:
# Accuracy of the decision tree predictions, compared against "indexedLabel".
acc_evaluator_dt = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
acc_dt = acc_evaluator_dt.evaluate(dt_predictions)
print(f"accurancy:{acc_dt}")

Precision

In [None]:
# Precision of the decision tree predictions over ALL classes.
# "weightedPrecision" is the per-class precision averaged with class-frequency
# weights, so it is comparable with the accuracy reported above.
# "precisionByLabel" would instead report a single class only — the one given
# by metricLabel, which defaults to 0.0 — and is not the model's overall
# precision. To inspect one class explicitly use:
#   metricName="precisionByLabel", metricLabel=<indexed label>
pr_evaluator_dt = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="weightedPrecision",
)
precision_dt = pr_evaluator_dt.evaluate(dt_predictions)
print("precision:"+str(precision_dt))

F1_score

Recall

# **3. Random forest**
Run the code below and answer question 2.

reference:

model:
https://spark.apache.org/docs/latest/mllib-ensembles.html#random-forests 

evaluation:
https://spark.apache.org/docs/latest/mllib-evaluation-metrics.html#multiclass-classification

## **Model**

In [408]:
from pyspark.ml.classification import RandomForestClassifier

In [409]:
# Random forest of 3 trees, trained on the indexed feature/label columns.
# The seed is set explicitly: the forest's training is randomised, and without
# an explicit seed the fitted model (and its metrics) can differ from one
# kernel session to the next.
rf = RandomForestClassifier(
    numTrees=3,
    featuresCol="indexedFeatures",
    labelCol="indexedLabel",
    seed=42,
)

In [410]:
# Chain the two fitted indexers with the random forest estimator.
rf_pipeline = Pipeline(
    stages=[
        labelIndexer,
        featureIndexer,
        rf,
    ]
)

In [411]:
# Fit the whole pipeline on the training split only.
rf_model = rf_pipeline.fit(trainingData)

In [412]:
# Apply the fitted pipeline to the held-out test split.
rf_predictions = rf_model.transform(testData)

In [None]:
# The classifier is the last of the three pipeline stages.
print(rf_model.stages[-1])

In [None]:
# Preview the first five rows of the prediction DataFrame.
rf_predictions.show(n=5)

## **Model Evaluation**
Complete the code for the precision and recall parts, run it, and then answer question 2.

Accuracy

In [None]:
# Accuracy of the random forest predictions, compared against "indexedLabel".
acc_evaluator_rf = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
acc_rf = acc_evaluator_rf.evaluate(rf_predictions)
print(f"accurancy:{acc_rf}")

F1_score

In [None]:
# F1 score of the random forest predictions (evaluator metric "f1").
f_evaluator_rf = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
f1_score_rf = f_evaluator_rf.evaluate(rf_predictions)
print(f"f1 score:{f1_score_rf}")

Precision

Recall