# Set up the Spark environment

In [1]:
import time
import os

Start=time.time()
# Download and install tools 

# Install Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download and Install Spark
!wget  -q http://apache.osuosl.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz

# Install findspark
!pip install -q findspark

# Set environment variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.7-bin-hadoop2.7"
import findspark
findspark.init()

print(f"\nIt took {(time.time()-Start)} seconds to install all dependencies for spark to run on Google Colab. \n")



It took 39.47919535636902 seconds to install all dependencies for spark to run on Google Colab. 



# All files present in the data/mllib folder



- Spark ships with a good number of test data sets that can be used for all kinds of training and testing.

- This data can be explored by browsing to the installation path of Spark and checking out the folder marked `data`. 

In [2]:
from pathlib import Path
from IPython.display import HTML

# Root of the MLlib sample data inside the unpacked Spark distribution.
PATH = "/content/spark-2.4.7-bin-hadoop2.7/data/mllib"

# Every regular file below PATH, expressed relative to PATH, in sorted order.
files = sorted(
    str(entry).replace(PATH + "/", "")
    for entry in Path(PATH).glob("**/*")
    if entry.is_file()
)

# Files that live in a sub-folder: the folder part is wrapped in a small
# <font> tag and the bare file name follows it.
folders = [
    "<font color='rgba(0, 0, 0, 87)' size='1'>"
    + rel.rpartition("/")[0]
    + "/</font>"
    + rel.rpartition("/")[2]
    for rel in files
    if "/" in rel
]

# Nested files are listed first, then the files sitting directly in PATH.
top_level = [rel for rel in files if "/" not in rel]
files = folders + top_level

# One <li> per entry; the HTML object is the cell's last expression so that
# the notebook renders it.
list_items = "".join(f"<li>{item}</li>" for item in files)
HTML(
    "<font face='courier' size='2'>"
    "<strong>All files present in the data/mllib folder:</strong><br />"
    + list_items
    + "</font>"
)



# Classification


## Logistic regression classifier  

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Load the sample data set shipped with Spark (read with the "libsvm" data source).
PATH = "/content/spark-2.4.7-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt"
data = spark.read.format("libsvm").load(PATH)


# Split the data into training and test sets (30% held out for testing).
# The fixed seed keeps the split, and therefore the accuracies printed below,
# repeatable from one run to the next.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

logr = LogisticRegression()

# Fit the model
logrModel = logr.fit(trainingData)

# A single accuracy evaluator is reused for the training and the test split.
evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Predict on the training split, show a few rows and report training accuracy.
predictions_train = logrModel.transform(trainingData)
predictions_train.select("prediction", "label", "features").show(5)

acc = evaluator_acc.evaluate(predictions_train)
print(f"Accuracy [Training] = {100*acc}%")

# Predict on the held-out split, show a few rows and report test accuracy.
predictions = logrModel.transform(testData)
predictions.select("prediction", "label", "features").show(5)

acc = evaluator_acc.evaluate(predictions)
print(f"Accuracy [Testing] = {100*acc}%")


+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[123,124,125...|
+----------+-----+--------------------+
only showing top 5 rows

Accuracy [Training] = 100.0%
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[126,127,128...|
+----------+-----+--------------------+
only showing top 5 rows

Accuracy [Testing] = 100.0%


## Decision Tree classifier 

In [4]:

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Load the sample data set shipped with Spark (read with the "libsvm" data source).
PATH = "/content/spark-2.4.7-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt"
data = spark.read.format("libsvm").load(PATH)


# Split the data into training and test sets (30% held out for testing).
# The fixed seed keeps the split, and therefore the accuracies printed below,
# repeatable from one run to the next.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

DT = DecisionTreeClassifier()

# Fit the model
DTClass = DT.fit(trainingData)

# A single accuracy evaluator is reused for the training and the test split.
evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Predict on the training split, show a few rows and report training accuracy.
predictions_train = DTClass.transform(trainingData)
predictions_train.select("prediction", "label", "features").show(5)

acc = evaluator_acc.evaluate(predictions_train)
print(f"Accuracy [Training] = {100*acc}%")

# Predict on the held-out split, show a few rows and report test accuracy.
predictions = DTClass.transform(testData)
predictions.select("prediction", "label", "features").show(5)

acc = evaluator_acc.evaluate(predictions)
print(f"Accuracy [Testing] = {100*acc}%")


+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[123,124,125...|
+----------+-----+--------------------+
only showing top 5 rows

Accuracy [Training] = 100.0%
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows

Accuracy [Testing] = 100.0%


## Random Forest classifier 

In [5]:

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Load the sample data set shipped with Spark (read with the "libsvm" data source).
PATH = "/content/spark-2.4.7-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt"
data = spark.read.format("libsvm").load(PATH)


# Split the data into training and test sets (30% held out for testing).
# The fixed seed keeps the split, and therefore the accuracies printed below,
# repeatable from one run to the next.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

# A random forest is itself a randomized learner, so its seed is pinned as
# well; otherwise the fitted model could differ between sessions even with an
# identical split.
RF = RandomForestClassifier(seed=42)

# Fit the model
RFClass = RF.fit(trainingData)

# A single accuracy evaluator is reused for the training and the test split.
evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Predict on the training split, show a few rows and report training accuracy.
predictions_train = RFClass.transform(trainingData)
predictions_train.select("prediction", "label", "features").show(5)

acc = evaluator_acc.evaluate(predictions_train)
print(f"Accuracy [Training] = {100*acc}%")

# Predict on the held-out split, show a few rows and report test accuracy.
predictions = RFClass.transform(testData)
predictions.select("prediction", "label", "features").show(5)

acc = evaluator_acc.evaluate(predictions)
print(f"Accuracy [Testing] = {100*acc}%")



+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[123,124,125...|
+----------+-----+--------------------+
only showing top 5 rows

Accuracy [Training] = 100.0%
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows

Accuracy [Testing] = 100.0%


In [6]:
# Gradient-boosted tree (GBT) classifier

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Load the sample data set shipped with Spark (read with the "libsvm" data source).
PATH = "/content/spark-2.4.7-bin-hadoop2.7/data/mllib/sample_libsvm_data.txt"
data = spark.read.format("libsvm").load(PATH)


# Split the data into training and test sets (30% held out for testing).
# The fixed seed keeps the split, and therefore the accuracies printed below,
# repeatable from one run to the next.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

GB = GBTClassifier()

# Fit the model
GBClass = GB.fit(trainingData)

# A single accuracy evaluator is reused for the training and the test split.
evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Predict on the training split, show a few rows and report training accuracy.
predictions_train = GBClass.transform(trainingData)
predictions_train.select("prediction", "label", "features").show(5)

acc = evaluator_acc.evaluate(predictions_train)
print(f"Accuracy [Training] = {100*acc}%")

# Predict on the held-out split, show a few rows and report test accuracy.
predictions = GBClass.transform(testData)
predictions.select("prediction", "label", "features").show(5)

acc = evaluator_acc.evaluate(predictions)
print(f"Accuracy [Testing] = {100*acc}%")




+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[98,99,100,1...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[123,124,125...|
+----------+-----+--------------------+
only showing top 5 rows

Accuracy [Training] = 100.0%
+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[100,101,102...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[122,123,148...|
|       0.0|  0.0|(692,[124,125,126...|
|       0.0|  0.0|(692,[126,127,128...|
+----------+-----+--------------------+
only showing top 5 rows

Accuracy [Testing] = 96.96969696969697%
