In [None]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import col,max,count,sum,mean,stddev_pop,hour,countDistinct,expr,stddev,window,column
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import Row
import math

In [2]:
# sc.stop()

# Initialization of SparkConf which is required for Spark Context
conf = SparkConf().setAppName('myapp').setMaster('local')

# Initialization of SparkContext
sc = SparkContext().getOrCreate(conf=conf)
sc.setLogLevel("OFF")

# Initialization of SparkSession into spark variable
spark = SparkSession(sc)

### Logistic Regression - Spark Classifier - Please classify the following animals into either a Mammal or not a Mammal after apply a logistic regression on the provided dataset.

In [41]:
# load the ZOO dataset:
zoo_data=spark.read.csv("zoo.csv",inferSchema=True,header=True)
zoo_data = zoo_data.withColumn("IsMammal", expr("CASE WHEN Type = 1 THEN 1 ELSE 0 END"))

# preprocess dataset using RFormula
preprocessed_data = RFormula(formula= "IsMammal ~ Hair + Feathers + Eggs + Milk + Airborne + Aquatic +" + 
                             " Predator + Toothed + Backbone + Venomous + Fins + Legs+" +
                            "Tail + Domestic + Catsize")

preprocessed_data = preprocessed_data.fit(zoo_data)
preprocessed_data = preprocessed_data.transform(zoo_data)

# split dataset into training and test data
train, test = preprocessed_data.randomSplit([0.7, 0.3])

# configure classifier
lr = LogisticRegression(labelCol="label", featuresCol="features")

# train classifier
fittedLR = lr.fit(train)

# classify test data set
result = fittedLR.transform(preprocessed_data)

# Displaying only the prediction of selected animals
result.select('AnimalName', 'label', 'prediction').where(expr("AnimalName in ('lobster', 'hawk', 'goat', 'crayfish', 'clam', 'hamster')")).toPandas()

# Printing all
result_extracted = result.select("AnimalName", "IsMammal", "prediction")
result_extracted.show(200)

+----------+--------+----------+
|AnimalName|IsMammal|prediction|
+----------+--------+----------+
|  aardvark|       1|       1.0|
|  antelope|       1|       1.0|
|      bass|       0|       0.0|
|      bear|       1|       1.0|
|      boar|       1|       1.0|
|   buffalo|       1|       1.0|
|      calf|       1|       1.0|
|      carp|       0|       0.0|
|   catfish|       0|       0.0|
|      cavy|       1|       1.0|
|   cheetah|       1|       1.0|
|   chicken|       0|       0.0|
|      chub|       0|       0.0|
|      clam|       0|       0.0|
|      crab|       0|       0.0|
|  crayfish|       0|       0.0|
|      crow|       0|       0.0|
|      deer|       1|       1.0|
|   dogfish|       0|       0.0|
|   dolphin|       1|       1.0|
|      dove|       0|       0.0|
|      duck|       0|       0.0|
|  elephant|       1|       1.0|
|  flamingo|       0|       0.0|
|      flea|       0|       0.0|
|      frog|       0|       0.0|
|      frog|       0|       0.0|
|  fruitba