# ML Packages


**https://spark.apache.org/docs/2.3.0/api/python/pyspark.ml.html**

In [2]:
import findspark
findspark.init()
findspark.find()

from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import RandomForestClassifier

from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
spark = SparkSession.builder.appName('Classification with Spark').getOrCreate()

In [5]:
dataset = spark.read.option("inferSchema", "true").csv("diabetes.csv", header=True)

In [8]:
dataset.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [9]:
dataset.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



# Check Missing Value

In [11]:
from pyspark.sql.functions import col, count, isnan, when
# checking for null or nan value
dataset.select([count(when(col(c).isNull(), c)).alias(c) for c in dataset.columns]).show()

+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|          0|      0|            0|            0|      0|  0|                       0|  0|      0|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



In [12]:
cols = dataset.columns
cols.remove('Outcome')
assembler = VectorAssembler(inputCols=cols,outputCol="features")
data = assembler.transform(dataset)

In [13]:
data.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|[6.0,148.0,72.0,3...|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|[1.0,85.0,66.0,29...|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|[8.0,183.0,64.0,0...|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|[1.0,89.0,66.0,23...|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|[0.0,137.0,40.0,3...|
|          5|    116|           

In [17]:
data.select("features", "Outcome").show(truncate=False)

+-------------------------------------------+-------+
|features                                   |Outcome|
+-------------------------------------------+-------+
|[6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0]  |1      |
|[1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0]   |0      |
|[8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0]   |1      |
|[1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0]  |0      |
|[0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0]|1      |
|[5.0,116.0,74.0,0.0,0.0,25.6,0.201,30.0]   |0      |
|[3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0]  |1      |
|[10.0,115.0,0.0,0.0,0.0,35.3,0.134,29.0]   |0      |
|[2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0]|1      |
|[8.0,125.0,96.0,0.0,0.0,0.0,0.232,54.0]    |1      |
|[4.0,110.0,92.0,0.0,0.0,37.6,0.191,30.0]   |0      |
|[10.0,168.0,74.0,0.0,0.0,38.0,0.537,34.0]  |1      |
|[10.0,139.0,80.0,0.0,0.0,27.1,1.441,57.0]  |0      |
|[1.0,189.0,60.0,23.0,846.0,30.1,0.398,59.0]|1      |
|[5.0,166.0,72.0,19.0,175.0,25.8,0.587,51.0]|1      |
|[7.0,100.0,0.0,0.0,0.0,30.0

# Scaling

In [19]:
standardscalar = StandardScaler().setInputCol("features").setOutputCol("scaled_features")
data = standardscalar.fit(data).transform(data)

In [20]:
data.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+--------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|            features|     scaled_features|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+--------------------+--------------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|[6.0,148.0,72.0,3...|[1.78063837321943...|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|[1.0,85.0,66.0,29...|[0.29677306220323...|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|[8.0,183.0,64.0,0...|[2.37418449762590...|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|[1.0,89.0,66.0,23...|[0.29677306220323...|
|     

In [23]:
data.select("features", "outcome", "scaled_features").show()

+--------------------+-------+--------------------+
|            features|outcome|     scaled_features|
+--------------------+-------+--------------------+
|[6.0,148.0,72.0,3...|      1|[1.78063837321943...|
|[1.0,85.0,66.0,29...|      0|[0.29677306220323...|
|[8.0,183.0,64.0,0...|      1|[2.37418449762590...|
|[1.0,89.0,66.0,23...|      0|[0.29677306220323...|
|[0.0,137.0,40.0,3...|      1|[0.0,4.2849165233...|
|[5.0,116.0,74.0,0...|      0|[1.48386531101619...|
|[3.0,78.0,50.0,32...|      1|[0.89031918660971...|
|[10.0,115.0,0.0,0...|      0|[2.96773062203238...|
|[2.0,197.0,70.0,4...|      1|[0.59354612440647...|
|[8.0,125.0,96.0,0...|      1|[2.37418449762590...|
|[4.0,110.0,92.0,0...|      0|[1.18709224881295...|
|[10.0,168.0,74.0,...|      1|[2.96773062203238...|
|[10.0,139.0,80.0,...|      0|[2.96773062203238...|
|[1.0,189.0,60.0,2...|      1|[0.29677306220323...|
|[5.0,166.0,72.0,1...|      1|[1.48386531101619...|
|[7.0,100.0,0.0,0....|      1|[2.07741143542266...|
|[0.0,118.0,

In [24]:
data.select("features", "outcome", "scaled_features").show(truncate=False)

+-------------------------------------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|features                                   |outcome|scaled_features                                                                                                                                         |
+-------------------------------------------+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------+
|[6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0]  |1      |[1.7806383732194306,4.628960915766174,3.7198138711154307,2.1940523222807116,0.0,4.261709381170972,1.8923811872495484,4.251616970894646]                 |
|[1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0]   |0      |[0.29677306220323846,2.658524850271114,3.4098293818558116,1.8179290670325896,0.0,3.3738532600936866,1.0593712866420917,

In [25]:
assembled_data = data.select("scaled_features", "Outcome")
assembled_data.show()

+--------------------+-------+
|     scaled_features|Outcome|
+--------------------+-------+
|[1.78063837321943...|      1|
|[0.29677306220323...|      0|
|[2.37418449762590...|      1|
|[0.29677306220323...|      0|
|[0.0,4.2849165233...|      1|
|[1.48386531101619...|      0|
|[0.89031918660971...|      1|
|[2.96773062203238...|      0|
|[0.59354612440647...|      1|
|[2.37418449762590...|      1|
|[1.18709224881295...|      0|
|[2.96773062203238...|      1|
|[2.96773062203238...|      0|
|[0.29677306220323...|      1|
|[1.48386531101619...|      1|
|[2.07741143542266...|      1|
|[0.0,3.6906580274...|      1|
|[2.07741143542266...|      1|
|[0.29677306220323...|      0|
|[0.29677306220323...|      1|
+--------------------+-------+
only showing top 20 rows



# Train Test split

In [26]:
train, test = assembled_data.randomSplit([0.7,0.3])
train.show()
test.show()

+--------------------+-------+
|     scaled_features|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[0.5...|      0|
|(8,[0,1,6,7],[0.8...|      0|
|(8,[0,1,6,7],[2.9...|      1|
|(8,[1,5,6,7],[3.0...|      0|
|(8,[1,5,6,7],[3.6...|      0|
|(8,[1,5,6,7],[3.7...|      1|
|(8,[1,5,6,7],[4.0...|      1|
|(8,[1,5,6,7],[4.5...|      1|
|(8,[1,5,6,7],[5.2...|      1|
|(8,[1,6,7],[2.940...|      0|
|[0.0,1.7827754878...|      0|
|[0.0,2.0955431172...|      0|
|[0.0,2.3144804578...|      0|
|[0.0,2.4395875096...|      0|
|[0.0,2.6272480873...|      0|
|[0.0,2.6898016132...|      0|
|[0.0,2.8461854279...|      0|
|[0.0,2.8461854279...|      0|
|[0.0,2.9087389538...|      0|
|[0.0,2.9087389538...|      0|
+--------------------+-------+
only showing top 20 rows

+--------------------+-------+
|     scaled_features|Outcome|
+--------------------+-------+
|(8,[0,1,6,7],[0.5...|      0|
|(8,[0,1,6,7],[1.7...|      0|
|(8,[0,1,6,7],[2.0...|      0|
|(8,[1,5,6,7],[2.2...|      0|
|(8,[1,5,6,7]

# Logistic Regression

In [28]:
log_reg = LogisticRegression(labelCol="Outcome",featuresCol="scaled_features", maxIter=40)
model = log_reg.fit(train)

In [29]:
prediction_test = model.transform(test)

In [30]:
prediction_test.show()

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.5...|      0|[4.73921820248123...|[0.99133033963880...|       0.0|
|(8,[0,1,6,7],[1.7...|      0|[3.28241472526556...|[0.96382058059948...|       0.0|
|(8,[0,1,6,7],[2.0...|      0|[3.30818070781806...|[0.96470839331305...|       0.0|
|(8,[1,5,6,7],[2.2...|      0|[3.40944069619142...|[0.96799828115023...|       0.0|
|(8,[1,5,6,7],[4.3...|      1|[-1.0661727739189...|[0.25613159783818...|       1.0|
|(8,[1,5,6,7],[4.4...|      1|[-0.7620398151421...|[0.31820356418762...|       1.0|
|[0.0,2.6272480873...|      0|[2.67830391972837...|[0.93573420396164...|       0.0|
|[0.0,2.9087389538...|      0|[2.52990899573733...|[0.92621213411471...|       0.0|
|[0.0,3.0651227685...|      0|[3.31978318604995...|[0.96510128973941...|    

In [40]:
from pyspark.sql.types import DoubleType
prediction_test.select("Outcome","prediction").show(10)

+-------+----------+
|Outcome|prediction|
+-------+----------+
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      1|       1.0|
|      1|       1.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
+-------+----------+
only showing top 10 rows



In [41]:
predictionAndLabel = prediction_test.select("Outcome","prediction").withColumn("Outcome", col("Outcome").cast(DoubleType())).rdd
predictionAndLabel.collect()

[Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=1.0, prediction=0.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=1.0, prediction=1.0),
 Row(Outcome=0.0, prediction=0.0),
 Row(Outcome=0.0, pr

In [42]:
metrics = BinaryClassificationMetrics(predictionAndLabel)
print("Area unser ROC = %s" % metrics.areaUnderROC)



Area unser ROC = 0.7539808917197452


In [46]:
evaluator = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName="accuracy")
accuracy_LR = evaluator.evaluate(prediction_test)
print('Accuracy = ', accuracy_LR)

Accuracy =  0.755656108597285


# Train Test all model

In [67]:
def model_train_predict(model):
    model = model.fit(train)
    prediction_test = model.transform(test)
    prediction_test.show()
    prediction_test.select("Outcome", "prediction").show()
    predictionAndLabel = prediction_test.select("Outcome", "prediction").withColumn("Outcome",col("Outcome").cast(DoubleType())).rdd
    metrics = BinaryClassificationMetrics(predictionAndLabel)
    print("Area unser ROC = %s" % metrics.areaUnderROC)
    evaluator = MulticlassClassificationEvaluator(labelCol="Outcome", predictionCol="prediction", metricName="accuracy")
    accuracy_LR = evaluator.evaluate(prediction_test)
    print('Accuracy = ', accuracy_LR)

# NaiveBayes

In [68]:
model_train_predict(NaiveBayes(labelCol="Outcome",featuresCol="scaled_features", smoothing=1.0))

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.5...|      0|[-12.121264761166...|[0.59676481029766...|       0.0|
|(8,[0,1,6,7],[1.7...|      0|[-16.940334177665...|[0.53597558808998...|       0.0|
|(8,[0,1,6,7],[2.0...|      0|[-17.933682172424...|[0.51309652173598...|       0.0|
|(8,[1,5,6,7],[2.2...|      0|[-14.871637724031...|[0.65282906875543...|       0.0|
|(8,[1,5,6,7],[4.3...|      1|[-25.800266671008...|[0.59476309259863...|       0.0|
|(8,[1,5,6,7],[4.4...|      1|[-22.009680748316...|[0.65940645865365...|       0.0|
|[0.0,2.6272480873...|      0|[-33.293983608548...|[0.74985746447192...|       0.0|
|[0.0,2.9087389538...|      0|[-30.306179268040...|[0.69690824916166...|       0.0|
|[0.0,3.0651227685...|      0|[-27.947704137695...|[0.74535420606424...|    

# GBTClassifier

In [71]:
gradient_boost_tree = GBTClassifier(labelCol="Outcome", featuresCol="scaled_features")

In [72]:
model_train_predict(gradient_boost_tree)

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.5...|      0|[1.59060978949655...|[0.96012138787618...|       0.0|
|(8,[0,1,6,7],[1.7...|      0|[0.50041507868389...|[0.73122176616379...|       0.0|
|(8,[0,1,6,7],[2.0...|      0|[0.19498154586583...|[0.59627381435100...|       0.0|
|(8,[1,5,6,7],[2.2...|      0|[1.59445812523406...|[0.96041503902070...|       0.0|
|(8,[1,5,6,7],[4.3...|      1|[-1.8518766914916...|[0.02403880602771...|       1.0|
|(8,[1,5,6,7],[4.4...|      1|[-0.0102919172261...|[0.49485422307196...|       1.0|
|[0.0,2.6272480873...|      0|[1.30261736129321...|[0.93119771858991...|       0.0|
|[0.0,2.9087389538...|      0|[1.42948453988230...|[0.94578045844147...|       0.0|
|[0.0,3.0651227685...|      0|[1.56311008243455...|[0.95796143720356...|    

# RandomForestClassifier

In [73]:
model_train_predict(RandomForestClassifier(labelCol="Outcome", featuresCol="scaled_features", numTrees=40))

+--------------------+-------+--------------------+--------------------+----------+
|     scaled_features|Outcome|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[0.5...|      0|[38.6584056110129...|[0.96646014027532...|       0.0|
|(8,[0,1,6,7],[1.7...|      0|[33.8541572011839...|[0.84635393002959...|       0.0|
|(8,[0,1,6,7],[2.0...|      0|[26.1911181125313...|[0.65477795281328...|       0.0|
|(8,[1,5,6,7],[2.2...|      0|[37.3222158217492...|[0.93305539554373...|       0.0|
|(8,[1,5,6,7],[4.3...|      1|[16.1781357152239...|[0.40445339288059...|       1.0|
|(8,[1,5,6,7],[4.4...|      1|[18.9100967178901...|[0.47275241794725...|       1.0|
|[0.0,2.6272480873...|      0|[32.6332804894483...|[0.81583201223621...|       0.0|
|[0.0,2.9087389538...|      0|[37.3476251614805...|[0.93369062903701...|       0.0|
|[0.0,3.0651227685...|      0|[38.5098097624170...|[0.96274524406042...|    