# Trying out PySpark

Useful reference: Databricks binary classification documentation

---

 https://docs.databricks.com/_static/notebooks/binary-classification.html

## Setup

Downloading Packages

In [None]:
# Disabled setup cell: everything below sits inside a triple-quoted string, so
# running this cell only evaluates that string and executes none of the commands.
# The wrapped commands would refresh apt, install a headless OpenJDK 8, download
# and unpack the Spark 3.1.2 (Hadoop 3.2) tarball, and pip-install pyspark and
# findspark.
# NOTE(review): confirm the Spark download URL still resolves before re-enabling.
'''
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xf spark-3.1.2-bin-hadoop3.2.tgz
!pip install pyspark
!pip install findspark
'''

Set up environment 

In [None]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so neither
# environment variable is actually set when this cell runs.
# The wrapped code would point JAVA_HOME at an OpenJDK 8 directory and SPARK_HOME
# at the unpacked Spark 3.1.2 directory under /content.
'''
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"
'''

Load Spark

In [None]:
# Disabled cell: the code below is wrapped in a triple-quoted string, so running
# this cell imports nothing and creates no Spark session.
# If enabled, it would bind `findspark`, `pyspark`, `SparkSession`, `F`
# (pyspark.sql.functions), every name exported by pyspark.sql.types, and `spark`
# (obtained via SparkSession.builder.getOrCreate()).
'''
import findspark
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *
findspark.init()
spark = SparkSession.builder.getOrCreate()
spark
'''


Data Source

  P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 
  Modeling wine preferences by data mining from physicochemical properties.
  In Decision Support Systems, Elsevier, 47(4):547-553. ISSN: 0167-9236.

  Available at: [@Elsevier] http://dx.doi.org/10.1016/j.dss.2009.05.016
                [Pre-press (pdf)] http://www3.dsi.uminho.pt/pcortez/winequality09.pdf
                [bib] http://www3.dsi.uminho.pt/pcortez/dss09.bib

Download the data

In [None]:
# Disabled cell: the shell commands below are wrapped in a triple-quoted string,
# so running this cell downloads nothing.
# The wrapped commands would fetch the red and white wine-quality CSV files from
# the UCI archive into the working directory and then list its contents.
'''
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv
!ls
'''

## Working with the data

Load data

In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if there is one, otherwise start one, so this
# cell does not rely on a `spark` variable left over from earlier kernel state.
spark = SparkSession.builder.getOrCreate()

# The file is semicolon-delimited with a header row and is read from the current
# working directory; inferSchema asks Spark to derive column types from the data.
red = spark.read.csv("winequality-red.csv", header=True, sep=";", inferSchema=True)
red.show(5)
red.printSchema()

Convert the quality score into a binary class: 1 when quality is 7 or higher, otherwise 0

In [None]:
def modify(x, threshold=7):
    """Map a wine quality score to a binary class.

    Args:
        x: Quality score. May be None when the source value is null.
        threshold: Lowest score counted as the positive class (default 7).

    Returns:
        None if ``x`` is None, 0 if ``x`` is below ``threshold``, otherwise 1.
    """
    if x is None:
        # Propagate nulls instead of raising TypeError on `None < threshold`.
        return None
    return 0 if x < threshold else 1

import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

# Wrap `modify` as a Spark UDF whose result is an integer column.
udfModify = F.udf(modify, IntegerType())

# Only transform while "quality" is still present, so re-running this cell after
# the column has already been replaced by "class" is a no-op instead of an error.
if "quality" in red.columns:
    red = red.withColumn("class", udfModify("quality")).drop("quality")
red.show()

Assemble the columns

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

# Index the "class" column into a numeric "label" column.
# stringOrderType="alphabetAsc" pins the mapping to "0" -> 0.0 and "1" -> 1.0.
# The default (frequencyDesc) assigns index 0.0 to whichever class is most
# frequent, so the meaning of label 1.0 would depend on the class balance.
class_string = StringIndexer(
    inputCol="class",
    outputCol="label",
    stringOrderType="alphabetAsc",
)

# Pack the eleven physicochemical measurement columns into one "features" vector.
assembler = VectorAssembler().setInputCols([
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]).setOutputCol("features")

# Pipeline stages in execution order: label indexing, then feature assembly.
stages = [class_string, assembler]

Set up the pipeline

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

# Fit the preprocessing stages on the wine data and apply them, producing the
# frame that the later cells split and train on.
pipe = Pipeline().setStages(stages)
pipeModel = pipe.fit(red)

# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the split cell reads it.
# No model is fitted here: the original fitted a LogisticRegression on the full,
# unsplit data and the result was overwritten before ever being used.
input = pipeModel.transform(red)

Split the data

In [None]:
# Fixed seed so repeated runs over the same data produce the same 70/30 split.
(train, test) = input.randomSplit([0.7, 0.3], seed=42)

# show() prints the rows itself and returns None, so it is called directly
# rather than wrapped in print(), which would emit a stray "None".
print(train.count())
train.show()
print(test.count())
test.show()

Fit the model

In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the "features" vector against "label", capped at five
# optimizer iterations.
lr = (
    LogisticRegression()
    .setLabelCol("label")
    .setFeaturesCol("features")
    .setMaxIter(5)
)

# Train on the training split, then score the test split.
lrModel = lr.fit(train)
predictions = lrModel.transform(test)

# Keep only the columns needed to compare the true label with the model output.
results = predictions.select(["label", "prediction", "probability"])
results.show()

Evaluate the model

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# The evaluator's default settings are spelled out explicitly: it scores the
# "rawPrediction" column against "label" using area under the ROC curve.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)

# The last expression is the metric value, shown as the cell output.
evaluator.evaluate(predictions)
