# White Wine Regression Using PySpark

## Setting up PySpark

In [None]:
"""
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xf spark-3.1.2-bin-hadoop3.2.tgz
!pip install pyspark
!pip install findspark
"""

In [None]:
import os

# Export the Java and Spark install locations as environment variables so that
# tools started from this process can read them.
os.environ.update(
    JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64",
    SPARK_HOME="/content/spark-3.1.2-bin-hadoop3.2",
)

In [None]:
import findspark

# findspark.init() has to run BEFORE pyspark is imported: it adds the Spark
# distribution located via SPARK_HOME to sys.path, which is what lets the
# imports below resolve against that distribution. Calling it after the
# imports (as this cell previously did) cannot affect which pyspark is loaded.
findspark.init()

import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

# Reuse the active session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Bare expression so the notebook renders the session summary as cell output.
spark

## Getting the data

In [None]:
#!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv

In [None]:
# Read the wine-quality CSV: the first row holds the column names, fields are
# separated by ';', and column types are inferred from the data.
df = spark.read.csv(
    "winequality-white.csv",
    sep=";",
    header=True,
    inferSchema=True,
)

# Quick look at the data: sample rows, inferred schema, summary statistics.
df.show()
df.printSchema()
summary = df.describe()
summary.show()

## Building and evaluating the model

In [None]:
from pyspark.ml.feature import VectorAssembler

# Input columns that are packed into the single "features" vector column.
feature_columns = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Keep only the assembled vector and the "quality" column used as the label.
assembled = assembler.transform(df)
output = assembled.select("features", "quality")
output.show(truncate=False)

In [None]:
# 70/30 train/test split. The seed is fixed so that re-running the notebook
# produces the same partition; without it every run draws a different split
# and the reported coefficients and RMSE are not comparable between runs.
train, test = output.randomSplit([0.7, 0.3], seed=42)

# Preview each partition and its summary statistics.
train.show(5)
train.describe().show()
test.show(5)
test.describe().show()

In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression of "quality" on the assembled "features" vector,
# fitted on the training partition only.
lr = LinearRegression(labelCol="quality", featuresCol="features")
model = lr.fit(train)

# Report the fitted parameters.
# NOTE(review): "coeffecients" in the message is a typo for "coefficients";
# it is left untouched here so the printed output stays the same.
message = "The coeffecients are {} and the intercept is {} "
print(message.format(model.coefficients, model.intercept))

In [None]:
# Apply the fitted model to the held-out partition, then display the result.
predictions = model.transform(dataset=test)
predictions.show()

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the "prediction" column against the "quality" label using the
# root-mean-squared-error metric.
evaluator = RegressionEvaluator(
    labelCol="quality",
    predictionCol="prediction",
    metricName="rmse",
)

# Stored as `rmse` rather than `eval`: assigning to `eval` would shadow the
# Python builtin for every cell that runs afterwards in the same session.
rmse = evaluator.evaluate(predictions)
print("The RMSE is {}".format(rmse))