In [1]:
import pyspark
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook, registered under the app name 'wine'.
spark = (
    SparkSession.builder
    .appName('wine')
    .getOrCreate()
)

In [2]:
# Display the session object as the cell result to confirm it was created.
spark

In [3]:
import os

# Location of the wine-quality CSV. Defaults to the original author's local
# path; set the WINE_CSV_PATH environment variable to run the notebook on
# another machine without editing this cell.
WINE_CSV_PATH = os.environ.get(
    "WINE_CSV_PATH",
    r"C:\Users\tsd95\CSV Files\Model\winequalityN.csv",
)

# header=True: the first row supplies the column names.
# inferSchema=True: Spark derives each column's type from the data.
df = spark.read.csv(WINE_CSV_PATH, header=True, inferSchema=True)

In [4]:
# Preview the first 20 rows of the raw frame (long cell values are truncated).
df.show(n=20, truncate=True)

+-----+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
| type|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-----+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|white|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|white|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|white|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
|white|          7.2|            0.23|       0.32|           8.5

In [6]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- type: string (nullable = true)
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [8]:
# Summary statistics per column; the `count` row reveals which columns
# contain missing values (fewer non-null entries than total rows).
df.describe().show(n=20, truncate=True)

+-------+-----+------------------+-------------------+-------------------+-----------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+------------------+------------------+
|summary| type|     fixed acidity|   volatile acidity|        citric acid|   residual sugar|          chlorides|free sulfur dioxide|total sulfur dioxide|             density|                 pH|          sulphates|           alcohol|           quality|
+-------+-----+------------------+-------------------+-------------------+-----------------+-------------------+-------------------+--------------------+--------------------+-------------------+-------------------+------------------+------------------+
|  count| 6497|              6487|               6489|               6494|             6495|               6495|               6497|                6497|                6497|               6488|               6493|              6497|        

In [9]:
# List the column names of the raw frame.
df.columns

['type',
 'fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [5]:
from pyspark.ml.feature import StringIndexer

In [13]:
# Indexer that encodes the string column `type` as the numeric column `En_type`.
ob = StringIndexer(
    inputCol='type',
    outputCol='En_type',
)

In [15]:
# Learn the label -> index mapping from the raw frame, then apply it to the
# same frame to append the encoded `En_type` column.
type_indexer_model = ob.fit(df)
df1 = type_indexer_model.transform(df)

In [16]:
# Preview the frame with the new `En_type` column appended.
df1.show(n=20, truncate=True)

+-----+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-------+
| type|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|En_type|
+-----+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-------+
|white|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|    0.0|
|white|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|    0.0|
|white|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|    0.0|
|white|         

In [17]:
# Discard every row that has a null in any column.
df2 = df1.na.drop()

In [19]:
# Preview the frame after rows with missing values were removed.
df2.show(n=20, truncate=True)

+-----+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-------+
| type|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|En_type|
+-----+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-------+
|white|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|    0.0|
|white|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|    0.0|
|white|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|    0.0|
|white|         

In [26]:
# Keep every column except the original string `type`; its encoded
# counterpart `En_type` stays in the frame.
df3 = df2.select([col_name for col_name in df2.columns if col_name != 'type'])

In [27]:
# Preview the first five rows of the all-numeric frame.
df3.show(n=5)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|En_type|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-------+
|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|    0.0|
|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|    0.0|
|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|    0.0|
|          7.2|            0.23|       0.32|        

In [20]:
from pyspark.ml.feature import VectorAssembler

In [29]:
# Columns packed into the single feature vector consumed by the classifier.
# `En_type` is deliberately left out: it is used as the label later on.
feature_cols = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'density',
    'pH',
    'sulphates',
    'alcohol',
    'quality',
]
obj = VectorAssembler(inputCols=feature_cols, outputCol='Independant Feature')

In [30]:
# Append the assembled `Independant Feature` vector column to the numeric frame.
X = obj.transform(df3)

In [31]:
# Preview the frame with the assembled feature vector as the last column.
X.show(n=20, truncate=True)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-------+--------------------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|En_type| Independant Feature|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+-------+--------------------+
|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|    0.0|[7.0,0.27,0.36,20...|
|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|    0.0|[6.3,0.3,0.34,1.6...|
|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                

In [32]:
# Feature-vector column on its own (used only for inspection below).
X1 = X.select(X['Independant Feature'])

In [33]:
# Preview the first 20 feature vectors.
X1.show(n=20, truncate=True)

+--------------------+
| Independant Feature|
+--------------------+
|[7.0,0.27,0.36,20...|
|[6.3,0.3,0.34,1.6...|
|[8.1,0.28,0.4,6.9...|
|[7.2,0.23,0.32,8....|
|[7.2,0.23,0.32,8....|
|[8.1,0.28,0.4,6.9...|
|[6.2,0.32,0.16,7....|
|[7.0,0.27,0.36,20...|
|[6.3,0.3,0.34,1.6...|
|[8.1,0.22,0.43,1....|
|[8.1,0.27,0.41,1....|
|[8.6,0.23,0.4,4.2...|
|[7.9,0.18,0.37,1....|
|[6.6,0.16,0.4,1.5...|
|[8.3,0.42,0.62,19...|
|[6.6,0.17,0.38,1....|
|[6.3,0.48,0.04,1....|
|[7.4,0.34,0.42,1....|
|[6.5,0.31,0.14,7....|
|[6.2,0.66,0.48,1....|
+--------------------+
only showing top 20 rows



In [36]:
# Modelling frame: the feature vector plus the encoded wine type as the label.
F_df = X.select(['Independant Feature', 'En_type'])

In [37]:
# Preview the features/label pairs that will be fed to the classifier.
F_df.show(n=20, truncate=True)

+--------------------+-------+
| Independant Feature|En_type|
+--------------------+-------+
|[7.0,0.27,0.36,20...|    0.0|
|[6.3,0.3,0.34,1.6...|    0.0|
|[8.1,0.28,0.4,6.9...|    0.0|
|[7.2,0.23,0.32,8....|    0.0|
|[7.2,0.23,0.32,8....|    0.0|
|[8.1,0.28,0.4,6.9...|    0.0|
|[6.2,0.32,0.16,7....|    0.0|
|[7.0,0.27,0.36,20...|    0.0|
|[6.3,0.3,0.34,1.6...|    0.0|
|[8.1,0.22,0.43,1....|    0.0|
|[8.1,0.27,0.41,1....|    0.0|
|[8.6,0.23,0.4,4.2...|    0.0|
|[7.9,0.18,0.37,1....|    0.0|
|[6.6,0.16,0.4,1.5...|    0.0|
|[8.3,0.42,0.62,19...|    0.0|
|[6.6,0.17,0.38,1....|    0.0|
|[6.3,0.48,0.04,1....|    0.0|
|[7.4,0.34,0.42,1....|    0.0|
|[6.5,0.31,0.14,7....|    0.0|
|[6.2,0.66,0.48,1....|    0.0|
+--------------------+-------+
only showing top 20 rows



In [38]:
from pyspark.ml.classification import LogisticRegression

In [39]:
# Hold out roughly a quarter of the rows for evaluation. The fixed seed makes
# the split -- and therefore the row counts and accuracy reported below --
# repeatable when the notebook is re-run on the same data.
train_data, test_data = F_df.randomSplit([0.75, 0.25], seed=42)

In [40]:
# Report the number of rows in each split: training first, then test.
for split in (train_data, test_data):
    print(split.count())

4854
1609


In [41]:
# Unfitted logistic-regression estimator: reads the assembled vector as
# features and the encoded wine type as the label.
model = LogisticRegression(
    featuresCol='Independant Feature',
    labelCol='En_type',
)

In [42]:
# Train the classifier on the training split only.
F_model = model.fit(train_data)

In [44]:
# Evaluate on the held-out split. Despite its name, `yPred` is an evaluation
# result object: later cells read `.accuracy` and `.predictions` from it.
yPred = F_model.evaluate(test_data)

In [45]:
# Accuracy of the fitted model on the held-out test split.
yPred.accuracy

0.9919204474829086

In [46]:
# Preview per-row test results: features, true label, raw scores,
# class probabilities and the predicted label.
yPred.predictions.show(n=20, truncate=True)

+--------------------+-------+--------------------+--------------------+----------+
| Independant Feature|En_type|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|[4.4,0.46,0.1,2.8...|    0.0|[9.55015970712838...|[0.99992881517501...|       0.0|
|[4.7,0.145,0.29,1...|    0.0|[7.72421676579188...|[0.99955820238308...|       0.0|
|[4.8,0.17,0.28,2....|    0.0|[11.2013417282228...|[0.99998634432516...|       0.0|
|[4.8,0.26,0.23,10...|    0.0|[13.0080953872247...|[0.99999775789999...|       0.0|
|[4.8,0.34,0.0,6.5...|    0.0|[10.3185806831488...|[0.99996698715060...|       0.0|
|[4.9,0.345,0.34,1...|    0.0|[8.40039936035100...|[0.99977527297457...|       0.0|
|[5.0,0.2,0.4,1.9,...|    0.0|[9.29770692051670...|[0.99990837428098...|       0.0|
|[5.0,0.235,0.27,1...|    0.0|[11.1353773674063...|[0.99998541317675...|       0.0|
|[5.0,0.24,0.19,5....|    0.0|[7.85456472820715...|[0.99961217355684...|    

