In [1]:
#Instantiate SparkSession
from pyspark.sql import SparkSession
# Build the session under a descriptive application name; getOrCreate()
# hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName('Predicting the grape variety from wine characteristics')
    .getOrCreate()
)

In [2]:
# Echo the SparkSession object to confirm it was created.
spark

In [3]:
# https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
# gsutil cp wine.data gs://dexdebra-123/datasets
# Use sparkSession to read csv file			
# Configure a CSV reader for a file without a header row, then load the
# wine data from the bucket. No schema is supplied here.
csvReader = spark.read.format('csv').option('header', 'false')
rawData = csvReader.load('gs://dexdebra-123/datasets/wine.data')

In [4]:
# Echo the DataFrame summary: the 14 columns get default names (_c0.._c13)
# and are all typed as string, since the file was read without a schema.
rawData

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string, _c8: string, _c9: string, _c10: string, _c11: string, _c12: string, _c13: string]

In [5]:
# Preview the first five raw rows.
rawData.show(5)

+---+-----+----+----+----+---+----+----+---+----+----+----+----+----+
|_c0|  _c1| _c2| _c3| _c4|_c5| _c6| _c7|_c8| _c9|_c10|_c11|_c12|_c13|
+---+-----+----+----+----+---+----+----+---+----+----+----+----+----+
|  1|14.23|1.71|2.43|15.6|127| 2.8|3.06|.28|2.29|5.64|1.04|3.92|1065|
|  1| 13.2|1.78|2.14|11.2|100|2.65|2.76|.26|1.28|4.38|1.05| 3.4|1050|
|  1|13.16|2.36|2.67|18.6|101| 2.8|3.24| .3|2.81|5.68|1.03|3.17|1185|
|  1|14.37|1.95| 2.5|16.8|113|3.85|3.49|.24|2.18| 7.8| .86|3.45|1480|
|  1|13.24|2.59|2.87|  21|118| 2.8|2.69|.39|1.82|4.32|1.04|2.93| 735|
+---+-----+----+----+----+---+----+----+---+----+----+----+----+----+
only showing top 5 rows



In [6]:
#Assign meaningful Headers
# Column names in file order: the class label first, then the 13 measurements.
columnNames = [
    'Label',
    'Alcohol', 'MalicAcid', 'Ash', 'AshAlkanity', 'Magnesium', 'TotalPhenols',
    'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins', 'ColorIntensity',
    'Hue', 'OD', 'Proline',
]
# toDF takes the new names positionally, so unpack the list.
dataSet = rawData.toDF(*columnNames)

In [7]:
# Echo the DataFrame summary to confirm the new column names
# (renaming does not change the types; every column is still a string).
dataSet

DataFrame[Label: string, Alcohol: string, MalicAcid: string, Ash: string, AshAlkanity: string, Magnesium: string, TotalPhenols: string, Flavanoids: string, NonflavanoidPhenols: string, Proanthocyanins: string, ColorIntensity: string, Hue: string, OD: string, Proline: string]

In [9]:
# Preview five rows as a pandas DataFrame. Apply limit() on the Spark side
# first so that only those rows are collected to the driver, instead of
# converting the entire DataFrame and then discarding all but the head.
dataSet.limit(5).toPandas()

Unnamed: 0,Label,Alcohol,MalicAcid,Ash,AshAlkanity,Magnesium,TotalPhenols,Flavanoids,NonflavanoidPhenols,Proanthocyanins,ColorIntensity,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [8]:
# Extracts the labels and features from  dataframe and converts features to a densevector
from pyspark.ml.linalg import Vectors

def vectorize(data):
    """Split each row of `data` into a label and a dense feature vector.

    The first field of every row becomes the 'label' column; all remaining
    fields are packed into one DenseVector stored in the 'features' column.

    Returns a new DataFrame with exactly those two columns.
    """
    def _toLabelAndFeatures(row):
        # row[0] is the label; row[1:] holds every feature value.
        return [row[0], Vectors.dense(row[1:])]

    labeledRows = data.rdd.map(_toLabelAndFeatures)
    return labeledRows.toDF(['label', 'features'])


    
# Convert the renamed DataFrame into (label, features) form and preview five rows.
vectorizedData = vectorize(dataSet)
vectorizedData.show(5)

# First column,  'label':    the class of the wine (values 1, 2 and 3), i.e. the grape variety to predict
# Second column, 'features': all wine measurements combined into a single vector column

# Each row of the Spark DataFrame is a Row object holding a label and a features vector

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|[14.23,1.71,2.43,...|
|    1|[13.2,1.78,2.14,1...|
|    1|[13.16,2.36,2.67,...|
|    1|[14.37,1.95,2.5,1...|
|    1|[13.24,2.59,2.87,...|
+-----+--------------------+
only showing top 5 rows



In [9]:
# Although the labels in the DataFrame have the values 1, 2 and 3, they are stored as strings.
# They need to be converted to numeric form.
# StringIndexer maps each distinct string label to a numeric index.

from pyspark.ml.feature import StringIndexer

# Estimator that reads the string 'label' column and writes a numeric
# 'indexedLabel' column once it has been fitted and applied.
labelIndexer = StringIndexer(
    outputCol='indexedLabel',
    inputCol='label',
)


In [10]:
# Echo the estimator; the notebook shows its generated identifier.
labelIndexer

StringIndexer_420fb6e9f2b6f655ec33

In [11]:
# fit() learns the mapping from each distinct string label to a numeric index;
# transform() then appends that index as the 'indexedLabel' column.
# NOTE: StringIndexer orders labels by frequency by default, so the index is
# not simply the numeric value of the label string.
labelIndexerModel = labelIndexer.fit(vectorizedData)
indexedData = labelIndexerModel.transform(vectorizedData)

# Peek at the first two rows to see the new column next to the original label.
indexedData.take(2)

[Row(label=u'1', features=DenseVector([14.23, 1.71, 2.43, 15.6, 127.0, 2.8, 3.06, 0.28, 2.29, 5.64, 1.04, 3.92, 1065.0]), indexedLabel=1.0),
 Row(label=u'1', features=DenseVector([13.2, 1.78, 2.14, 11.2, 100.0, 2.65, 2.76, 0.26, 1.28, 4.38, 1.05, 3.4, 1050.0]), indexedLabel=1.0)]

In [12]:
# Display the structure of indexedData: the original string label, the
# features vector and the new numeric indexedLabel column.
indexedData

DataFrame[label: string, features: vector, indexedLabel: double]

In [14]:
# Show the distinct values of the original and the indexed label column,
# one table per column.
for columnName in ('label', 'indexedLabel'):
    indexedData.select(columnName).distinct().show()

+-----+
|label|
+-----+
|    3|
|    1|
|    2|
+-----+

+------------+
|indexedLabel|
+------------+
|         0.0|
|         1.0|
|         2.0|
+------------+



In [15]:
# Split the data set into training and test sets in an (approximate) 80:20 ratio.
# A fixed seed makes the split, and therefore every downstream count and
# metric, reproducible across kernel restarts.
SPLIT_SEED = 42
(trainingData, testData) = indexedData.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [17]:
# Echo the training split; it has the same columns as indexedData.
trainingData

DataFrame[label: string, features: vector, indexedLabel: double]

In [16]:
# Echo the test split; it has the same columns as indexedData.
testData

DataFrame[label: string, features: vector, indexedLabel: double]

In [17]:
# Number of rows that landed in the training split.
trainingData.count()

142

In [18]:
# Number of rows that landed in the test split.
testData.count()

36

In [19]:
# Preview five training rows as a pandas DataFrame. limit() runs on the Spark
# side, so only those rows are collected to the driver rather than the whole split.
trainingData.limit(5).toPandas()

Unnamed: 0,label,features,indexedLabel
0,1,"[12.93, 3.8, 2.65, 18.6, 102.0, 2.41, 2.41, 0....",1.0
1,1,"[13.05, 1.65, 2.55, 18.0, 98.0, 2.45, 2.43, 0....",1.0
2,1,"[13.07, 1.5, 2.1, 15.5, 98.0, 2.4, 2.64, 0.28,...",1.0
3,1,"[13.16, 2.36, 2.67, 18.6, 101.0, 2.8, 3.24, 0....",1.0
4,1,"[13.2, 1.78, 2.14, 11.2, 100.0, 2.65, 2.76, 0....",1.0


In [22]:
# Preview five test rows as a pandas DataFrame. limit() runs on the Spark
# side, so only those rows are collected to the driver rather than the whole split.
testData.limit(5).toPandas()

Unnamed: 0,label,features,indexedLabel
0,1,"[12.93, 3.8, 2.65, 18.6, 102.0, 2.41, 2.41, 0....",1.0
1,1,"[13.05, 1.73, 2.04, 12.4, 92.0, 2.72, 3.27, 0....",1.0
2,1,"[13.05, 2.05, 3.22, 25.0, 124.0, 2.63, 2.68, 0...",1.0
3,1,"[13.24, 3.98, 2.29, 17.5, 103.0, 2.64, 2.63, 0...",1.0
4,1,"[13.64, 3.1, 2.56, 15.2, 116.0, 2.7, 3.03, 0.1...",1.0


In [20]:
# Instantiate DecisionTreeClassifier
from pyspark.ml.classification  import DecisionTreeClassifier

# DecisionTreeClassifier is an Estimator object which fits on the training data to produce DecisionTree ML model
# indexedLabel is the column  which contains numerical values
# featuresCol is the one with features (all features lumped into a single column)

# Gather the estimator settings in one place, then pass them as keyword arguments.
dtreeParams = {
    'labelCol': 'indexedLabel',   # numeric label column
    'featuresCol': 'features',    # single vector column holding all features
    'maxDepth': 3,
    'impurity': 'gini',
}
dtree = DecisionTreeClassifier(**dtreeParams)

In [21]:
# Fit the decision tree estimator on the training split; the fitted model is
# what produces predictions later on.
# (The bare `dtree` expression that used to precede this line was dropped:
# it was not the last expression of the cell, so it displayed nothing.)
model = dtree.fit(trainingData)

In [22]:
# Evaluate how well the model performs
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
	
# Scores predictions against the true labels using the F1 metric
# (F1 combines precision and recall as their harmonic mean).
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',   # column holding the model's predictions
    labelCol='indexedLabel',      # column holding the actual labels
    metricName='f1',
)

In [24]:
# Echo the evaluator; the notebook shows its generated identifier.
evaluator

MulticlassClassificationEvaluator_4821b378f4be1219c271

In [25]:
# The fitted model is a transformer, so calling transform() on the test data
# produces the predictions: it returns the test rows with the model's
# output columns appended.

transformed_data=model.transform(testData)
transformed_data.show(5)

+-----+--------------------+------------+--------------+--------------------+----------+
|label|            features|indexedLabel| rawPrediction|         probability|prediction|
+-----+--------------------+------------+--------------+--------------------+----------+
|    1|[12.85,1.6,2.52,1...|         1.0| [1.0,0.0,0.0]|       [1.0,0.0,0.0]|       0.0|
|    1|[13.05,1.73,2.04,...|         1.0|[0.0,40.0,0.0]|       [0.0,1.0,0.0]|       1.0|
|    1|[13.05,1.77,2.1,1...|         1.0|[0.0,40.0,0.0]|       [0.0,1.0,0.0]|       1.0|
|    1|[13.05,2.05,3.22,...|         1.0|[52.0,1.0,0.0]|[0.98113207547169...|       0.0|
|    1|[13.24,3.98,2.29,...|         1.0|[0.0,40.0,0.0]|       [0.0,1.0,0.0]|       1.0|
+-----+--------------------+------------+--------------+--------------------+----------+
only showing top 5 rows



In [26]:
# Three columns are added to the DataFrame by model.transform():
# 1. rawPrediction - one raw score per class (for this tree these look like
#                    per-class row counts; confirm against the Spark docs)
# 2. probability   - one probability per class for each individual prediction
# 3. prediction    - the final predicted label

# The value in the last column, 'prediction', is the final predicted result.

# The evaluator compares the 'prediction' column with the actual label in the
# 'indexedLabel' column.

# Evaluate on the test data with MulticlassClassificationEvaluator.
# The evaluator is configured for F1, so report the score under the metric's
# real name instead of calling it "accuracy".
metricName = evaluator.getMetricName()
metricValue = evaluator.evaluate(transformed_data)
print('%s score on test data: %s' % (metricName, metricValue))

('f1', 'accuracy:', 0.8888888888888888)
