# White-wine data reading and cleaning

In [1]:
#Import the white-wine file which is in CSV format 
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()
dataFrame=spark.read.option("delimiter", ";").option("inferSchema", "true").option("header","true").csv(r"C:\Users\varsh\OneDrive\Desktop\MSDA\SEM2\MACHINE_LEARNING\Project\datasets\winequality-white.csv")
dataFrame

DataFrame[fixed acidity: double, volatile acidity: double, citric acid: double, residual sugar: double, chlorides: double, free sulfur dioxide: double, total sulfur dioxide: double, density: double, pH: double, sulphates: double, alcohol: double, quality: int]

In [2]:
#checking and renaming headers into single value:
dataFrame = dataFrame.withColumnRenamed('fixed acidity', 'fixed_acidity')
dataFrame = dataFrame.withColumnRenamed('volatile acidity', 'volatile_acidity')
dataFrame = dataFrame.withColumnRenamed('citric acid', 'citric_acid')
dataFrame = dataFrame.withColumnRenamed('free sulfur dioxide', 'free_sulfur_dioxide')
dataFrame = dataFrame.withColumnRenamed('residual sugar', 'residual_sugar')
dataFrame = dataFrame.withColumnRenamed('total sulfur dioxide', 'total_sulfur_dioxide')
dataFrame.show(3)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed_acidity|volatile_acidity|citric_acid|residual_sugar|chlorides|free_sulfur_dioxide|total_sulfur_dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
+-------------+----------------+-----------+--------------+---------+-------------------+-----------

In [3]:
#checking for categorical values
dataFrame.printSchema()

root
 |-- fixed_acidity: double (nullable = true)
 |-- volatile_acidity: double (nullable = true)
 |-- citric_acid: double (nullable = true)
 |-- residual_sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free_sulfur_dioxide: double (nullable = true)
 |-- total_sulfur_dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [4]:
#Checking and removing of duplicates:
print("The number of non-distinct data is :",dataFrame.count())
print("The number of distinct data is :",dataFrame.distinct().count()) 
print("Therefore, the number of duplicates are:",dataFrame.count()-dataFrame.distinct().count()) 
dataFrame=dataFrame.dropDuplicates()
print("After dropping the duplicates, the count of the dictinct data is:",dataFrame.count()) 

The number of non-distinct data is : 4898
The number of distinct data is : 3961
Therefore, the number of duplicates are: 937
After dropping the duplicates, the count of the dictinct data is: 3961


In [5]:
#checking for nulls or missing values:
nulls = {col:dataFrame.filter(dataFrame[col].isNull()).count() for col in dataFrame.columns}
nulls

{'fixed_acidity': 0,
 'volatile_acidity': 0,
 'citric_acid': 0,
 'residual_sugar': 0,
 'chlorides': 0,
 'free_sulfur_dioxide': 0,
 'total_sulfur_dioxide': 0,
 'density': 0,
 'pH': 0,
 'sulphates': 0,
 'alcohol': 0,
 'quality': 0}

In [6]:
#Create a user defined function to convert the interger values of 'quality' column into binary values such as 0 and 1.
import pyspark.sql.functions as func
def pred_col(feat1):
    return func.when((func.col(feat1)>6),1).otherwise(0)

quality_transformed=dataFrame.withColumn('label',pred_col('quality'))

df=quality_transformed[['fixed_acidity',
'volatile_acidity',
'citric_acid',
'residual_sugar',
'chlorides',
'free_sulfur_dioxide',
'total_sulfur_dioxide',
'density',
'pH',
'sulphates',
'alcohol','label']]
df.columns

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'label']

In [7]:
#Feature engineering: convert the features into feature vectors.
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
vectorAssembler=VectorAssembler(inputCols=['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol'],outputCol='feature_vector')

df = vectorAssembler.transform(df)
df = df.select(['feature_vector', 'label'])

df.show()

+--------------------+-----+
|      feature_vector|label|
+--------------------+-----+
|[6.7,0.23,0.26,1....|    0|
|[7.4,0.2,0.36,1.2...|    0|
|[6.7,0.17,0.5,2.1...|    0|
|[6.4,0.24,0.32,14...|    0|
|[7.2,0.55,0.09,1....|    0|
|[6.6,0.24,0.29,2....|    0|
|[6.5,0.26,0.28,12...|    0|
|[7.4,0.24,0.31,8....|    0|
|[7.2,0.37,0.15,2....|    1|
|[7.5,0.23,0.32,9....|    0|
|[9.0,0.31,0.48,6....|    0|
|[6.2,0.27,0.49,1....|    0|
|[7.6,0.26,0.58,7....|    0|
|[10.0,0.23,0.27,1...|    0|
|[6.8,0.31,0.09,1....|    0|
|[7.7,0.24,0.31,1....|    0|
|[6.3,0.18,0.22,1....|    0|
|[7.6,0.1,0.33,1.0...|    0|
|[6.6,0.45,0.43,7....|    0|
|[7.1,0.27,0.28,1....|    0|
+--------------------+-----+
only showing top 20 rows



In [8]:
#Standardize the feature-vector values.
from pyspark.ml.feature import StandardScaler  
Scalerizer=StandardScaler().setInputCol("feature_vector").setOutputCol("features")
df=Scalerizer.fit(df).transform(df)
df=df.select(['features','label'])

In [9]:
#split the data into training and test data.
train,test=df.randomSplit([0.7,0.3]) # spitting the dataframe data into training data(70%) and test data(30%)
train.count(),test.count()

(2793, 1168)

# White-wine Regression 

In [10]:
#create a decision tree regression model using the training dataset.
from pyspark.ml.regression import DecisionTreeRegressor
DTRegModel=DecisionTreeRegressor(featuresCol='features',labelCol='label').fit(train)
DTRegModel.featureImportances

SparseVector(11, {0: 0.0187, 1: 0.0878, 2: 0.0258, 4: 0.0428, 5: 0.0866, 6: 0.0382, 7: 0.0131, 8: 0.0497, 9: 0.0192, 10: 0.6182})

In [11]:
#Test the model with the test dataset for predictions.
DTReg_predictions=DTRegModel.transform(test)
DTReg_predictions.show()

+--------------------+-----+--------------------+
|            features|label|          prediction|
+--------------------+-----+--------------------+
|[6.57545821272352...|    0|  0.5302013422818792|
|[7.15225279278698...|    0|0.007312614259597806|
|[7.26761170879968...|    0| 0.22727272727272727|
|[7.26761170879968...|    0|0.007312614259597806|
|[7.26761170879968...|    0|0.007312614259597806|
|[7.38297062481237...|    0| 0.07246376811594203|
|[7.61368845683776...|    0| 0.32432432432432434|
|[7.61368845683776...|    0|0.007312614259597806|
|[8.07512412088853...|    1|                 0.0|
|[7.03689387677429...|    1|   0.746268656716418|
|[7.61368845683776...|    0|0.007312614259597806|
|[8.30584195291392...|    0| 0.06153846153846154|
|[9.11335436500277...|    0|  0.4714285714285714|
|[9.45943111304085...|    0|0.007312614259597806|
|[7.03689387677429...|    0|0.007312614259597806|
|[7.84440628886314...|    0|  0.3340909090909091|
|[7.84440628886314...|    0|0.041666666666666664|


In [12]:
#Check the r2-score value.
from pyspark.ml.evaluation import RegressionEvaluator
dt_evaluator = RegressionEvaluator(metricName='r2')
dt_r2 = dt_evaluator.evaluate(DTReg_predictions)
print(f'The r-square value of DecisionTreeRegressor is {round((dt_r2*100),2)}%')


The r-square value of DecisionTreeRegressor is 16.22%


In [13]:
#check the root mean squared error value.
dt_evaluator = RegressionEvaluator(metricName='rmse')
dt_rmse = dt_evaluator.evaluate(DTReg_predictions)
print(f'The rmse value of DecisionTreeRegressor is {round((dt_rmse*100),2)}%')


The rmse value of DecisionTreeRegressor is 36.64%


# White-wine Classification 

In [14]:
#create a decision tree classification model using the training dataset.
from pyspark.ml.classification import DecisionTreeClassifier

DTClassModel=DecisionTreeClassifier(featuresCol='features',labelCol='label').fit(train)
DTClassModel.featureImportances

SparseVector(11, {0: 0.0252, 1: 0.0076, 4: 0.0418, 5: 0.1046, 6: 0.0176, 8: 0.0443, 10: 0.7588})

In [15]:
#Test the model with the test dataset for predictions
DTClass_predictions=DTClassModel.transform(test)
DTClass_predictions.select("features","label","prediction").show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[6.57545821272352...|    0|       1.0|
|[7.15225279278698...|    0|       0.0|
|[7.26761170879968...|    0|       0.0|
|[7.26761170879968...|    0|       0.0|
|[7.26761170879968...|    0|       0.0|
|[7.38297062481237...|    0|       0.0|
|[7.61368845683776...|    0|       0.0|
|[7.61368845683776...|    0|       0.0|
|[8.07512412088853...|    1|       0.0|
|[7.03689387677429...|    1|       1.0|
|[7.61368845683776...|    0|       0.0|
|[8.30584195291392...|    0|       0.0|
|[9.11335436500277...|    0|       0.0|
|[9.45943111304085...|    0|       0.0|
|[7.03689387677429...|    0|       0.0|
|[7.84440628886314...|    0|       0.0|
|[7.84440628886314...|    0|       0.0|
|[7.95976520487584...|    0|       0.0|
|[8.07512412088853...|    0|       1.0|
|[8.65191870095200...|    0|       0.0|
+--------------------+-----+----------+
only showing top 20 rows



In [16]:
#check the areaUnderROC value.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
dt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
dt_auroc = dt_evaluator.evaluate(DTClass_predictions)
print(f'The auc value of Decision Tree Classifier Modelis {round((dt_auroc*100),2)}%')


The auc value of Decision Tree Classifier Modelis 29.44%


In [17]:
#check the areaUnderPR value.
dt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
dt_aupr = dt_evaluator.evaluate(DTClass_predictions)
print(f'The aupr value of Decision Tree Model is {round((dt_aupr*100),2)}%')

The aupr value of Decision Tree Model is 13.66%


In [18]:
#creation of confusion matrix
truePositive = DTClass_predictions[(DTClass_predictions["label"] == 1) & (DTClass_predictions['prediction']==1)].count()
trueNegative = DTClass_predictions[(DTClass_predictions["label"] == 0) & (DTClass_predictions['prediction']== 0)].count()
falsePositive = DTClass_predictions[(DTClass_predictions["label"] == 0) & (DTClass_predictions['prediction']== 1)].count()
falseNegative = DTClass_predictions[(DTClass_predictions["label"] == 1) & (DTClass_predictions['prediction']== 0)].count()

In [19]:
#Measuring the accuracy of the classification model
accuracy=float((truePositive+trueNegative) /(DTClass_predictions.count()))
print("the accuracy is {0}%".format( round( accuracy*100,2)))

the accuracy is 81.42%


In [20]:

#Measuring the recall rate of the classification model
recall = float(truePositive)/(truePositive + falseNegative)
print("the recall rate is {0}%".format( round( recall*100,2)))

the recall rate is 41.45%


In [21]:
#Measuring the precision of the classification model
precision = float(truePositive)/(truePositive + falsePositive)
print("the precision value is {0}%".format( round( precision*100,2)))

the precision value is 54.8%


In [22]:
Error_rate = (1-accuracy)*100
print("The error rate is {0}%".format( round( Error_rate,2)))

The error rate is 18.58%


# Red-wine data reading and cleaning

In [23]:
#Import the red-wine file which is in CSV format
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()
dataFrame1=spark.read.option("delimiter", ",").option("inferSchema", "true").option("header","true").csv(r"C:\Users\varsh\OneDrive\Desktop\MSDA\SEM2\MACHINE_LEARNING\Project\datasets\winequality-red.csv")
dataFrame1

DataFrame[fixed acidity: double, volatile acidity: double, citric acid: double, residual sugar: double, chlorides: double, free sulfur dioxide: double, total sulfur dioxide: double, density: double, pH: double, sulphates: double, alcohol: double, quality: int]

In [24]:
#checking and renaming headers into single value:
dataFrame1 = dataFrame1.withColumnRenamed('fixed acidity', 'fixed_acidity')
dataFrame1 = dataFrame1.withColumnRenamed('volatile acidity', 'volatile_acidity')
dataFrame1 = dataFrame1.withColumnRenamed('citric acid', 'citric_acid')
dataFrame1 = dataFrame1.withColumnRenamed('free sulfur dioxide', 'free_sulfur_dioxide')
dataFrame1 = dataFrame1.withColumnRenamed('residual sugar', 'residual_sugar')
dataFrame1 = dataFrame1.withColumnRenamed('total sulfur dioxide', 'total_sulfur_dioxide')
dataFrame1.show(3)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed_acidity|volatile_acidity|citric_acid|residual_sugar|chlorides|free_sulfur_dioxide|total_sulfur_dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|
+-------------+----------------+-----------+--------------+---------+-------------------+-----------

In [25]:
#checking for categorical values
dataFrame1.printSchema()

root
 |-- fixed_acidity: double (nullable = true)
 |-- volatile_acidity: double (nullable = true)
 |-- citric_acid: double (nullable = true)
 |-- residual_sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free_sulfur_dioxide: double (nullable = true)
 |-- total_sulfur_dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [26]:
#Checking and removing of duplicates:
print("The number of non-distinct data is :",dataFrame1.count())
print("The number of distinct data is :",dataFrame1.distinct().count()) 
print("Therefore, the number of duplicates are:",dataFrame1.count()-dataFrame1.distinct().count()) 
dataFrame1=dataFrame1.dropDuplicates()
print("After dropping the duplicates, the count of the dictinct data is:",dataFrame1.count()) 

The number of non-distinct data is : 1599
The number of distinct data is : 1359
Therefore, the number of duplicates are: 240
After dropping the duplicates, the count of the dictinct data is: 1359


In [27]:
#checking for nulls or missing values:
nulls = {col:dataFrame1.filter(dataFrame1[col].isNull()).count() for col in dataFrame1.columns}
nulls

{'fixed_acidity': 0,
 'volatile_acidity': 0,
 'citric_acid': 0,
 'residual_sugar': 0,
 'chlorides': 0,
 'free_sulfur_dioxide': 0,
 'total_sulfur_dioxide': 0,
 'density': 0,
 'pH': 0,
 'sulphates': 0,
 'alcohol': 0,
 'quality': 0}

In [28]:
#Create a user defined function to convert the interger values of 'quality' column into binary values such as 0 and 1.
import pyspark.sql.functions as func
def pred_col(feat1):
    return func.when((func.col(feat1)>6),1).otherwise(0)

quality_transformed=dataFrame1.withColumn('label',pred_col('quality'))

df1=quality_transformed[['fixed_acidity',
'volatile_acidity',
'citric_acid',
'residual_sugar',
'chlorides',
'free_sulfur_dioxide',
'total_sulfur_dioxide',
'density',
'pH',
'sulphates',
'alcohol','label']]
df1.columns

['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'label']

In [29]:
#Feature engineering: convert the features into feature vectors.
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
vectorAssembler=VectorAssembler(inputCols=['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol'],outputCol='feature_vector')

df1 = vectorAssembler.transform(df1)
df1 = df1.select(['feature_vector', 'label'])

df1.show()

+--------------------+-----+
|      feature_vector|label|
+--------------------+-----+
|[8.9,0.61,0.49,2....|    0|
|[8.9,0.59,0.5,2.0...|    0|
|[8.1,0.87,0.0,3.3...|    0|
|[11.0,0.2,0.48,2....|    0|
|[6.1,0.21,0.4,1.4...|    0|
|[8.8,0.44,0.49,2....|    0|
|[8.0,0.43,0.36,2....|    0|
|[10.2,0.34,0.48,2...|    1|
|[8.8,0.33,0.41,5....|    1|
|[7.5,0.57,0.08,2....|    0|
|[7.8,0.815,0.01,2...|    0|
|[7.8,0.76,0.04,2....|    0|
|[9.9,0.35,0.41,2....|    0|
|[10.7,0.9,0.34,6....|    0|
|[7.3,0.67,0.02,2....|    0|
|[6.9,0.685,0.0,2....|    0|
|[7.8,0.5,0.17,1.6...|    0|
|[9.7,0.31,0.47,1....|    0|
|[8.9,0.84,0.34,1....|    0|
|[6.6,0.725,0.2,7....|    0|
+--------------------+-----+
only showing top 20 rows



In [30]:
#Standardize the feature-vector values.
from pyspark.ml.feature import StandardScaler   #can be done once the feature vector is created
Scalerizer=StandardScaler().setInputCol("feature_vector").setOutputCol("features")
df1=Scalerizer.fit(df1).transform(df1)
df1=df1.select(['features','label'])
df1.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[5.12380669213203...|    0|
|[5.12380669213203...|    0|
|[4.66323979845724...|    0|
|[6.33279478802835...|    0|
|[3.51182256427027...|    0|
|[5.06623583042268...|    0|
|[4.60566893674789...|    0|
|[5.87222789435356...|    1|
|[5.06623583042268...|    1|
|[4.31781462820115...|    0|
|[4.4905272133292,...|    0|
|[4.4905272133292,...|    0|
|[5.69951530922552...|    0|
|[6.16008220290031...|    0|
|[4.20267290478245...|    0|
|[3.97238945794506...|    0|
|[4.4905272133292,...|    0|
|[5.58437358580682...|    0|
|[5.12380669213203...|    0|
|[3.79967687281701...|    0|
+--------------------+-----+
only showing top 20 rows



In [31]:
#split the data into training and test data.
train,test=df1.randomSplit([0.7,0.3]) # spitting the dataframe data into training data(70%) and test data(30%)
train.count(),test.count()

(949, 410)

# Red-wine regression

In [32]:
#create a decision tree regression model using the training dataset.
from pyspark.ml.regression import DecisionTreeRegressor
DTRegModel=DecisionTreeRegressor(featuresCol='features',labelCol='label').fit(train)
DTRegModel.featureImportances

SparseVector(11, {0: 0.0562, 1: 0.1197, 2: 0.0383, 3: 0.1453, 5: 0.0347, 7: 0.0341, 8: 0.0424, 9: 0.1551, 10: 0.3742})

In [33]:
#Test the model with the test data for predictions.
DTReg_predictions=DTRegModel.transform(test)
DTReg_predictions.show()

+--------------------+-----+--------------------+
|            features|label|          prediction|
+--------------------+-----+--------------------+
|[4.31781462820115...|    0| 0.06451612903225806|
|[4.20267290478245...|    0| 0.06451612903225806|
|[6.16008220290031...|    0|0.005934718100890208|
|[5.12380669213203...|    0|0.005934718100890208|
|[4.83595238358529...|    1|                 1.0|
|[7.13878685195924...|    0| 0.04477611940298507|
|[2.87854308546743...|    0|                0.08|
|[4.37538548991050...|    1|                0.08|
|[5.00866496871333...|    0|0.005934718100890208|
|[5.06623583042268...|    0| 0.04477611940298507|
|[4.77838152187594...|    0| 0.06451612903225806|
|[5.00866496871333...|    0|0.005934718100890208|
|[5.64194444751617...|    1|0.005934718100890208|
|[5.23894841555073...|    0| 0.06451612903225806|
|[5.75708617093487...|    0| 0.14285714285714285|
|[3.85724773452636...|    0| 0.06451612903225806|
|[5.29651927726008...|    0|                 0.0|


In [34]:
#Check the r2-score value.
from pyspark.ml.evaluation import RegressionEvaluator
dt_evaluator = RegressionEvaluator(metricName='r2')
dt_r2 = dt_evaluator.evaluate(DTReg_predictions)
print(f'The r-square value of DecisionTreeRegressor is {round((dt_r2*100),2)}%')

The r-square value of DecisionTreeRegressor is 13.91%


In [35]:
#check the root mean squared error value.
dt_evaluator = RegressionEvaluator(metricName='rmse')
dt_rmse = dt_evaluator.evaluate(DTReg_predictions)
print(f'The rmse value of DecisionTreeRegressor is {round((dt_rmse*100),2)}%')

The rmse value of DecisionTreeRegressor is 29.56%


# Red-wine Classification

In [36]:
#create a decision tree classification model using the training dataset.
from pyspark.ml.classification import DecisionTreeClassifier

DTClassModel=DecisionTreeClassifier(featuresCol='features',labelCol='label').fit(train)

In [37]:
#Test the model with the test dataset for predictions
DTClass_predictions=DTClassModel.transform(test)
DTClass_predictions.select("features","label","prediction").show()

+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|[4.31781462820115...|    0|       0.0|
|[4.20267290478245...|    0|       0.0|
|[6.16008220290031...|    0|       0.0|
|[5.12380669213203...|    0|       0.0|
|[4.83595238358529...|    1|       1.0|
|[7.13878685195924...|    0|       0.0|
|[2.87854308546743...|    0|       0.0|
|[4.37538548991050...|    1|       0.0|
|[5.00866496871333...|    0|       0.0|
|[5.06623583042268...|    0|       0.0|
|[4.77838152187594...|    0|       0.0|
|[5.00866496871333...|    0|       0.0|
|[5.64194444751617...|    1|       0.0|
|[5.23894841555073...|    0|       0.0|
|[5.75708617093487...|    0|       0.0|
|[3.85724773452636...|    0|       0.0|
|[5.29651927726008...|    0|       0.0|
|[5.92979875606291...|    0|       0.0|
|[5.98736961777226...|    0|       0.0|
|[3.62696428768896...|    0|       0.0|
+--------------------+-----+----------+
only showing top 20 rows



In [38]:
#check the areaUnderROC value
from pyspark.ml.evaluation import BinaryClassificationEvaluator
dt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
dt_auroc = dt_evaluator.evaluate(DTClass_predictions)
print(f'The auc value of Decision Tree Classifier Modelis {round((dt_auroc*100),2)}%')

The auc value of Decision Tree Classifier Modelis 44.33%


In [39]:
#check the areaUnderPR value.
dt_evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')
dt_aupr = dt_evaluator.evaluate(DTClass_predictions)
print(f'The aupr value of Decision Tree Model is {round((dt_aupr*100),2)}%')

The aupr value of Decision Tree Model is 32.18%


In [40]:
#creation of confusion matrix
truePositive = DTClass_predictions[(DTClass_predictions["label"] == 1) & (DTClass_predictions['prediction']==1)].count()
trueNegative = DTClass_predictions[(DTClass_predictions["label"] == 0) & (DTClass_predictions['prediction']== 0)].count()
falsePositive = DTClass_predictions[(DTClass_predictions["label"] == 0) & (DTClass_predictions['prediction']== 1)].count()
falseNegative = DTClass_predictions[(DTClass_predictions["label"] == 1) & (DTClass_predictions['prediction']== 0)].count()

In [41]:
#Measuring the accuracy of the classification model
accuracy=float((truePositive+trueNegative) /(DTClass_predictions.count()))
print("the accuracy is {0}%".format( round( accuracy*100,2)))

the accuracy is 89.27%


In [42]:
#Measuring the recall rate of the classification model
recall = float(truePositive)/(truePositive + falseNegative)
print("the recall rate is {0}%".format( round( recall*100,2)))

the recall rate is 48.94%


In [43]:
#Measuring the precision of the classification model
precision = float(truePositive)/(truePositive + falsePositive)
print("the precision value is {0}%".format( round( precision*100,2)))

the precision value is 53.49%


In [44]:
Error_rate = (1-accuracy)*100
print("The error rate is {0}%".format( round( Error_rate,2)))

The error rate is 10.73%
