In [None]:
!pip install pyspark

In [None]:
# Method 1: car-evaluation classification with PySpark ML

In [11]:
# Random forest classifier
#
# Reads the four car-evaluation CSV files, index-encodes every column with
# StringIndexer, trains a RandomForestClassifier on a 70/30 split and prints
# five sample predictions plus the test-set accuracy.

# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import when
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Configuration: everything a reader might want to tune lives here.
SEED = 42
CSV_PATHS = ["/content/car_evaluation_{}.csv".format(part) for part in range(4)]
FEATURE_COLS = ['buyPrice', 'maintCost', 'noDoors', 'noPersons', 'bootLuggage', 'safety']
LABEL_COL = 'decision'
# Display names for the raw `decision` values. Only "unacc" is visible in the
# notebook output; the other spellings are assumed from the standard
# car-evaluation data set -- TODO confirm against the CSV files. Unknown raw
# values fall back to the raw label itself instead of being mislabelled.
DISPLAY_NAMES = {
    "unacc": "Unacceptable",
    "acc": "Acceptable",
    "good": "Good",
    "vgood": "Very Good",
}

# Create a SparkSession
spark = SparkSession.builder.appName("CarEvaluation_RF").getOrCreate()

# Read the data from the CSV files
reader = spark.read.option("header", "true").option("inferSchema", "true")
data1, data2, data3, data4 = [reader.csv(path) for path in CSV_PATHS]

# Combine the data frames into a single data frame (columns matched by position)
data = data1.union(data2).union(data3).union(data4)

# Index-encode every feature column and the label column. The stages are left
# unfitted so that the Pipeline performs the single fit.
# NOTE: the indexers are fitted on the full data set, i.e. before the
# train/test split below.
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index")
            for column in FEATURE_COLS + [LABEL_COL]]
pipeline = Pipeline(stages=indexers)
indexer_model = pipeline.fit(data)
indexed_data = indexer_model.transform(data)

# Raw label strings in index order (position i is the label encoded as i).
# The label indexer is the last pipeline stage.
decision_labels = indexer_model.stages[-1].labels

# Split the data into training and testing sets
(trainingData, testData) = indexed_data.randomSplit([0.7, 0.3], seed=SEED)

# Create the feature vector by combining all input features using VectorAssembler
assembler = VectorAssembler(inputCols=[column + "_index" for column in FEATURE_COLS], outputCol='features')
trainingData = assembler.transform(trainingData)
testData = assembler.transform(testData)

# Create the Random Forest Classifier. The seed is set explicitly so that the
# reported accuracy is reproducible across kernel restarts.
rf = RandomForestClassifier(labelCol=LABEL_COL + "_index", featuresCol="features", seed=SEED)

# Train the model
model = rf.fit(trainingData)

# Make predictions on the testing data
predictions = model.transform(testData)

# Convert the predicted indexes to corresponding class labels.
# StringIndexer assigns indexes by label frequency, so the index -> name
# mapping is derived from the fitted indexer rather than hard-coded.
predicted_class = None
for index, raw_label in enumerate(decision_labels):
    is_this_class = predictions["prediction"] == index
    display_name = DISPLAY_NAMES.get(raw_label, raw_label)
    if predicted_class is None:
        predicted_class = when(is_this_class, display_name)
    else:
        predicted_class = predicted_class.when(is_this_class, display_name)
predictions = predictions.withColumn("predicted_class", predicted_class)

# Print the data with the predicted classes
predictions.select(*FEATURE_COLS, LABEL_COL, "predicted_class").show(5)

# Evaluate the performance of the model
evaluator = MulticlassClassificationEvaluator(labelCol=LABEL_COL + "_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Stop the SparkSession
spark.stop()

+--------+---------+-------+---------+-----------+------+--------+---------------+
|buyPrice|maintCost|noDoors|noPersons|bootLuggage|safety|decision|predicted_class|
+--------+---------+-------+---------+-----------+------+--------+---------------+
|   vhigh|     high|      2|        2|        big|   low|   unacc|   Unacceptable|
|   vhigh|     high|      2|        2|        med|   med|   unacc|   Unacceptable|
|   vhigh|     high|      2|        2|      small|   low|   unacc|   Unacceptable|
|   vhigh|     high|      2|        2|      small|   med|   unacc|   Unacceptable|
|   vhigh|     high|      2|        4|        med|  high|   unacc|   Unacceptable|
+--------+---------+-------+---------+-----------+------+--------+---------------+
only showing top 5 rows

Accuracy: 90.85%


In [10]:
# Decision tree
#
# Reads the four car-evaluation CSV files, index-encodes every column with
# StringIndexer, trains a DecisionTreeClassifier on a 70/30 split and prints
# five sample predictions plus the test-set accuracy.

# Import the necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.sql.functions import when
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Configuration: everything a reader might want to tune lives here.
SEED = 42
CSV_PATHS = ["/content/car_evaluation_{}.csv".format(part) for part in range(4)]
FEATURE_COLS = ['buyPrice', 'maintCost', 'noDoors', 'noPersons', 'bootLuggage', 'safety']
LABEL_COL = 'decision'
# Display names for the raw `decision` values. Only "unacc" is visible in the
# notebook output; the other spellings are assumed from the standard
# car-evaluation data set -- TODO confirm against the CSV files. Unknown raw
# values fall back to the raw label itself instead of being mislabelled.
DISPLAY_NAMES = {
    "unacc": "Unacceptable",
    "acc": "Acceptable",
    "good": "Good",
    "vgood": "Very Good",
}

# Create a SparkSession
spark = SparkSession.builder.appName("CarEvaluation_DT").getOrCreate()

# Read the data from the CSV files
reader = spark.read.option("header", "true").option("inferSchema", "true")
data1, data2, data3, data4 = [reader.csv(path) for path in CSV_PATHS]

# Combine the data frames into a single data frame (columns matched by position)
data = data1.union(data2).union(data3).union(data4)

# Index-encode every feature column and the label column. The stages are left
# unfitted so that the Pipeline performs the single fit.
# NOTE: the indexers are fitted on the full data set, i.e. before the
# train/test split below.
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index")
            for column in FEATURE_COLS + [LABEL_COL]]
pipeline = Pipeline(stages=indexers)
indexer_model = pipeline.fit(data)
indexed_data = indexer_model.transform(data)

# Raw label strings in index order (position i is the label encoded as i).
# The label indexer is the last pipeline stage.
decision_labels = indexer_model.stages[-1].labels

# Split the data into training and testing sets
(trainingData, testData) = indexed_data.randomSplit([0.7, 0.3], seed=SEED)

# Create the feature vector by combining all input features using VectorAssembler
assembler = VectorAssembler(inputCols=[column + "_index" for column in FEATURE_COLS], outputCol='features')
trainingData = assembler.transform(trainingData)
testData = assembler.transform(testData)

# Create the Decision Tree Classifier
dt = DecisionTreeClassifier(labelCol=LABEL_COL + "_index", featuresCol="features")

# Train the model
model = dt.fit(trainingData)

# Make predictions on the testing data
predictions = model.transform(testData)

# Convert the predicted indexes to corresponding class labels.
# StringIndexer assigns indexes by label frequency, so the index -> name
# mapping is derived from the fitted indexer rather than hard-coded.
predicted_class = None
for index, raw_label in enumerate(decision_labels):
    is_this_class = predictions["prediction"] == index
    display_name = DISPLAY_NAMES.get(raw_label, raw_label)
    if predicted_class is None:
        predicted_class = when(is_this_class, display_name)
    else:
        predicted_class = predicted_class.when(is_this_class, display_name)
predictions = predictions.withColumn("predicted_class", predicted_class)

# Print the data with the predicted classes
predictions.select(*FEATURE_COLS, LABEL_COL, "predicted_class").show(5)

# Evaluate the performance of the model
evaluator = MulticlassClassificationEvaluator(labelCol=LABEL_COL + "_index", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Stop the SparkSession
spark.stop()

+--------+---------+-------+---------+-----------+------+--------+---------------+
|buyPrice|maintCost|noDoors|noPersons|bootLuggage|safety|decision|predicted_class|
+--------+---------+-------+---------+-----------+------+--------+---------------+
|   vhigh|     high|      2|        2|        big|   low|   unacc|   Unacceptable|
|   vhigh|     high|      2|        2|        med|   med|   unacc|   Unacceptable|
|   vhigh|     high|      2|        2|      small|   low|   unacc|   Unacceptable|
|   vhigh|     high|      2|        2|      small|   med|   unacc|   Unacceptable|
|   vhigh|     high|      2|        4|        med|  high|   unacc|   Unacceptable|
+--------+---------+-------+---------+-----------+------+--------+---------------+
only showing top 5 rows

Accuracy: 86.48%
