In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

In [2]:
# Set up Spark configuration
conf = SparkConf().setAppName("WineQualityPrediction")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [3]:
# Load training dataset
training_data = spark.read.csv("TrainingDataset.csv", header=True, inferSchema=True)

# Load validation dataset
validation_data = spark.read.csv("ValidationDataset.csv", header=True, inferSchema=True)

In [4]:
# Preprocess data
feature_columns = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
                   'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',
                   'density', 'pH', 'sulphates', 'alcohol']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
training_data = assembler.transform(training_data)
validation_data = assembler.transform(validation_data)

In [5]:
# Evaluate model performance
rf = RandomForestClassifier(labelCol="quality", featuresCol="features", numTrees=10)
model = rf.fit(training_data)

In [6]:
# Make predictions on validation data
predictions = model.transform(validation_data)

In [18]:
# Evaluate model performance
evaluator = MulticlassClassificationEvaluator(labelCol="quality", predictionCol="prediction", metricName="f1")
f1_score = evaluator.evaluate(predictions)

In [8]:
f1_score

0.5610330784574469