In [13]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

In [14]:
# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Project 1 - Modelling")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config('spark.driver.memory', '8g')
    .config('spark.executor.memory', '8g')
    .getOrCreate()
)

In [15]:
year1_sdf = spark.read.parquet('../data/tlc_data/curated/2023_tlc.parquet/')
year2_sdf = spark.read.parquet('../data/tlc_data/curated/2024_tlc.parquet/')

In [16]:
from pyspark.sql import functions as F

# Calculate the 75th percentile for profitability
quantile_value = year1_sdf.approxQuantile("profitability", [0.75], 0.05)[0]

# Create a boolean column where 1 indicates profitability above the 75th percentile
year1_sdf = year1_sdf.withColumn("profitability", (F.col("profitability") > quantile_value).cast("int"))
year2_sdf = year2_sdf.withColumn("profitability", (F.col("profitability") > quantile_value).cast("int"))

                                                                                

In [17]:
from pyspark.ml.feature import VectorAssembler

# Features to include in the model
feature_cols = ['month', 'date_of_month', 'hour', 'day_of_week', 'pulocationid', 'passenger_count', 'temp', 'dwpt', 'rhum', 'prcp', 'wspd', 'pres']

# Assemble features into a single vector column
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
train_sdf = assembler.transform(year1_sdf)
test_sdf = assembler.transform(year2_sdf)

train_sdf = train_sdf.select('features', 'profitability')
test_sdf = test_sdf.select('features', 'profitability')

In [18]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(inputCol="features", outputCol="s_features", withMean=True, withStd=True)

scaler_model = scaler.fit(train_sdf)

train_sdf = scaler_model.transform(train_sdf)
test_sdf = scaler_model.transform(test_sdf)  # Transform the test data using the same scaler

                                                                                

In [19]:
feature_col = 's_features'
#feature_col = 'features'

In [20]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator_accuracy = MulticlassClassificationEvaluator(labelCol="profitability", metricName="accuracy")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="profitability", metricName="f1")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="profitability", metricName="weightedRecall")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="profitability", metricName="weightedPrecision")

In [21]:
from pyspark.ml.classification import LogisticRegression

# Initialize the Logistic Regression model
lr = LogisticRegression(featuresCol=feature_col, labelCol='profitability')

# Train the model
lr_model = lr.fit(train_sdf)

# Make predictions
predictions_lr = lr_model.transform(test_sdf)

                                                                                

In [22]:
from pyspark.ml.classification import LinearSVC

# Initialize the SVM model
svm = LinearSVC(featuresCol=feature_col, labelCol='profitability')

# Train the model
svm_model = svm.fit(train_sdf)

# Make predictions
predictions_svm = svm_model.transform(test_sdf)

                                                                                

In [23]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Define the layers of the neural network
layers = [len(feature_cols), 10, 10, 2]  # input layer, two hidden layers, and output layer

# Initialize the Multilayer Perceptron model
mlp = MultilayerPerceptronClassifier(featuresCol=feature_col, labelCol='profitability', layers=layers, seed=1234)

# Train the model
mlp_model = mlp.fit(train_sdf)

# Make predictions
predictions_mlp = mlp_model.transform(test_sdf)

                                                                                

In [24]:
# Evaluate the model
accuracy = evaluator_accuracy.evaluate(predictions_lr)
print(f"Logistic Regression - Accuracy: {accuracy}")
# Evaluate using Recall
recall = evaluator_recall.evaluate(predictions_lr)
print(f"Logistic Regression - Recall: {recall}")
# Evaluate using Precision
precision = evaluator_precision.evaluate(predictions_lr)
print(f"Logistic Regression - Precision: {precision}")
# Evaluate using F1 score
f1 = evaluator_f1.evaluate(predictions_lr)
print(f"Logistic Regression - F1 Score: {f1}")

# Evaluate using accuracy
accuracy = evaluator_accuracy.evaluate(predictions_svm)
print(f"SVM - Accuracy: {accuracy}")
# Evaluate using Recall
recall = evaluator_recall.evaluate(predictions_svm)
print(f"SVM - Recall: {recall}")
# Evaluate using Precision
precision = evaluator_precision.evaluate(predictions_svm)
print(f"SVM - Precision: {precision}")
# Evaluate using F1 score
f1 = evaluator_f1.evaluate(predictions_svm)
print(f"SVM - F1 Score: {f1}")

# Evaluate using accuracy
accuracy_mlp = evaluator_accuracy.evaluate(predictions_mlp)
print(f"Neural Network - Accuracy: {accuracy_mlp}")
# Evaluate using Recall
recall_mlp = evaluator_recall.evaluate(predictions_mlp)
print(f"Neural Network - Recall: {recall_mlp}")
# Evaluate using Precision
precision_mlp = evaluator_precision.evaluate(predictions_mlp)
print(f"Neural Network - Precision: {precision_mlp}")
# Evaluate using F1 score
f1_mlp = evaluator_f1.evaluate(predictions_mlp)
print(f"Neural Network - F1 Score: {f1_mlp}")

                                                                                

Logistic Regression - Accuracy: 0.742244942897953


                                                                                

Logistic Regression - Recall: 0.742244942897953


                                                                                

Logistic Regression - Precision: 0.5509275552575854


                                                                                

Logistic Regression - F1 Score: 0.6324340988945024


                                                                                

SVM - Accuracy: 0.742244942897953


                                                                                

SVM - Recall: 0.742244942897953


                                                                                

SVM - Precision: 0.5509275552575854


                                                                                

SVM - F1 Score: 0.6324340988945024


                                                                                

Neural Network - Accuracy: 0.7422219878514312


                                                                                

Neural Network - Recall: 0.7422219878514312


                                                                                

Neural Network - Precision: 0.6682807295508146




Neural Network - F1 Score: 0.632640877234734


                                                                                