**10-fold cross-validation with the Gradient Boosting Regression model**

In [1]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('DLD_project_cross')
    .getOrCreate()
)

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive_new so the
# dataset stored under MyDrive can be read through a regular file path.
from google.colab import drive
drive.mount('/content/drive_new')

Mounted at /content/drive_new


In [3]:
# Load the dataset (Parquet format) from the mounted Drive folder.
df = (
    spark.read
    .format("parquet")
    .load("/content/drive_new/MyDrive/cleaned_dataset")
)

In [4]:
# Partition the dataset into the folds used for cross-validation.
# Equal weights request folds of (approximately) equal size.
# A fixed seed makes fold membership — and therefore every metric reported
# below — reproducible across kernel restarts; without it each run draws
# different folds.
NUM_FOLDS = 10
SPLIT_SEED = 42
splits = df.randomSplit([1.0] * NUM_FOLDS, seed=SPLIT_SEED)

In [6]:
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler, OneHotEncoder, StringIndexer, StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# --- Column groups derived from the DataFrame schema -------------------------
# String columns are treated as categorical; double columns (except the label
# "actual_worth") are treated as numeric features.
categorical_cols = [name for name, dtype in df.dtypes if dtype == "string"]
numeric_cols = [
    name
    for name, dtype in df.dtypes
    if dtype == "double" and name != "actual_worth"
]
index_output_cols = [name + "Index" for name in categorical_cols]
ohe_output_cols = [name + "OHE" for name in categorical_cols]

# --- Feature-engineering stages ----------------------------------------------
# NOTE(review): handleInvalid="skip" makes the fitted indexer drop rows whose
# category was not seen during fit, so test folds may be scored on fewer rows
# than they contain — confirm this is intended.
string_indexer = StringIndexer(
    inputCols=categorical_cols,
    outputCols=index_output_cols,
    handleInvalid="skip",
)
ohe_encoder = OneHotEncoder(
    inputCols=index_output_cols,
    outputCols=ohe_output_cols,
)

assembler_inputs = ohe_output_cols + numeric_cols
vec_assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features",
)

# NOTE(review): withMean=True centres the features, which produces dense
# vectors even when the one-hot input is sparse — check memory use on wide
# categorical data.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=True,
)

# --- Model and full pipeline -------------------------------------------------
gbt = GBTRegressor(
    featuresCol="scaled_features",
    labelCol="actual_worth",
    maxIter=20,
    maxDepth=5,
)
pipeline_gbt = Pipeline(
    stages=[string_indexer, ohe_encoder, vec_assembler, scaler, gbt]
)

In [7]:
from functools import reduce

# Cross-validation of the GBT pipeline: each split is held out once as the
# test fold while the pipeline is refit on the union of all other splits.
# Per-fold RMSE and R2 are collected in `errors_rmse` / `errors_r2`.
schema = df.schema  # not needed by this loop; kept because later cells reference `schema`
errors_rmse = []
errors_r2 = []

# The evaluators do not depend on the fold, so they are built once. Using one
# instance per metric avoids switching a shared evaluator's metric mid-loop.
rmse_evaluator_gbt = RegressionEvaluator(predictionCol = "prediction", labelCol = "actual_worth", metricName = "rmse")
r2_evaluator_gbt = RegressionEvaluator(predictionCol = "prediction", labelCol = "actual_worth", metricName = "r2")

for i in range(len(splits)):
  test_df = splits[i]
  # Training data = every split except the held-out one, unioned directly
  # (no empty placeholder DataFrame is needed to start the union).
  train_df = reduce(
    lambda left, right: left.union(right),
    [split for j, split in enumerate(splits) if j != i],
  )

  pipeline_model_gbt = pipeline_gbt.fit(train_df)
  prediction_gbt = pipeline_model_gbt.transform(test_df)

  # Both metrics read the same predictions. Cache just the label and
  # prediction columns so the pipeline transform is not recomputed per metric.
  scored_gbt = prediction_gbt.select("actual_worth", "prediction").cache()
  try:
    rmse_gbt = rmse_evaluator_gbt.evaluate(scored_gbt)
    r2_gbt = r2_evaluator_gbt.evaluate(scored_gbt)
  finally:
    scored_gbt.unpersist()

  errors_rmse.append(rmse_gbt)
  errors_r2.append(r2_gbt)

In [8]:
# Average the per-fold GBT metrics (arithmetic mean over the folds) and report them.
rmse_total = sum(errors_rmse)
r2_total = sum(errors_r2)
average_rmse_error = rmse_total / len(errors_rmse)
average_r2_error = r2_total / len(errors_r2)

print(f"Average RMSE: {average_rmse_error:.4f}")
print(f"Average R2: {average_r2_error:.4f}")

Average RMSE: 677974.3822
Average R2: 0.6150


**10-fold cross-validation with the Decision Tree Regression model**

In [10]:
from pyspark.ml.regression import DecisionTreeRegressor

# Decision-tree pipeline: the indexed categorical columns are fed to the tree
# directly together with the numeric columns (no one-hot encoding or scaling).
assembler_inputs_tree = index_output_cols + numeric_cols
vec_assembler_tree = VectorAssembler(
    inputCols=assembler_inputs_tree,
    outputCol="features",
)

# NOTE(review): maxBins=135 is presumably sized to cover the largest number of
# distinct values among the indexed categorical columns — TODO confirm.
dt = DecisionTreeRegressor(
    labelCol="actual_worth",
    featuresCol="features",
    maxBins=135,
)

stages_tree = [string_indexer, vec_assembler_tree, dt]
pipeline_tr = Pipeline(stages=stages_tree)

In [11]:
from functools import reduce

# Cross-validation of the decision-tree pipeline: each split is held out once
# as the test fold while the pipeline is refit on the union of all other
# splits. Per-fold RMSE and R2 are collected in `errors_dt_rmse` / `errors_dt_r2`.
errors_dt_rmse = []
errors_dt_r2 = []

# The evaluators do not depend on the fold, so they are built once. Using one
# instance per metric avoids switching a shared evaluator's metric mid-loop.
rmse_evaluator_dt = RegressionEvaluator(predictionCol = "prediction", labelCol = "actual_worth", metricName = "rmse")
r2_evaluator_dt = RegressionEvaluator(predictionCol = "prediction", labelCol = "actual_worth", metricName = "r2")

for i in range(len(splits)):
  test_df = splits[i]
  # Training data = every split except the held-out one, unioned directly.
  # This removes the need for an empty placeholder DataFrame (and for the
  # `schema` variable defined in another cell).
  train_df = reduce(
    lambda left, right: left.union(right),
    [split for j, split in enumerate(splits) if j != i],
  )

  pipeline_model_dt = pipeline_tr.fit(train_df)
  prediction_dt = pipeline_model_dt.transform(test_df)

  # Both metrics read the same predictions. Cache just the label and
  # prediction columns so the pipeline transform is not recomputed per metric.
  scored_dt = prediction_dt.select("actual_worth", "prediction").cache()
  try:
    rmse_dt = rmse_evaluator_dt.evaluate(scored_dt)
    r2_dt = r2_evaluator_dt.evaluate(scored_dt)
  finally:
    scored_dt.unpersist()

  errors_dt_rmse.append(rmse_dt)
  errors_dt_r2.append(r2_dt)

In [12]:
# Average the per-fold decision-tree metrics (arithmetic mean over the folds) and report them.
rmse_dt_total = sum(errors_dt_rmse)
r2_dt_total = sum(errors_dt_r2)
average_dt_rmse_error = rmse_dt_total / len(errors_dt_rmse)
average_dt_r2_error = r2_dt_total / len(errors_dt_r2)

print(f"Average RMSE: {average_dt_rmse_error:.4f}")
print(f"Average R2: {average_dt_r2_error:.4f}")

Average RMSE: 686825.5547
Average R2: 0.6049
