In [1]:
from pyspark.sql import SparkSession

In [2]:
# Attach to the already-running Spark session when there is one; otherwise
# start a new session with the builder's default configuration.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [3]:
# Load the dataset from the local parquet copy.
# Alternative HDFS location: hdfs://namenode:9000/data2/cleaned_data.parquet
df = spark.read.format("parquet").load("../include/cleaned_data.parquet")

In [4]:
# Show every column's name, Spark type and nullability before picking features.
df.printSchema()

root
 |-- vendorid: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- ratecodeid: long (nullable = true)
 |-- store_and_fwd_flag: long (nullable = true)
 |-- pulocationid: long (nullable = true)
 |-- dolocationid: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- trip_duration_seconds: double (nullable = true)



In [None]:

from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler

# Predictor columns and the regression target used by every model below.
inputCols = ["trip_duration_seconds", "trip_distance", "passenger_count"]
outputCol = "total_amount"

# Pack the predictor columns into a single vector column, keeping only that
# vector and the target.
assembler = VectorAssembler(inputCols=inputCols, outputCol="assembledInputFeatures")
dfAssembled = assembler.transform(df).select("assembledInputFeatures", outputCol)

# Split BEFORE fitting any scaler: fitting on the full dataset would let the
# test rows influence the mean/std/min/max used to build the features
# (data leakage) and make the held-out metrics optimistic.
testSize = 0.2
trainSize = 1 - testSize
trainAssembled, testAssembled = dfAssembled.randomSplit([trainSize, testSize], seed=42)

# Standardize using statistics learned from the training rows only.
scaler = StandardScaler(inputCol="assembledInputFeatures", outputCol="scaledInputFeatures", withMean=True, withStd=True)
scalerModel = scaler.fit(trainAssembled)

# Rescale the standardized vector using the training rows' min/max only.
# Test rows may therefore land slightly outside the training range.
# NOTE(review): min-max scaling after standardization should give the same
# values as min-max scaling the raw vector (both are per-feature affine maps);
# the standardization step is kept so `scalerModel` stays available -- confirm
# whether both stages are really wanted.
minmax = MinMaxScaler(inputCol="scaledInputFeatures", outputCol="features")
minmaxModel = minmax.fit(scalerModel.transform(trainAssembled))

# Apply the train-fitted transformers to each split.
trainDf = minmaxModel.transform(scalerModel.transform(trainAssembled)).select("features", outputCol)
testDf = minmaxModel.transform(scalerModel.transform(testAssembled)).select("features", outputCol)

# Whole-dataset views, kept under their original names; they now use the
# train-fitted transformers as well.
dfScaled = scalerModel.transform(dfAssembled)
dfFinal = minmaxModel.transform(dfScaled).select("features", outputCol)

In [None]:
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [None]:
# Linear regression tuned by grid search over regularization strength and
# L1/L2 mix (elasticNetParam 0.0 = ridge, 1.0 = lasso).
lr = LinearRegression(featuresCol="features", labelCol=outputCol)

paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# RMSE on the target column is the model-selection metric.
evaluator = RegressionEvaluator(labelCol=outputCol, predictionCol="prediction", metricName="rmse")

# 3-fold cross-validation over the 9 grid points, fitting two models at a time.
# The seed fixes the fold assignment so that a fresh run selects the same model.
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid,
                    evaluator=evaluator, numFolds=3, parallelism=2, seed=42)

cvModel = cv.fit(trainDf)

# Random forest baseline with fixed hyper-parameters (no tuning).
# The seed makes the bootstrap/feature sampling repeatable across runs.
rf = RandomForestRegressor(featuresCol="features", labelCol=outputCol, numTrees=50, maxDepth=5, seed=42)
rfModel = rf.fit(trainDf)

In [None]:
def evaluateModel(model, data, labelCol):
    """Score a fitted model on a dataset and print its RMSE and R2.

    Args:
        model: Fitted transformer whose ``transform`` adds a ``prediction`` column.
        data: DataFrame to score; must contain the model's feature column and
            ``labelCol``.
        labelCol: Name of the ground-truth column both metrics are computed against.

    Returns:
        The DataFrame returned by ``model.transform(data)``.
    """
    predictions = model.transform(data)

    # Build both evaluators from `labelCol` so RMSE and R2 always refer to the
    # same label, and the function does not depend on an `evaluator` variable
    # created in another cell.
    metrics = {}
    for metricName in ("rmse", "r2"):
        metricEvaluator = RegressionEvaluator(
            labelCol=labelCol, predictionCol="prediction", metricName=metricName
        )
        metrics[metricName] = metricEvaluator.evaluate(predictions)

    print(f"RMSE: {metrics['rmse']:.4f}")
    print(f"R2: {metrics['r2']:.4f}")
    return predictions

In [None]:
# Held-out metrics for the linear regression selected by cross-validation.
print("Cross-validation LR model's accuracy:")
evaluateModel(cvModel.bestModel, testDf, outputCol)

# "\n" puts a blank line between the two reports; the previous "\R" was an
# invalid escape sequence that printed a literal backslash.
print("\nRandom Forest Regressor's accuracy:")
evaluateModel(rfModel, testDf, outputCol)

# Persist the best linear model, replacing any previous save at this path.
# NOTE(review): only the regression model is written here; the fitted
# scalerModel / minmaxModel are not saved, so anything loading this model must
# rebuild the "features" column with the same preprocessing.
weightsPath = "models/mainLRModel"
cvModel.bestModel.write().overwrite().save(weightsPath)