In [0]:
#creating SparkSession

from pyspark.sql import SparkSession
# getOrCreate() returns the already-running session when there is one,
# otherwise it starts a new session with the application name 'TDSQL'.
spark = (
    SparkSession.builder
    .appName('TDSQL')
    .getOrCreate()
)

In [0]:
# Load the taxi CSV into a DataFrame: the first row supplies the column names
# and the column types are inferred from the data.
df = spark.read.csv(
    '/FileStore/tables/taxi_data.csv',
    header=True,
    inferSchema=True,
)

In [0]:
#Converted Date columns from StringType to timestamp 
from pyspark.sql.functions import *

# Switch Spark to its legacy datetime parser before parsing the columns below.
# NOTE(review): presumably needed because the raw strings do not parse under the
# default Spark 3 parser with this pattern - confirm against the data.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "Legacy")

# Replace both trip datetime columns (read as strings) with real timestamps.
data = (
    df
    .withColumn(
        'tpep_pickup_datetime',
        to_timestamp(df['tpep_pickup_datetime'], 'MM/dd/yyyy HH:mm'),
    )
    .withColumn(
        'tpep_dropoff_datetime',
        to_timestamp(df['tpep_dropoff_datetime'], 'MM/dd/yyyy HH:mm'),
    )
)

In [0]:
import pyspark.sql.functions as f

# Engineered features
# - good_condition: collapse the weather `condition` categories into a dummy
#   that is 0 for any of the bad-weather categories listed below and 1 otherwise.
# - extreme_temp: temperature should only make a difference when it causes
#   inconvenience/discomfort, so flag readings above 86 or below 21.
# - date, year, month, hour, day: parts of tpep_pickup_datetime used for the
#   analysis (day is the day of the week as returned by dayofweek).
# - trip_time: dropoff minus pickup, in seconds (difference of unix timestamps).
# - covid: 1 for pickups after 2020-03-08 00:00:00, 0 otherwise.

bad_conditions = [
    "Snow",
    "Rain / Windy",
    "Heavy Rain",
    "Rain",
    "Heavy T-Storm",
    "Thunder in the Vicinity",
    "Thunder",
    "Light Rain with Thunder",
    "Thunder / Windy",
    "T-Storm",
]

pickup_ts = f.col("tpep_pickup_datetime")
dropoff_ts = f.col("tpep_dropoff_datetime")

# Every Spark function is reached through the `f` alias so this cell does not
# depend on names injected by a wildcard import elsewhere in the notebook.
data = (
    data
    # A single isin() test replaces one when() branch per category; a null
    # condition matches nothing and therefore still falls through to 1.
    .withColumn("good_condition", f.when(f.col("condition").isin(bad_conditions), 0).otherwise(1))
    .withColumn("extreme_temp", f.when((f.col("temperature") > 86) | (f.col("temperature") < 21), 1).otherwise(0))
    .withColumn("date", f.to_date(pickup_ts))
    .withColumn("year", f.year(pickup_ts))
    .withColumn("month", f.month(pickup_ts))
    .withColumn("hour", f.hour(pickup_ts))
    .withColumn("day", f.dayofweek(pickup_ts))
    .withColumn("trip_time", f.unix_timestamp(dropoff_ts) - f.unix_timestamp(pickup_ts))
    .withColumn("covid", f.when(pickup_ts > "2020-03-08 00:00:00", 1).otherwise(0))
)

# Preview a single row to check the new columns
data.show(1)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+----------+--------------+----------+---------+-------+------------+--------------+----------+--------------+----------+---------+-------+------------+--------------+-----------+-----------+--------+----------+--------+------+----------+--------------+------------+----------+----+-----+----+---+---------+-----+
|vendorid|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|ratecodeid|store_and_fwd_flag|pulocationid|dolocationid|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|borough_pu|median_rlst_pu|tourist_pu|entert_pu|park_pu|workplace_pu|residential_pu|borough_do|median_rlst_do|tourist_do|entert_do|park_do|workplace_do|residential_do|  rat

In [0]:
# Keep only rows without a null in any column, then keep only trips whose
# fare_amount is above 2.5 (considering the minimum fare).
data = data.dropna().filter("fare_amount > 2.5")

# Number of rows left after cleaning
data.count()

Out[7]: 994464

In [0]:
# Create a 70-30 train test split.
# A fixed seed keeps the split - and therefore every metric reported further
# down - repeatable across runs instead of changing on each execution.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Import the required libraries

from pyspark.ml.regression import LinearRegression,DecisionTreeRegressor,GBTRegressor,RandomForestRegressor
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

In [0]:
# Map the categorical pickup borough to a numeric index column.
# handleInvalid='keep' assigns values that were not seen while fitting to an
# extra index instead of raising an error at transform time.
# NOTE(review): the index is only a label rank; a linear model will read it as
# an ordinal quantity - consider one-hot encoding if that is not intended.
borough_pu_indexer = StringIndexer(
    inputCol='borough_pu',
    outputCol='borough_pu_index',
    handleInvalid='keep',
)

In [0]:
# Pack the model inputs into a single vector column called "features".
# The position of each name here is its index in that vector.
assembler = VectorAssembler(
    inputCols=[
        "trip_time",
        "trip_distance",
        "borough_pu_index",
        "tourist_pu",
        "entert_pu",
        "covid",
        "good_condition",
        "extreme_temp",
        "hour",
    ],
    outputCol="features",
)

In [0]:
# Chain the indexer and the assembler into one pipeline (stages run in the
# order listed), so the test data is pre-processed in exactly the same way as
# the train data.
pipe = Pipeline(
    stages=[
        borough_pu_indexer,
        assembler,
    ],
)

In [0]:
# Fit the pre-processing pipeline on the train data and apply it to that same data.
# NOTE(review): train_data is overwritten with its transformed version, so
# re-running this cell without first re-creating the split will likely fail
# because the pipeline's output columns already exist.
fitted_pipe = pipe.fit(train_data)
train_data = fitted_pipe.transform(train_data)

# Keep only the feature vector and the label for model training
train_data_model = train_data.select('features', 'fare_amount')

In [0]:
# Pre-process the test data with the pipeline fitted on the train data.
# This only adds the indexed borough and the "features" vector; the
# fare_amount predictions are produced later by the fitted regression models.

test_data = fitted_pipe.transform(test_data)

### Linear Regression Model

In [0]:
# Models to evaluate, as (display name, estimator) pairs
regression_model = [
    ('Linear Regression', LinearRegression(labelCol='fare_amount')),
]

# Both evaluators compare the "prediction" column with the true "fare_amount";
# they do not depend on the model, so they are built once up front.
rmse_evaluator = RegressionEvaluator(
    labelCol="fare_amount", predictionCol="prediction", metricName="rmse"
)
r2_evaluator = RegressionEvaluator(
    labelCol="fare_amount", predictionCol="prediction", metricName="r2"
)

for name, model in regression_model:
    # Train on the train split, then predict fare_amount for the test split
    fit_model = model.fit(train_data_model)
    model_predictions = fit_model.transform(test_data)

    # Score the test-set predictions
    model_rmse = rmse_evaluator.evaluate(model_predictions)
    model_r2 = r2_evaluator.evaluate(model_predictions)

    # Printed as: name, r2, rmse
    print(name, model_r2, model_rmse)

    # Coefficients follow the order of the assembled feature vector
    print("Coefficients: %s" % str(fit_model.coefficients))
    print("Intercept: %s" % str(fit_model.intercept))

Linear Regression 0.8781824652375988 4.019421761460171
Coefficients: [5.9443161677198003e-05,2.780341490311937,0.5285954920480931,-0.6639644164110117,0.04829530760950384,-1.0314754253612226,-0.5077999698012651,0.2695014940018906,0.01656184631958848]
Intercept: 5.043060588304563


###### Positive effects (positive linear-regression coefficient) - trip_time, trip_distance, borough_pu_index, entert_pu, extreme_temp, hour
###### Negative effects (negative linear-regression coefficient) - tourist_pu, covid, good_condition

### Decision Tree, Gradient-Boosted Trees and Random Forest Regressor

In [0]:
# Tree-based models to evaluate, as (display name, estimator) pairs.
# Each estimator gets an explicit seed so that repeated runs on the same data
# build the same trees; the random forest in particular samples rows and
# features, so an unseeded fit is not guaranteed to repeat.
regression_models = [
    ('Decision Tree Regression',
     DecisionTreeRegressor(labelCol='fare_amount', seed=42)),
    ('GBT Regression',
     GBTRegressor(featuresCol='features', labelCol='fare_amount', maxIter=10, seed=42)),
    ('Random Forest Regressor',
     RandomForestRegressor(featuresCol='features', labelCol='fare_amount', seed=42)),
]

# Both evaluators compare the "prediction" column with the true "fare_amount";
# they do not depend on the model, so they are built once up front.
rmse_evaluator = RegressionEvaluator(labelCol="fare_amount", predictionCol="prediction", metricName="rmse")
r2_evaluator = RegressionEvaluator(labelCol="fare_amount", predictionCol="prediction", metricName="r2")

for name, model in regression_models:

    # Fit the model on the train data
    fit_model = model.fit(train_data_model)

    # Use the fitted model to predict fare_amount for the test data
    model_predictions = fit_model.transform(test_data)

    # Score the test-set predictions
    model_rmse = rmse_evaluator.evaluate(model_predictions)
    model_r2 = r2_evaluator.evaluate(model_predictions)

    # Printed as: name, r2, rmse
    print(name, model_r2, model_rmse)

    # Feature importances, indexed by position in the assembled "features" vector
    print(fit_model.featureImportances)

Decision Tree Regression 0.8761115921337338 4.053442457305627
(9,[0,1,2,3,8],[0.1440136906937857,0.8495381745636229,0.0035232801543304913,0.0007176942877083667,0.00220716030055247])
GBT Regression 0.8858793064979895 3.890369729479086
(9,[0,1,2,3,7,8],[0.18399666589464103,0.7629880676285918,0.027576376223136776,0.016281144699399244,0.0014499237682216207,0.00770782178600943])
Random Forest Regressor 0.8670787381582863 4.1986135596750485
(9,[0,1,2,3,4,5,6,7,8],[0.33292617948447517,0.5562262306587793,0.10006077212176452,0.006444217394858483,0.0017618490388756046,5.208197748289745e-05,5.215706884488757e-05,2.5204070336248657e-05,0.002451308184582815])


##### Feature importances (features with non-negligible importance)
###### Decision Tree Regression: trip_time, trip_distance, borough_pu_index, tourist_pu, hour
###### GBT Regression: trip_time, trip_distance, borough_pu_index, tourist_pu, extreme_temp, hour
###### Random Forest Regression: trip_time, trip_distance, borough_pu_index, tourist_pu, entert_pu, hour (covid, good_condition and extreme_temp are non-zero but negligible, each below 1e-4)