## Getting Data

In [None]:
# ! pip install kaggle
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle competitions download nyc-taxi-trip-duration
# ! unzip nyc-taxi-trip-duration.zip
# ! unzip test.zip
# ! unzip train.zip

## Spark Installation

In [None]:
# Requires pyspark in the kernel environment (e.g. `%pip install pyspark`).
import pyspark
from pyspark.sql import SparkSession

print(pyspark.__version__)

# A SparkSession is the entry point for the common data-processing tasks.
# getOrCreate() returns the existing session if one was created before and
# was not closed; otherwise it starts a new one named 'test'.
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)
spark

3.4.0


# Read data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load train.csv from the working directory, then preview 10 random rows
train_df = pd.read_csv(filepath_or_buffer="train.csv")
train_df.sample(n=10)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
221556,id2711025,1,2016-05-16 23:30:50,2016-05-16 23:54:05,1,-74.003815,40.750889,-73.998238,40.660568,N,1395
466781,id3322663,2,2016-04-27 19:16:33,2016-04-27 19:22:43,1,-73.971542,40.75819,-73.955231,40.773357,N,370
523896,id2124583,2,2016-01-08 12:54:52,2016-01-08 13:05:11,1,-73.963783,40.76609,-73.978851,40.762329,N,619
90951,id1762336,1,2016-05-24 18:39:53,2016-05-24 18:53:42,1,-73.998772,40.724899,-73.996971,40.744717,N,829
1036322,id2962682,2,2016-06-02 01:26:45,2016-06-02 01:33:06,1,-73.978699,40.745178,-73.973053,40.738468,N,381
512882,id2400065,1,2016-02-08 07:55:07,2016-02-08 08:14:30,1,-73.959625,40.781204,-73.989296,40.772961,N,1163
629767,id3693928,1,2016-03-30 01:14:03,2016-03-30 01:20:46,1,-73.973015,40.677792,-73.949142,40.659443,N,403
743549,id0718565,1,2016-04-07 07:36:18,2016-04-07 07:40:15,1,-73.966965,40.757442,-73.974075,40.750744,N,237
37600,id2563536,1,2016-03-23 21:34:14,2016-03-23 21:50:37,1,-73.987823,40.721241,-73.990013,40.733463,N,983
1401587,id1040598,1,2016-05-06 17:28:27,2016-05-06 17:56:10,1,-74.003128,40.748985,-73.963165,40.762142,N,1663


In [None]:
# explore train.csv: show the dtype pandas assigned to each column
train_df.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

# preprocessing data

In [None]:
# Count missing (N/A) values per column.
print(train_df.isna().sum())

id                    0
vendor_id             0
pickup_datetime       0
dropoff_datetime      0
passenger_count       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude     0
dropoff_latitude      0
store_and_fwd_flag    0
trip_duration         0
dtype: int64


In [None]:
def preprocessing(data_df):
    """Drop incomplete rows and derive calendar features from the pickup time.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing 'pickup_datetime' and 'dropoff_datetime' columns in
        a form ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame with rows containing missing values removed, both
        timestamp columns converted to datetimes, and four columns derived
        from the pickup time: 'day' (day of month), 'hour', 'month' and
        'week'.  Despite its name, 'week' holds ``dt.dayofweek``
        (0 = Monday ... 6 = Sunday), not a week number.
    """
    # Rows with any missing value are discarded before parsing.
    complete_rows = data_df.dropna()

    pickup = pd.to_datetime(complete_rows['pickup_datetime'])
    dropoff = pd.to_datetime(complete_rows['dropoff_datetime'])

    # assign() overwrites the two timestamp columns in place (position-wise)
    # and appends the derived columns in the order given here.
    return complete_rows.assign(
        pickup_datetime=pickup,
        dropoff_datetime=dropoff,
        day=pickup.dt.day,
        hour=pickup.dt.hour,
        month=pickup.dt.month,
        week=pickup.dt.dayofweek,
    )


In [None]:
# Clean the frame and add the calendar features, then preview 10 random rows.
train_df = preprocessing(data_df=train_df)
train_df.sample(n=10)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,day,hour,month,week
593774,id1443853,2,2016-06-30 11:07:18,2016-06-30 11:23:30,2,-73.999863,40.761337,-73.979149,40.771931,N,972,30,11,6,3
403594,id3877353,2,2016-03-21 08:12:08,2016-03-21 08:24:46,1,-73.996719,40.7421,-73.974136,40.751629,N,758,21,8,3,0
1188075,id0552833,1,2016-05-09 14:38:13,2016-05-09 15:19:43,1,-73.790176,40.646957,-73.956734,40.73354,N,2490,9,14,5,0
260030,id1893114,1,2016-05-22 03:27:15,2016-05-22 03:35:46,2,-73.992455,40.718769,-73.983086,40.696373,N,511,22,3,5,6
788642,id0038285,1,2016-05-12 08:32:34,2016-05-12 08:49:27,2,-73.963615,40.757587,-74.005692,40.705933,N,1013,12,8,5,3
1321177,id3639893,2,2016-02-19 14:41:05,2016-02-19 14:54:54,1,-73.99556,40.727074,-74.00206,40.719189,N,829,19,14,2,4
1348028,id2636697,1,2016-03-07 08:19:30,2016-03-07 08:22:48,1,-73.971375,40.746574,-73.970024,40.752602,N,198,7,8,3,0
771827,id3814018,2,2016-02-24 12:35:58,2016-02-24 13:06:18,1,-73.975204,40.755581,-73.990463,40.757408,N,1820,24,12,2,2
947188,id0067739,2,2016-04-06 19:28:38,2016-04-06 19:52:25,6,-73.885201,40.772598,-73.961548,40.806179,N,1427,6,19,4,2
123373,id1993876,2,2016-04-29 05:34:29,2016-04-29 05:38:30,1,-73.930275,40.766899,-73.940483,40.749565,N,241,29,5,4,4


In [None]:
# Range of trip durations, in seconds
print('Trip duration in seconds: {} to {}'.format(train_df['trip_duration'].min(), train_df['trip_duration'].max()))

# Range of passenger counts
print('Passengers: {} to {}'.format(train_df['passenger_count'].min(), train_df['passenger_count'].max()))

Trip duration in seconds: 1 to 3526282
Passengers: 0 to 9


In [None]:
# Remove outliers (trip duration and passenger count).
# Keep only trips lasting between 1 minute and 3 hours, bounds included.
MIN_TRIP_SECONDS = 60
MAX_TRIP_SECONDS = 3600 * 3

train_df = train_df[train_df.trip_duration.between(MIN_TRIP_SECONDS, MAX_TRIP_SECONDS)]

# checking Trip duration
print('Trip duration in seconds: {} to {}'.format(train_df.trip_duration.min(), train_df.trip_duration.max()))

# dropping trips with passenger count = 0
print('Empty trips: {}'.format(train_df[train_df.passenger_count == 0].shape[0]))
# The filtered frame must be stored back in `train_df`: every later cell reads
# `train_df`, so assigning only to `df_train` left the empty trips in the data.
# .copy() makes the result an independent frame so that columns can be added
# to it later without writing into a slice of the unfiltered data.
train_df = train_df[train_df.passenger_count > 0].copy()
df_train = train_df  # alias kept for any code that still refers to `df_train`

Trip duration in seconds: 60 to 10731
Empty trips: 17


## Feature Extraction

In [None]:
# Vectorized NumPy helpers (plain Python functions, not Spark UDFs) for the distance and direction between two points
import numpy as np

def haversine_distance(lat1, lng1, lat2, lng2):
    """Great-circle distance in kilometres between two points (Haversine formula).

    Parameters
    ----------
    lat1, lng1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lng2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.

    The arguments may be scalars or arrays of mutually broadcastable shapes
    (for example one fixed point against an array of points).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Distance(s) in kilometres on a sphere of radius 6371 km.
    """
    R = 6371  # Earth's radius in kilometers

    # Convert each argument separately. Stacking them into one list (as in
    # np.radians([lat1, lng1, lat2, lng2])) only works when all four have the
    # same shape, which prevents mixing scalars with arrays.
    lat1, lng1, lat2, lng2 = (
        np.radians(np.asarray(value)) for value in (lat1, lng1, lat2, lng2)
    )

    # Calculate the differences between the two points
    dlat = lat2 - lat1
    dlng = lng2 - lng1

    # Apply the Haversine formula
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng / 2) ** 2
    # Floating-point rounding can leave `a` marginally outside [0, 1], where
    # sqrt/arcsin would return NaN; clamp it to the valid domain.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))

    return R * c


def manhattan_distance(lat1, lng1, lat2, lng2):
    """Sum of the absolute latitude and longitude differences of two points.

    The result is in the same units as the inputs (degrees when the inputs
    are in degrees); no conversion to kilometres is performed.  Works on
    scalars and, element-wise, on NumPy arrays.
    """
    lng_gap = abs(lng1 - lng2)
    lat_gap = abs(lat1 - lat2)
    return lng_gap + lat_gap


def calculate_direction(lat1, lng1, lat2, lng2):
    """Initial bearing, in degrees, from point 1 towards point 2.

    Parameters
    ----------
    lat1, lng1 : float or array-like
        Latitude and longitude of the starting point, in decimal degrees.
    lat2, lng2 : float or array-like
        Latitude and longitude of the end point, in decimal degrees.

    The arguments may be scalars or arrays of mutually broadcastable shapes.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Direction(s) in degrees in the range [-180, 180], as produced by
        ``arctan2``: 0 is due north, 90 due east, -90 due west.
    """
    # Convert each coordinate to radians separately. Stacking the four values
    # into one list would require identical shapes and so rule out mixing
    # scalars with arrays.
    lat1, lng1, lat2, lng2 = (
        np.radians(np.asarray(value)) for value in (lat1, lng1, lat2, lng2)
    )

    # Calculate the difference in longitude
    lng_delta = lng2 - lng1

    # Calculate the y and x components of the direction vector
    y = np.sin(lng_delta) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta)

    # Calculate the direction in degrees
    return np.degrees(np.arctan2(y, x))


In [None]:
# Pickup and dropoff coordinates as NumPy arrays, in the
# (lat1, lng1, lat2, lng2) order that the three helpers take.
trip_coords = (
    train_df['pickup_latitude'].values,
    train_df['pickup_longitude'].values,
    train_df['dropoff_latitude'].values,
    train_df['dropoff_longitude'].values,
)

distance1 = haversine_distance(*trip_coords)   # haversine distance
distance2 = manhattan_distance(*trip_coords)   # manhattan distance
distance3 = calculate_direction(*trip_coords)  # direction (not a distance, despite the name)

# Attach the three engineered features to train_df.
train_df['haversine_distance'] = distance1
train_df['manhattan_distance'] = distance2
train_df['direction'] = distance3

In [None]:
# Preview 10 random rows, now including the three engineered columns.
train_df.sample(10)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,day,hour,month,week,haversine_distance,manhattan_distance,direction
1231333,id2372773,2,2016-04-26 15:58:19,2016-04-26 16:07:24,1,-74.00087,40.725891,-73.98835,40.727322,N,545,26,15,4,1,1.066933,0.01395,81.421908
887400,id1927007,2,2016-05-02 07:37:27,2016-05-02 07:57:11,1,-73.951843,40.773529,-74.002892,40.760441,N,1184,2,7,5,0,4.53873,0.064137,-108.685578
418768,id2644523,1,2016-06-22 21:14:41,2016-06-22 21:16:12,1,-73.940132,40.794216,-73.943047,40.789925,N,91,22,21,6,2,0.536575,0.007206,-152.789305
1194824,id1252511,2,2016-01-19 09:59:01,2016-01-19 10:11:36,2,-73.991608,40.754871,-73.974998,40.765148,N,755,19,9,1,1,1.806311,0.026886,50.750112
626551,id0868715,2,2016-03-20 13:46:18,2016-03-20 14:00:32,1,-73.982338,40.768269,-73.971535,40.797516,N,854,20,13,3,6,3.376954,0.040051,15.622102
679006,id3552959,1,2016-02-15 15:19:31,2016-02-15 16:59:27,1,-74.0056,40.750923,-73.782471,40.648777,N,5996,15,15,2,0,21.973224,0.325275,121.052264
98498,id3138431,1,2016-04-08 08:13:35,2016-04-08 08:25:33,1,-73.994095,40.750263,-74.00843,40.74617,N,718,8,8,4,4,1.29055,0.018429,-110.646106
160683,id0757245,1,2016-03-25 22:00:06,2016-03-25 22:09:14,2,-73.994629,40.760681,-73.981079,40.788353,N,548,25,22,3,4,3.281699,0.041222,20.341017
1188424,id1941372,2,2016-04-23 20:27:37,2016-04-23 20:55:38,1,-74.00692,40.705593,-73.997581,40.719387,N,1681,23,20,4,5,1.723977,0.023132,27.161762
1085441,id1120506,1,2016-01-01 18:51:17,2016-01-01 19:05:15,1,-73.977608,40.729328,-73.995255,40.760571,N,838,1,18,1,4,3.778717,0.048889,-23.161866


## Convert categorical data

In [None]:
# Columns treated as numerical features.
numerical_cols = [
    'haversine_distance',
    'manhattan_distance',
    'direction',
]

# Columns treated as categorical features (one-hot encoded below).
categorical_cols = [
    'vendor_id',
    'passenger_count',
    'store_and_fwd_flag',
    'day',
    'hour',
    'month',
    'week',
]

# Encode categorical data into numerical data using one-hot encoding
def encode_categorical(df, columns=None):
    """One-hot encode the categorical columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    columns : list of str, optional
        Columns to encode.  Defaults to the module-level ``categorical_cols``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each encoded column is removed and replaced by
        ``<column>_<value>`` indicator columns appended after the remaining
        columns, in the order the columns were listed.  ``df`` itself is not
        modified.

    Notes
    -----
    * All columns are encoded in a single ``pd.get_dummies`` call rather than
      joining each encoded column back on the index, which would duplicate
      rows whenever the index contains repeated labels.
    * The indicator dtype is pinned to ``uint8`` so the values are 0/1 on
      every pandas version.  Newer pandas defaults to ``bool``, which a CSV
      export writes as True/False instead of numbers.
    """
    if columns is None:
        columns = categorical_cols
    return pd.get_dummies(df, columns=list(columns), prefix_sep='_', dtype='uint8')
# Apply encoding to the training data (only train_df is encoded in this cell)
train_df = encode_categorical(train_df)

In [None]:
# Preview 10 random rows of the one-hot encoded frame.
train_df.sample(10)

Unnamed: 0,id,pickup_datetime,dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,haversine_distance,manhattan_distance,...,month_4,month_5,month_6,week_0,week_1,week_2,week_3,week_4,week_5,week_6
941586,id1822605,2016-03-18 06:38:08,2016-03-18 06:43:58,-74.009911,40.705261,-74.005608,40.717915,350,1.452981,0.016956,...,0,0,0,0,0,0,0,1,0,0
968800,id0801056,2016-02-01 15:00:28,2016-02-01 15:20:11,-73.990868,40.749557,-73.979301,40.762001,1183,1.692221,0.02401,...,0,0,0,1,0,0,0,0,0,0
579381,id0827974,2016-02-16 13:41:03,2016-02-16 13:53:04,-73.941704,40.755402,-73.918678,40.759388,721,1.989404,0.027012,...,0,0,0,0,1,0,0,0,0,0
892154,id1667183,2016-01-04 07:57:16,2016-01-04 08:02:33,-73.999191,40.754288,-73.986374,40.76746,317,1.81952,0.02599,...,0,0,0,1,0,0,0,0,0,0
765348,id3033287,2016-05-10 10:02:08,2016-05-10 10:11:50,-73.95472,40.777344,-73.961479,40.764801,582,1.50638,0.019302,...,0,1,0,0,1,0,0,0,0,0
515592,id2699842,2016-01-16 16:27:18,2016-01-16 16:34:18,-73.993805,40.74641,-73.996582,40.732025,420,1.616585,0.017162,...,0,0,0,0,0,0,0,0,1,0
495220,id0030059,2016-05-07 03:03:43,2016-05-07 03:13:27,-73.991966,40.725948,-73.978966,40.74057,584,1.960443,0.027622,...,0,1,0,0,0,0,0,0,1,0
473658,id2378094,2016-05-15 14:45:26,2016-05-15 14:55:12,-73.985237,40.741299,-73.994041,40.724682,586,1.991067,0.025421,...,0,1,0,0,0,0,0,0,0,1
450917,id0533124,2016-02-09 08:39:34,2016-02-09 08:50:11,-73.964966,40.755043,-73.983047,40.763248,637,1.775339,0.026287,...,0,0,0,0,1,0,0,0,0,0
1275662,id0941134,2016-05-02 08:21:38,2016-05-02 08:37:36,-73.954086,40.806141,-73.96946,40.791157,958,2.109659,0.030357,...,0,1,0,1,0,0,0,0,0,0


In [None]:
# List every column present after encoding.
train_df.columns

Index(['id', 'pickup_datetime', 'dropoff_datetime', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'trip_duration', 'haversine_distance', 'manhattan_distance',
       'direction', 'vendor_id_1', 'vendor_id_2', 'passenger_count_0',
       'passenger_count_1', 'passenger_count_2', 'passenger_count_3',
       'passenger_count_4', 'passenger_count_5', 'passenger_count_6',
       'passenger_count_8', 'passenger_count_9', 'store_and_fwd_flag_N',
       'store_and_fwd_flag_Y', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5',
       'day_6', 'day_7', 'day_8', 'day_9', 'day_10', 'day_11', 'day_12',
       'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19',
       'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25', 'day_26',
       'day_27', 'day_28', 'day_29', 'day_30', 'day_31', 'hour_0', 'hour_1',
       'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8',
       'hour_9', 'hour_10', 'hour_11', 'hour_12', 'h

In [None]:
# Drop the identifier, the raw timestamps and the raw coordinates; what
# remains is the target plus the engineered and one-hot encoded columns.
dftrainNew = train_df.drop(
    columns=[
        'id',
        'pickup_datetime',
        'dropoff_datetime',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ]
)

In [None]:
# Confirm which columns remain after the drop.
dftrainNew.columns

Index(['trip_duration', 'haversine_distance', 'manhattan_distance',
       'direction', 'vendor_id_1', 'vendor_id_2', 'passenger_count_0',
       'passenger_count_1', 'passenger_count_2', 'passenger_count_3',
       'passenger_count_4', 'passenger_count_5', 'passenger_count_6',
       'passenger_count_8', 'passenger_count_9', 'store_and_fwd_flag_N',
       'store_and_fwd_flag_Y', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5',
       'day_6', 'day_7', 'day_8', 'day_9', 'day_10', 'day_11', 'day_12',
       'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19',
       'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25', 'day_26',
       'day_27', 'day_28', 'day_29', 'day_30', 'day_31', 'hour_0', 'hour_1',
       'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8',
       'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14',
       'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20',
       'hour_21', 'hour_22', 'hour_23', 'month

In [None]:
# Preview 10 random rows of the final feature table.
dftrainNew.sample(10)

Unnamed: 0,trip_duration,haversine_distance,manhattan_distance,direction,vendor_id_1,vendor_id_2,passenger_count_0,passenger_count_1,passenger_count_2,passenger_count_3,...,month_4,month_5,month_6,week_0,week_1,week_2,week_3,week_4,week_5,week_6
400626,657,2.886017,0.039051,28.145572,1,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
1372913,627,1.23871,0.015083,91.996847,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1029592,762,1.971851,0.025429,22.842838,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,1,0
300016,952,1.975845,0.028603,113.581427,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1278845,69,0.207871,0.003071,60.003377,1,0,0,1,0,0,...,0,0,1,0,0,0,1,0,0,0
1349919,757,3.049089,0.045341,49.662838,0,1,0,1,0,0,...,0,1,0,1,0,0,0,0,0,0
83060,346,1.451714,0.013916,-177.080394,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1227884,357,1.17848,0.015358,-23.901169,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
91545,373,0.879355,0.011402,-156.626954,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
716994,772,4.282951,0.05994,32.836143,1,0,0,0,1,0,...,1,0,0,1,0,0,0,0,0,0


## Saving final features

In [None]:
# Write the feature table to data.csv without the pandas index column.
dftrainNew.to_csv('data.csv', index=False)

# Model

## Convert to spark

In [None]:
from pyspark.sql.functions import col

# Load the saved feature table. No schema is supplied, so the columns are
# cast explicitly below.
dfspark = spark.read.option('header','true').csv('data.csv')

# cast all columns to float in ONE projection. Calling withColumn() once per
# column in a loop adds a projection per call, and with this many columns the
# query plan becomes needlessly deep; a single select() yields the same
# columns, names and order.
dfspark = dfspark.select(
    [col(col_name).cast('float').alias(col_name) for col_name in dfspark.columns]
)


In [None]:
# Print the first 3 rows of the Spark DataFrame.
dfspark.show(3)

+-------------+------------------+------------------+----------+-----------+-----------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+--------------------+--------------------+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+------+------+------+------+------+
|trip_duration|haversine_distance|manhattan_distance| direction|vendor_id_1|vendor_id_2|passenger_count_0|passenger_count_1|passenger_count_2|passenger_count_3|passenger_count_4|passenger_count_5|passenger_count_6|passen

In [None]:
# Divide the dataset's columns into training features and the target.
target_colum_name = ['trip_duration']

# Start from a fresh list of all column names and take 'trip_duration' out,
# leaving only the feature columns.
X_column_names = list(dfspark.columns)
del X_column_names[X_column_names.index('trip_duration')]

print(X_column_names)


['haversine_distance', 'manhattan_distance', 'direction', 'vendor_id_1', 'vendor_id_2', 'passenger_count_0', 'passenger_count_1', 'passenger_count_2', 'passenger_count_3', 'passenger_count_4', 'passenger_count_5', 'passenger_count_6', 'passenger_count_8', 'passenger_count_9', 'store_and_fwd_flag_N', 'store_and_fwd_flag_Y', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'day_7', 'day_8', 'day_9', 'day_10', 'day_11', 'day_12', 'day_13', 'day_14', 'day_15', 'day_16', 'day_17', 'day_18', 'day_19', 'day_20', 'day_21', 'day_22', 'day_23', 'day_24', 'day_25', 'day_26', 'day_27', 'day_28', 'day_29', 'day_30', 'day_31', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23', 'month_1', 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'week_0', 'week_1', 'week_2', 'week_3', 'week_4', 'wee

In [None]:
# Combine the feature columns into one column ('Fvec') whose values are
# feature vectors, then keep only that column and the target.
from pyspark.ml.feature import VectorAssembler

v_asmblr = VectorAssembler(outputCol='Fvec', inputCols=X_column_names)
df = v_asmblr.transform(dfspark)
X = df.select('Fvec', 'trip_duration')
X.show(3)

+--------------------+-------------+
|                Fvec|trip_duration|
+--------------------+-------------+
|(84,[0,1,2,4,6,14...|        455.0|
|(84,[0,1,2,3,6,14...|        663.0|
|(84,[0,1,2,4,6,14...|       2124.0|
+--------------------+-------------+
only showing top 3 rows



In [None]:
# Show the last 3 rows of the assembled (features, target) frame.
X.tail(3)

[Row(Fvec=SparseVector(84, {0: 7.8246, 1: 0.1067, 2: -150.7885, 4: 1.0, 6: 1.0, 14: 1.0, 37: 1.0, 53: 1.0, 74: 1.0, 81: 1.0}), trip_duration=764.0),
 Row(Fvec=SparseVector(84, {0: 1.0926, 1: 0.0155, 2: 35.0333, 3: 1.0, 6: 1.0, 14: 1.0, 20: 1.0, 62: 1.0, 71: 1.0, 78: 1.0}), trip_duration=373.0),
 Row(Fvec=SparseVector(84, {0: 1.134, 1: 0.0156, 2: 29.9695, 3: 1.0, 6: 1.0, 14: 1.0, 20: 1.0, 61: 1.0, 74: 1.0, 78: 1.0}), trip_duration=198.0)]

In [None]:
# Split the DataFrame into training (80%) and testing (20%) sets.
# A fixed seed makes the split, and therefore every metric reported below,
# repeatable across runs.
# NOTE: from here on `train_df` / `test_df` are Spark DataFrames; `train_df`
# no longer refers to the pandas frame built earlier in the notebook.
train_df, test_df = X.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Show the last 3 rows of the training split.
train_df.tail(3)

[Row(Fvec=SparseVector(84, {4: 1.0, 11: 1.0, 14: 1.0, 43: 1.0, 65: 1.0, 75: 1.0, 82: 1.0}), trip_duration=1208.0),
 Row(Fvec=SparseVector(84, {4: 1.0, 11: 1.0, 14: 1.0, 43: 1.0, 68: 1.0, 74: 1.0, 80: 1.0}), trip_duration=992.0),
 Row(Fvec=SparseVector(84, {4: 1.0, 11: 1.0, 14: 1.0, 44: 1.0, 57: 1.0, 71: 1.0, 81: 1.0}), trip_duration=506.0)]

## 1- Linear regression

In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression with elastic-net regularisation on the 'Fvec' vectors.
lr = LinearRegression(
    featuresCol='Fvec',
    labelCol='trip_duration',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(train_df)

# Report the fitted parameters.
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [56.88982313891074,3683.1861741208795,0.08141773530789055,-0.04152722832167021,0.04152722832285207,0.0,-10.162374396737482,18.136251881316714,21.415303771193447,32.53459205336633,-1.3764626464254759,0.0,-616.6601290859348,0.0,-34.440359027958486,34.440359027633654,-16.641525920225387,-22.198335137284634,-7.49608447696092,-6.583349286410875,-10.013600280133332,-6.667294888463872,-24.731815486013637,0.0,-4.289082843436562,-12.937645911003212,-0.6684069372242349,9.335943297031736,7.794166699422202,10.92648093298884,-1.4096652944340655,8.92129280208403,5.800419388263425,1.9706451819069428,3.0523333748050083,5.302264343891207,11.379349138854604,10.935589315314646,2.146843475272444,0.0,14.927442426999653,30.141323293862712,5.226265379782637,-15.710468667554105,-6.198717154351251,-19.943581957720376,0.0,-99.98273950679588,-127.48367000731177,-149.66233056710027,-172.13145858756337,-231.40130283740336,-340.4093668225522,-252.76880701343256,-89.44871636357195,28.41946415833361,53.

In [None]:
# Evaluate the fitted model on the test split and show its last 20 prediction rows.
lr_model.evaluate(test_df).predictions.tail(20)

[Row(Fvec=SparseVector(84, {4: 1.0, 10: 1.0, 14: 1.0, 22: 1.0, 58: 1.0, 72: 1.0, 83: 1.0}), trip_duration=125.0, prediction=400.89714653564045),
 Row(Fvec=SparseVector(84, {4: 1.0, 10: 1.0, 14: 1.0, 23: 1.0, 63: 1.0, 76: 1.0, 79: 1.0}), trip_duration=216.0, prediction=693.5109988619438),
 Row(Fvec=SparseVector(84, {4: 1.0, 10: 1.0, 14: 1.0, 26: 1.0, 56: 1.0, 72: 1.0, 80: 1.0}), trip_duration=510.0, prediction=557.5284961776856),
 Row(Fvec=SparseVector(84, {4: 1.0, 10: 1.0, 14: 1.0, 28: 1.0, 62: 1.0, 76: 1.0, 77: 1.0}), trip_duration=312.0, prediction=612.121389364765),
 Row(Fvec=SparseVector(84, {4: 1.0, 10: 1.0, 14: 1.0, 29: 1.0, 50: 1.0, 75: 1.0, 82: 1.0}), trip_duration=313.0, prediction=318.8887770672435),
 Row(Fvec=SparseVector(84, {4: 1.0, 10: 1.0, 14: 1.0, 33: 1.0, 60: 1.0, 73: 1.0, 81: 1.0}), trip_duration=1089.0, prediction=579.9261835686469),
 Row(Fvec=SparseVector(84, {4: 1.0, 10: 1.0, 14: 1.0, 35: 1.0, 61: 1.0, 72: 1.0, 82: 1.0}), trip_duration=245.0, prediction=525.8137999

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the test split with the fitted linear-regression model (`lr_model`).
predictions = lr_model.transform(test_df)

# Evaluator comparing the 'prediction' column with the 'trip_duration' label
# using root mean squared error.
evaluator = RegressionEvaluator(
    metricName='rmse',
    predictionCol='prediction',
    labelCol='trip_duration',
)
rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")


Root Mean Squared Error (RMSE) on test data = 429.7017378984699


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# R-squared of the predictions currently held in `predictions`.
evaluator = RegressionEvaluator(
    metricName='r2',
    labelCol='trip_duration',
    predictionCol='prediction',
)
r2_score = evaluator.evaluate(predictions)

print('R-squared score on test data:', r2_score)


R-squared score on test data: 0.572564448071641


## 2- Decision Tree

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor

# Configure a decision-tree regressor (max depth 10) and fit it on the
# training split; `dt_model` ends up holding the fitted model.
dt_model = DecisionTreeRegressor(
    featuresCol='Fvec',
    labelCol='trip_duration',
    maxDepth=10,
).fit(train_df)

# Predict on the test split.
predictions = dt_model.transform(test_df)

# Root mean squared error of those predictions.
evaluator = RegressionEvaluator(
    metricName='rmse',
    predictionCol='prediction',
    labelCol='trip_duration',
)
rmse = evaluator.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")


Root Mean Squared Error (RMSE) on test data = 373.4049925005402


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# R-squared of the predictions currently held in `predictions`.
evaluator_r2 = RegressionEvaluator(labelCol='trip_duration', predictionCol='prediction', metricName='r2')
r2 = evaluator_r2.evaluate(predictions)

# Print the value computed in this cell (`r2`), not `r2_score`, which still
# holds the linear-regression result from an earlier cell.
print('R-squared score on test data:', r2)


R-squared score on test data: 0.572564448071641


## 3- Random Forest

In [None]:
from pyspark.ml.regression import RandomForestRegressor

# create a random forest regression model and set its parameters.
# A random forest is a stochastic estimator; fixing `seed` makes the fitted
# model and the reported metrics repeatable across runs.
rf_model = RandomForestRegressor(
    featuresCol='Fvec',
    labelCol='trip_duration',
    numTrees=100,
    maxDepth=10,
    seed=42,
)

# train the model on your training data
rf_model = rf_model.fit(train_df)

# make predictions on the test data
predictions = rf_model.transform(test_df)

# evaluate the model's root mean squared error (RMSE) on the test data
evaluator = RegressionEvaluator(labelCol='trip_duration', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)

# print the RMSE
print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")

Root Mean Squared Error (RMSE) on test data = 369.318470253129


In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# R-squared of the predictions currently held in `predictions`.
evaluator_r2 = RegressionEvaluator(labelCol='trip_duration', predictionCol='prediction', metricName='r2')
r2 = evaluator_r2.evaluate(predictions)

# Print the value computed in this cell (`r2`), not `r2_score`, which still
# holds the linear-regression result from an earlier cell.
print('R-squared score on test data:', r2)


R-squared score on test data: 0.572564448071641
