In [1]:
import pandas as pd
import pickle 

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import root_mean_squared_error as rmse

In [2]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')
df = df[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_distance']]
df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID,trip_distance
0,2023-01-01 00:32:10,2023-01-01 00:40:36,161,141,0.97
1,2023-01-01 00:55:08,2023-01-01 01:01:27,43,237,1.1
2,2023-01-01 00:25:04,2023-01-01 00:37:49,48,238,2.51
3,2023-01-01 00:03:48,2023-01-01 00:13:25,138,7,1.9
4,2023-01-01 00:10:29,2023-01-01 00:21:19,107,79,1.43


In [3]:
df['duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['duration'] = df['duration'].apply(lambda td: td.total_seconds()/60)
df.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID,trip_distance,duration
0,2023-01-01 00:32:10,2023-01-01 00:40:36,161,141,0.97,8.433333
1,2023-01-01 00:55:08,2023-01-01 01:01:27,43,237,1.1,6.316667
2,2023-01-01 00:25:04,2023-01-01 00:37:49,48,238,2.51,12.75
3,2023-01-01 00:03:48,2023-01-01 00:13:25,138,7,1.9,9.616667
4,2023-01-01 00:10:29,2023-01-01 00:21:19,107,79,1.43,10.833333


In [4]:
df.describe()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,PULocationID,DOLocationID,trip_distance,duration
count,3066766,3066766,3066766.0,3066766.0,3066766.0,3066766.0
mean,2023-01-17 00:22:26.288164,2023-01-17 00:38:06.427874,166.398,164.3926,3.847342,15.669
min,2008-12-31 23:01:42,2009-01-01 14:29:11,1.0,1.0,0.0,-29.2
25%,2023-01-09 16:21:57.250000,2023-01-09 16:37:06,132.0,114.0,1.06,7.116667
50%,2023-01-17 08:42:29.500000,2023-01-17 08:58:30.500000,162.0,162.0,1.8,11.51667
75%,2023-01-24 16:26:27,2023-01-24 16:42:49,234.0,234.0,3.33,18.3
max,2023-02-01 00:56:53,2023-02-02 09:28:47,265.0,265.0,258928.1,10029.18
std,,,64.24413,69.94368,249.5838,42.59435


In [5]:
new_df = df[(df['duration'] >= 1) & (df['duration'] <= 60)]
new_df.shape

(3009173, 6)

In [6]:
percentage = new_df.shape[0] / df.shape[0] * 100
print(percentage)

98.1220282212598


In [7]:
categorical = ['PULocationID', 'DOLocationID']
numerical =['trip_distance']

new_df[categorical] = new_df[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[categorical] = new_df[categorical].astype(str)


In [8]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
        df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)

    df = df[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'trip_distance']]
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [9]:
df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

In [10]:
train_dicts = new_df[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer()

X_train = dv.fit_transform(train_dicts)

target = 'duration'
y_train = new_df[target].values

In [11]:
lr = LinearRegression()

lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)
print("RMSE:", rmse(y_train, y_pred))

RMSE: 7.658396898909143


In [12]:
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

target = 'duration'
y_val = df_val[target].values

In [13]:
y_pred = lr.predict(X_val)
rmse(y_val, y_pred)

7.820263388747155