In [1]:
import pandas as pd
import numpy as np
import sklearn 
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import seaborn as sns

In [2]:
# Fetch the January and February 2023 yellow-taxi trip records, in that order,
# straight from the public parquet files.
jan_df, feb_df = map(
    pd.read_parquet,
    (
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
    ),
)

In [3]:
# A1: how many columns the January frame has.
print(f'A1: Jan 2023 NYC-Yellow-Taxi data columns => {len(jan_df.columns)}')

A1: Jan 2023 NYC-Yellow-Taxi data columns => 19


In [4]:
# Show the first five rows (head() defaults to n=5).
jan_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [5]:
def prep_data(df):
    """Return a copy of ``df`` with parsed timestamps and a ``duration`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records containing ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime`` columns (datetimes or datetime-like strings).

    Returns
    -------
    pandas.DataFrame
        A new frame in which both timestamp columns are datetimes and a
        ``duration`` column (appended last) holds drop-off minus pick-up in
        minutes. The frame passed in is left untouched.
    """
    pickup = pd.to_datetime(df['tpep_pickup_datetime'])
    dropoff = pd.to_datetime(df['tpep_dropoff_datetime'])
    # `.dt.total_seconds()` is the vectorized equivalent of calling
    # `total_seconds()` on every row through `apply`, without the per-row
    # Python call overhead.
    duration_minutes = (dropoff - pickup).dt.total_seconds() / 60
    # `assign` builds a new frame, so re-running the calling cell never
    # operates on an already-modified input.
    return df.assign(
        tpep_pickup_datetime=pickup,
        tpep_dropoff_datetime=dropoff,
        duration=duration_minutes,
    )
# Run the duration preparation on both monthly frames, January first.
jan_df, feb_df = prep_data(jan_df), prep_data(feb_df)

In [6]:
# Summary statistics of the January durations, each rendered with two decimals.
(
    jan_df['duration']
    .describe()
    .map(lambda stat: f'{stat:.2f}')
)

count    3066766.00
mean          15.67
std           42.59
min          -29.20
25%            7.12
50%           11.52
75%           18.30
max        10029.18
Name: duration, dtype: object

In [7]:
# Row count of the January frame at this point; kept for later comparison.
num_raw_jan_df = len(jan_df)

In [8]:
# A2: standard deviation of the January trip durations (minutes).
# `Series.std()` gives the same figure as `describe()['std']` without computing
# the rest of the summary. Using double quotes inside the replacement field
# keeps the f-string valid on Python versions before 3.12, where reusing the
# enclosing quote character is a SyntaxError.
print(f'A2: The standard deviation of the trips duration in January => {jan_df["duration"].std():.2f}')

A2: The standard deviation of the trips duration in January => 42.59


In [9]:
# Keep only trips whose duration is between 1 and 60 minutes, both bounds inclusive.
jan_df = jan_df.loc[jan_df['duration'].between(1, 60)]
feb_df = feb_df.loc[feb_df['duration'].between(1, 60)]

In [10]:
# A3: percentage of January rows remaining relative to the count stored in num_raw_jan_df.
print(f'A3: Fraction of the records left after drop the outliers => {(len(jan_df) / num_raw_jan_df * 100):.2f} %')

A3: Fraction of the records left after drop the outliers => 98.12 %


In [11]:
def prep_feature(df):
    """Turn the pick-up / drop-off location IDs into a list of feature dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    list of dict
        One ``{'PULocationID': str, 'DOLocationID': str}`` dict per row, in
        row order.
    """
    categorical = ['PULocationID', 'DOLocationID']
    # Cast on a selection instead of assigning back into `df`: the frames
    # passed in are boolean-filtered slices, so writing to them triggers
    # SettingWithCopyWarning and silently changes the caller's column dtypes.
    return df[categorical].astype(str).to_dict(orient='records')

dv = DictVectorizer()

# Build the feature dicts for both months first (January, then February).
X_train_dicts, X_validate_dicts = prep_feature(jan_df), prep_feature(feb_df)

# The vectorizer is fitted on the January dicts only; the February dicts are
# transformed with that already-fitted vectorizer.
X_train = dv.fit_transform(X_train_dicts)
X_validate = dv.transform(X_validate_dicts)

In [12]:
# A4: display the training feature matrix; the second entry of its shape is
# the number of feature columns produced by the DictVectorizer.
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 6018346 stored elements and shape (3009173, 515)>

In [13]:
# The regression target is the trip duration; pull it out as NumPy arrays.
target = 'duration'
y_train = jan_df[target].to_numpy()
y_validate = feb_df[target].to_numpy()

In [14]:
# Fit a linear regression with default settings on the January features and targets.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [15]:
def rmse(X_real, y_real, model=None):
    """Root-mean-squared error of a model's predictions on ``X_real``.

    Parameters
    ----------
    X_real : feature matrix passed to ``model.predict``.
    y_real : true target values to compare the predictions against.
    model : object with a ``predict`` method, optional
        Estimator to evaluate. When omitted, the module-level ``lr`` is used,
        so existing two-argument calls behave as before.

    Returns
    -------
    The square root of ``mean_squared_error(y_real, y_pred)``.
    """
    if model is None:
        # Fall back to the notebook-level regressor for backward compatibility.
        model = lr
    y_pred = model.predict(X_real)
    # No local named `rmse` here, so the function name is not shadowed inside
    # its own body.
    return np.sqrt(mean_squared_error(y_real, y_pred))

In [16]:
# A5 / A6: RMSE on the training and validation sets, one per output line.
print(
    f'A5: rmse on X_train => {rmse(X_train, y_train):.2f}',
    f'A6: rmse on X_validate => {rmse(X_validate, y_validate):.2f}',
    sep='\n',
)

A5: rmse on X_train => 7.65
A6: rmse on X_validate => 7.81
