Homework - [link](https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/cohorts/2024/01-intro/homework.md)

In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer

In [3]:
from sklearn.linear_model import LinearRegression

In [4]:
from sklearn.metrics import root_mean_squared_error

In [5]:
from sklearn.linear_model import Lasso

## Q1. Downloading the data

In [6]:
for date in ['2023-01', '2023-02']:
  data_url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{date}.parquet'
  data_path = "/".join(["data", data_url.split('/')[-1]])
  !mkdir -p 'data'
  !curl -s -S $data_url -o $data_path

In [7]:
# Read the January 2023 trip records; this frame is used as the training set.
january_trips_file = './data/yellow_tripdata_2023-01.parquet'
df_training = pd.read_parquet(january_trips_file)
df_training

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.30,1.00,0.5,0.00,0.0,1.0,14.30,2.5,0.00
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.10,1.0,N,43,237,1,7.90,1.00,0.5,4.00,0.0,1.0,16.90,2.5,0.00
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.90,1.00,0.5,15.00,0.0,1.0,34.90,2.5,0.00
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.90,1.0,N,138,7,1,12.10,7.25,0.5,0.00,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.40,1.00,0.5,3.28,0.0,1.0,19.68,2.5,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3066761,2,2023-01-31 23:58:34,2023-02-01 00:12:33,,3.05,,,107,48,0,15.80,0.00,0.5,3.96,0.0,1.0,23.76,,
3066762,2,2023-01-31 23:31:09,2023-01-31 23:50:36,,5.80,,,112,75,0,22.43,0.00,0.5,2.64,0.0,1.0,29.07,,
3066763,2,2023-01-31 23:01:05,2023-01-31 23:25:36,,4.67,,,114,239,0,17.61,0.00,0.5,5.32,0.0,1.0,26.93,,
3066764,2,2023-01-31 23:40:00,2023-01-31 23:53:00,,3.15,,,230,79,0,18.15,0.00,0.5,4.43,0.0,1.0,26.58,,


In [8]:
# Number of columns in the training frame (second entry of its shape).
df_training.shape[1]

19

## Q2. Computing duration

In [9]:
# Trip duration in minutes, derived from the pickup and dropoff timestamps.
# The vectorized `.dt` accessor avoids calling a Python lambda once per row.
training_trip_length = df_training.tpep_dropoff_datetime - df_training.tpep_pickup_datetime
df_training['duration'] = training_trip_length.dt.total_seconds() / 60
# NOTE: rows with a zero or negative duration are removed here, before the
# standard deviation is computed, so the figure below excludes those rows.
df_training = df_training[df_training.duration > 0]
df_training.duration.std()

42.601071064370515

## Q3. Dropping outliers

In [10]:
# Keep trips lasting between 1 and 60 minutes (both bounds inclusive).
# The mask is built once and reused for both the fraction and the filter.
duration_in_range = (df_training.duration >= 1) & (df_training.duration <= 60)
# Fraction of the rows currently in df_training that pass the filter.
wo_outliers = duration_in_range.mean()
# .copy() detaches the filtered frame from its parent so that later column
# assignments on df_training do not raise SettingWithCopyWarning.
df_training = df_training[duration_in_range].copy()
wo_outliers

0.9815790804219015

## Q4. One-hot encoding

Convert the pickup and dropoff location ID columns of the DataFrame into a list of dictionaries (one per row)

In [11]:
# Columns treated as categorical features.
categorical = ['PULocationID', 'DOLocationID']
# Cast the location IDs to strings, write them back to the frame, and build
# one {column: value} record per row from the same string-typed columns.
location_ids = df_training[categorical].astype(str)
df_training[categorical] = location_ids
training_dicts = location_ids.to_dict(orient='records')

Transform the list of dictionaries into a feature matrix of shape (n_samples, n_features) and show the number of features.

In [12]:
# Fit the vectorizer on the training records and encode them in one pass.
dv = DictVectorizer()
x_training = dv.fit_transform(training_dicts)
# The second entry of the matrix shape is the number of feature columns.
x_training.shape[1]

515

## Q5. Training a model

Use vectorized location IDs as training data and durations as target data

In [13]:
# Name of the column the model is trained to predict.
target = "duration"
# Target values as a NumPy array, in the same row order as df_training.
y_training = df_training[target].to_numpy()
y_training

array([ 8.43333333,  6.31666667, 12.75      , ..., 24.51666667,
       13.        , 14.4       ])

In [14]:
# Fit a linear regression on the training matrix; fit() returns the
# estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(x_training, y_training)
# Predictions on the same rows the model was trained on.
y_prediction = lr.predict(x_training)
type(y_prediction)

numpy.ndarray

In [15]:
# Intercept term of the fitted LinearRegression model `lr`.
lr.intercept_

23.197062356725514

Get RMSE for the training data

In [16]:
# RMSE between the true training durations and the model's predictions.
root_mean_squared_error(y_true=y_training, y_pred=y_prediction)

7.649261929201487

## Q6. Evaluating the model

In [17]:
# Load the February 2023 trips, used as the validation set.
df_validation = pd.read_parquet('./data/yellow_tripdata_2023-02.parquet')
# Trip duration in minutes; the vectorized `.dt` accessor avoids calling a
# Python lambda once per row.
validation_trip_length = df_validation.tpep_dropoff_datetime - df_validation.tpep_pickup_datetime
df_validation['duration'] = validation_trip_length.dt.total_seconds() / 60
# Keep trips lasting between 1 and 60 minutes (inclusive). .copy() detaches
# the result from the unfiltered frame so the column assignment below does
# not raise SettingWithCopyWarning.
validation_in_range = (df_validation.duration >= 1) & (df_validation.duration <= 60)
df_validation = df_validation[validation_in_range].copy()
# Cast the categorical columns to strings, as is done for the training frame.
df_validation[categorical] = df_validation[categorical].astype(str)
df_validation

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.30,1.0,N,142,163,2,4.40,3.50,0.5,0.00,0.0,1.0,9.40,2.5,0.00,1.683333
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.80,1.0,N,132,26,1,70.90,2.25,0.5,0.00,0.0,1.0,74.65,0.0,1.25,32.083333
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,17.00,1.00,0.5,3.30,0.0,1.0,25.30,2.5,0.00,13.300000
5,1,2023-02-01 00:52:40,2023-02-01 01:07:18,1.0,5.10,1.0,N,148,236,1,21.90,3.50,0.5,5.35,0.0,1.0,32.25,2.5,0.00,14.633333
6,1,2023-02-01 00:12:39,2023-02-01 00:40:36,1.0,8.90,1.0,N,137,244,1,41.50,3.50,0.5,3.50,0.0,1.0,50.00,2.5,0.00,27.950000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2913950,2,2023-02-28 23:46:00,2023-03-01 00:05:00,,4.65,,,249,140,0,20.22,0.00,0.5,4.84,0.0,1.0,29.06,,,19.000000
2913951,2,2023-02-28 23:26:02,2023-02-28 23:37:10,,2.47,,,186,79,0,13.66,0.00,0.5,2.65,0.0,1.0,20.31,,,11.133333
2913952,2,2023-02-28 23:24:00,2023-02-28 23:38:00,,3.49,,,158,143,0,17.64,0.00,0.5,0.00,0.0,1.0,21.64,,,14.000000
2913953,2,2023-02-28 23:03:00,2023-02-28 23:10:00,,2.13,,,79,162,0,13.56,0.00,0.5,2.63,0.0,1.0,20.19,,,7.000000


In [18]:
# True durations for the validation rows.
y_validation = df_validation[target].to_numpy()

# Encode the validation records with the vectorizer fitted on the training
# data (transform only, no refit), then predict with the trained model.
validation_dicts = df_validation[categorical].to_dict(orient='records')
x_validation = dv.transform(validation_dicts)
validation_prediction = lr.predict(x_validation)

In [19]:
# RMSE between the true validation durations and the model's predictions.
root_mean_squared_error(y_true=y_validation, y_pred=validation_prediction)

7.811819793542861