<a href="https://colab.research.google.com/github/yunusserhat/mlops-zoomcamp/blob/main/01-intro/duration-prediction-solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Print the Python interpreter version via a shell command, for reproducibility.
!python -V

Python 3.10.12


In [2]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [3]:
# Raw yellow-taxi trip records for 2023-01; this frame feeds the training set below.
df_ = pd.read_parquet(
    path='https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
)

In [4]:
# Work on a deep copy so the downloaded frame `df_` stays untouched and the
# cells below can be re-run without fetching the data again.
df = df_.copy(deep=True)
df.shape

(3066766, 19)

In [5]:
# Trip duration in minutes: drop-off minus pick-up, converted with the
# vectorized `.dt.total_seconds()` accessor instead of a per-row Python lambda.
df['duration'] = (
    df.tpep_dropoff_datetime - df.tpep_pickup_datetime
).dt.total_seconds() / 60

In [6]:
# Sample standard deviation of trip duration in minutes (ddof=1 is the pandas default).
df['duration'].std(ddof=1)

42.59435124195458

In [7]:
# Row count before the outlier filter is applied.
before_outlier = len(df['duration'])
before_outlier

3066766

In [8]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# The explicit .copy() makes the filtered frame independent, so assigning
# columns on it later does not raise SettingWithCopyWarning.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
df['duration'].shape

(3009173,)

In [9]:
# Row count that remains after the outlier filter.
after_outlier = len(df['duration'])
after_outlier

3009173

In [10]:
 # Fraction of the original rows that remain after the outlier filter.
 after_outlier / before_outlier

0.9812202822125979

In [11]:
# Pick-up and drop-off zone IDs are used as the model's categorical features.
categorical = ['PULocationID', 'DOLocationID']

# Cast the IDs to strings so they are treated as categories, not numbers.
# `astype` with a column mapping returns a new frame, which avoids writing into
# a filtered slice (the source of SettingWithCopyWarning).
df = df.astype({col: str for col in categorical})

In [12]:
# Inspect the two categorical feature columns after the string cast.
df.loc[:, categorical]

Unnamed: 0,PULocationID,DOLocationID
0,161,141
1,43,237
2,48,238
3,138,7
4,107,79
...,...,...
3066761,107,48
3066762,112,75
3066763,114,239
3066764,230,79


In [13]:
# One dict per trip ({column: value}), the input format DictVectorizer consumes.
feature_frame = df[categorical]
train_dicts = feature_frame.to_dict('records')

In [14]:
# Fit the vectorizer on the training records and encode them in one pass.
# sparse=True (the default) yields a SciPy sparse matrix.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)

In [15]:
# Show the shape and sparsity summary of the encoded training matrix.
X_train

<3009173x515 sparse matrix of type '<class 'numpy.float64'>'
	with 6018346 stored elements in Compressed Sparse Row format>

In [16]:
# Regression target: trip duration in minutes.
target = 'duration'
y_train = df[target].to_numpy()

# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)

# In-sample predictions, used for the training RMSE.
y_pred = lr.predict(X_train)

In [17]:
# RMSE on the training data. The `squared=False` flag of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6, so the square root is taken
# explicitly; this works on both old and new scikit-learn releases.
rmse = mean_squared_error(y_train, y_pred) ** 0.5
rmse

7.649261929771859

In [18]:
# Raw yellow-taxi trip records for 2023-02; used as the validation set.
df_val = pd.read_parquet(
    path='https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
)

In [19]:
# Trip duration in minutes, computed with the vectorized `.dt.total_seconds()`
# accessor rather than a per-row Python lambda.
df_val['duration'] = (
    df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime
).dt.total_seconds() / 60

# Same 1-60 minute window as the training data. The .copy() makes the filtered
# frame independent so the column assignment below does not raise
# SettingWithCopyWarning.
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)].copy()

# Cast the location IDs to strings, matching the training features.
df_val[categorical] = df_val[categorical].astype(str)

In [20]:
# Encode the validation records with the vectorizer already fitted on the
# training data (transform only, no re-fit), so both matrices share columns.
val_dicts = df_val[categorical].to_dict('records')
X_val = dv.transform(val_dicts)

# Validation target: trip duration in minutes.
y_val = df_val['duration'].to_numpy()

In [21]:
# Predict on the validation matrix; this rebinds `y_pred`, which previously
# held the training predictions.
y_pred = lr.predict(X_val)

In [22]:
# RMSE on the validation data. The `squared=False` flag of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6, so the square root is taken
# explicitly; this works on both old and new scikit-learn releases.
rmse = mean_squared_error(y_val, y_pred) ** 0.5
rmse

7.811818933419717