In [1]:
!python -V

Python 3.7.6


In [None]:
!pip install pyarrow

In [2]:
import pandas as pd

In [3]:
import pickle

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [6]:
import os
os.getcwd()

'/home/cablegirl/Documents/Courses/mlops/mlops-zoomcamp/01-intro'

# Q1. Downloading the data

We'll use the same NYC taxi dataset, but instead of "Green Taxi Trip Records", we'll use "For-Hire Vehicle Trip Records"

Download the data for January and February 2021

Note that you need "For-Hire Vehicle Trip Records", not "High Volume For-Hire Vehicle Trip Records".

Read the data for January. How many records are there?

    1054112
    1154112
    1254112
    1354112


In [7]:
df = pd.read_parquet('../data/fhv_tripdata_2021-01.parquet')
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037
...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285


In [8]:
print(f'There are {df.shape[0]} records')

There are 1154112 records


# Q2. Computing duration

Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the average trip duration in January?

    15.16
    19.16
    24.16
    29.16


In [9]:
df['duration'] = df.dropOff_datetime - df.pickup_datetime
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [10]:
print(f'The average trip duration in January was {df.duration.mean()} minutes')

The average trip duration in January was 19.1672240937939 minutes


# Data preparation

Check the distribution of the duration variable. There are some outliners.

Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

How many records did you drop?

In [11]:
df2 = df[(df.duration >= 1) & (df.duration <= 60)].copy()

In [12]:
print(f'{df.shape[0] - df2.shape[0]} records were removed')

44286 records were removed


# Q3. Missing values# 

The features we'll user for our model are the pickup and dropoff location IDs.

But they have a lot of missing values there. Let's replace them with "-1"

What's the factions of missing values for the pickup location ID? (Or the fraction of "-1"s after you filled the NAs)

    53%
    63%
    73%
    83%


In [13]:
print(f'There is {df2.PUlocationID.isna().sum()/df2.shape[0] * 100} % of missing values for the pickup location ID')

There is 83.52732770722618 % of missing values for the pickup location ID


In [14]:
#Replace NaN by -1 for the pickup and dropoff location IDs
df2['PUlocationID'] = df2.PUlocationID.fillna(value=-1)
df2['DOlocationID'] = df2.DOlocationID.fillna(value=-1)
df2

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,-1.0,-1.0,,B00009,17.000000
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,-1.0,-1.0,,B00009,17.000000
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,-1.0,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,-1.0,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,-1.0,71.0,,B00037,9.050000
...,...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266,8.750000
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,,57.600000
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285,16.200000
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285,19.433333


# Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

    Turn the dataframe into a list of dictionaries
    Fit a dictionary vectorizer
    Get a feature matrix from it

What's the dimensionality of this matrix? (The number of columns)

    2
    152
    352
    525
    725


In [19]:
categorical = ['PUlocationID', 'DOlocationID']

In [20]:
df2[categorical] = df2[categorical].astype(str)
train_dicts = df2[categorical].to_dict(orient='records')
train_dicts

[{'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '71.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '91.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '37.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '89.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '177.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '225.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '63.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '67.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '22.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '14.0'},
 {'PUlocationID': '-1.0', 'DO

In [21]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [22]:
print(f'The feature matrix has {X_train.shape[1]} columns.')

The feature matrix has 525 columns.


# Q5. Training a model

Now let's use the feature matrix from the previous step to train a model.

    Train a plain linear regression model with default parameters
    Calculate the RMSE of the model on the training data

What's the RMSE on train?

    5.52
    10.52
    15.52
    20.52


In [23]:
target = 'duration'
y_train = df2[target].values

In [24]:
#Train the linear regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
#Get predicions
y_pred = lr.predict(X_train)

In [26]:
#Get model error (RSME)
print(f'The RMSE on train is {mean_squared_error(y_train, y_pred, squared=False)}')

The RMSE on train is 10.528519107212144


# Q6. Evaluating the model

Now let's apply this model to the validation dataset.

What's the RMSE on validation?

    6.01
    11.01
    16.01
    21.01


In [27]:
#Load and apply transformations to the validation set
df_val = pd.read_parquet('../data/fhv_tripdata_2021-02.parquet')

df_val['duration'] = df_val.dropOff_datetime - df_val.pickup_datetime
df_val.duration = df_val.duration.apply(lambda td: td.total_seconds() / 60)
df2_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)].copy()

#Replace NaN by -1 for the pickup and dropoff location IDs
df2_val['PUlocationID'] = df2_val.PUlocationID.fillna(value=-1)
df2_val['DOlocationID'] = df2_val.DOlocationID.fillna(value=-1)

df2_val[categorical] = df2_val[categorical].astype(str)
val_dicts = df2_val[categorical].to_dict(orient='records')

X_val = dv.transform(val_dicts)
y_val = df2_val[target].values

In [28]:
print(f'The validation feature matrix has {X_val.shape[1]} columns.')

The validation feature matrix has 525 columns.


In [31]:
#Get predictions for val dataset
y_val_pred = lr.predict(X_val)

In [32]:
#Get model error (RSME)
print(f'The RMSE on validation is {mean_squared_error(y_val, y_val_pred, squared=False)}')

The RMSE on validation is 11.014283226749118
