In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import pickle
from sklearn.metrics import r2_score

In [2]:
categorical=['PULocationID','DOLocationID']

def read_dataframe(filename):
    df=pd.read_parquet(filename)
    df['duration']=df.tpep_dropoff_datetime -df.tpep_pickup_datetime
    df.duration=df.duration.apply(lambda td: td.total_seconds()/60)
    df=df[(df.duration>=1) & (df.duration<=60)]
    df[categorical]=df[categorical].astype(str)
    
    return df

## Q1. Downloading the data
#### We'll use the same NYC taxi dataset, but instead of "Green Taxi Trip Records", we'll use "Yellow Taxi Trip Records".

#### Download the data for January and February 2022.

#### Read the data for January. How many columns are there?

In [3]:
hw_df1= pd.read_parquet('data/yellow_tripdata_2022-01.parquet')

In [4]:
col_names =hw_df1.columns

In [5]:
len(col_names)

19

In [6]:
hw_df1.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


## Q2. Computing duration
#### Now let's compute the duration variable. It should contain the duration of a ride in minutes.

#### What's the standard deviation of the trips duration in January?

In [7]:
hw_df1['duration']=hw_df1.tpep_dropoff_datetime -hw_df1.tpep_pickup_datetime
hw_df1.duration=hw_df1.duration.apply(lambda td: td.total_seconds()/60)

In [8]:
hw_df1['duration'].head()

0    17.816667
1     8.400000
2     8.966667
3    10.033333
4    37.533333
Name: duration, dtype: float64

In [9]:
round(hw_df1.duration.describe(),2)

count    2463931.00
mean          14.21
std           46.45
min        -3442.40
25%            6.32
50%           10.18
75%           16.17
max         8513.18
Name: duration, dtype: float64

##### Standard Deviation is  46.45 mins

## Q3. Dropping outliers
#### Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

#### What fraction of the records left after you dropped the outliers?

In [10]:
hw_df2=hw_df1[(hw_df1.duration>=1) & (hw_df1.duration<=60)]

In [11]:
hw_df2.duration.count()

2421440

In [12]:
hw_df1.duration.count()

2463931

In [13]:
round(2421440/2463931 *100,0)

98.0

##### 98%

## Q4. One-hot encoding
#### Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

#### Turn the dataframe into a list of dictionaries
#### Fit a dictionary vectorizer
#### Get a feature matrix from it
#### What's the dimensionality of this matrix (number of columns)?

In [14]:
catagorical=['PULocationID','DOLocationID']

In [15]:
hw_df2.isnull().sum()

VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          66959
trip_distance                0
RatecodeID               66959
store_and_fwd_flag       66959
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     66959
airport_fee              66959
duration                     0
dtype: int64

In [16]:
hw_df2[catagorical]=hw_df2[catagorical].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hw_df2[catagorical]=hw_df2[catagorical].astype('str')


In [17]:
hw_dicts=hw_df2[catagorical].to_dict(orient='records')

In [18]:
hw_dicts[:10]

[{'PULocationID': '142', 'DOLocationID': '236'},
 {'PULocationID': '236', 'DOLocationID': '42'},
 {'PULocationID': '166', 'DOLocationID': '166'},
 {'PULocationID': '114', 'DOLocationID': '68'},
 {'PULocationID': '68', 'DOLocationID': '163'},
 {'PULocationID': '138', 'DOLocationID': '161'},
 {'PULocationID': '233', 'DOLocationID': '87'},
 {'PULocationID': '238', 'DOLocationID': '152'},
 {'PULocationID': '166', 'DOLocationID': '236'},
 {'PULocationID': '236', 'DOLocationID': '141'}]

In [19]:
dv=DictVectorizer()

In [20]:
hw_df_transform= dv.fit_transform(hw_dicts)

In [21]:
hw_df_transform.shape

(2421440, 515)

In [22]:
len(dv.feature_names_)

515

##### 515

## Q5. Training a model
#### Now let's use the feature matrix from the previous step to train a model.

#### Train a plain linear regression model with default parameters
#### Calculate the RMSE of the model on the training data
#### What's the RMSE on train?

In [23]:
X_train = read_dataframe('data/yellow_tripdata_2022-01.parquet')

In [24]:
y_train=X_train.duration.values

In [25]:
train_dicts=X_train[catagorical].to_dict(orient='records')

In [26]:
X_train=dv.fit_transform(train_dicts)

In [27]:
X_train.shape

(2421440, 515)

In [28]:
lr= LinearRegression()

In [29]:
lr.fit(X_train,y_train)

LinearRegression()

In [30]:
y_pred=lr.predict(X_train)

In [31]:
round(mean_squared_error(y_train,y_pred,squared=False),2)

6.99

#### RMSE on Train 6.99

## Q6. Evaluating the model
#### Now let's apply this model to the validation dataset (February 2022).

#### What's the RMSE on validation?

In [32]:
X_val= read_dataframe('data/yellow_tripdata_2022-02.parquet')

In [33]:
y_val=X_val.duration.values

In [34]:
val_dicts=X_val[catagorical].to_dict(orient='records')

In [35]:
X_val=dv.fit_transform(val_dicts)

In [36]:
X_val.shape

(2918187, 514)

In [37]:
vlr= LinearRegression()

In [38]:
vlr.fit(X_val,y_val)

LinearRegression()

In [39]:
y_val_pred=vlr.predict(X_val)

In [40]:
round(mean_squared_error(y_val,y_val_pred,squared=False),2)

7.64

#### RMSE for validation dataset is 7.64