In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [6]:
df = pd.read_parquet('../data/fhv_tripdata_2021-01.parquet')
df.sample(5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
201975,B00149,2021-01-07 03:50:28,2021-01-07 03:54:18,,62.0,,B00149
1110626,B02133,2021-01-30 13:29:27,2021-01-30 13:47:53,,,,B02788
649149,B02782,2021-01-19 05:35:00,2021-01-19 05:53:00,,,,B02782
152383,B00987,2021-01-05 18:05:00,2021-01-05 18:19:00,258.0,63.0,,B00987
590371,B00882,2021-01-17 05:02:08,2021-01-17 05:03:57,,166.0,,B00882


## Data Dictionary for FHV trip records:

Taken from https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_fhv.pdf

* Dispatching_base_num: The TLC Base License Number of the base that dispatched the trip
* Pickup_datetime: The date and time of the trip pick-up
* DropOff_datetime: The date and time of the trip dropoff
* PULocationID: TLC Taxi Zone in which the trip began
* DOLocationID: TLC Taxi Zone in which the trip ended
* SR_Flag: Indicates if the trip was a part of a shared ride chain offered by a
High Volume FHV company (e.g. Uber Pool, Lyft Line). For shared
trips, the value is 1. For non-shared rides, this field is null.
NOTE: For most High Volume FHV companies, only shared rides that
were requested AND matched to another shared-ride request over
the course of the journey are flagged. However, Lyft (base license
numbers B02510 + B02844) also flags rides for which a shared ride
was requested but another passenger was not successfully matched
to share the trip—therefore, trips records with SR_Flag=1 from those
two bases could indicate EITHER a first trip in a shared trip chain OR
a trip for which a shared ride was requested but never matched.
Users should anticipate an overcount of successfully shared trips
completed by Lyft.

## Q1 - download the data


In [7]:
num_records_jan = len(df)
print(f'January has {num_records_jan} records')

January has 1154112 records


## Q2 - computing duration

In [15]:
df['duration'] = df['dropOff_datetime'] - df['pickup_datetime']
# the lambda and apply lets us use the .total_seconds() method of the
# datetime object
df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)

jan_duration_avg = df['duration'].mean()
print(f'Average trip duration in jan: {jan_duration_avg}')

Average trip duration in jan: 19.1672240937939


In [17]:
# Data Prep - remove and count outliers
df['valid_trip'] = (df['duration'] >=1) & (df['duration'] <= 60)

num_outliers = len(df['valid_trip']) - df['valid_trip'].sum()
print(f'dropped outliers: {num_outliers}')

dropped outliers: 44286


## Q3 - missing values
 
Replace missing pickup/dropoff loc IDs with -1. How much are missing?

In [18]:
# fraction of nan in PUlocationID
pct_nan = df['PUlocationID'].isna().sum() / len(df['PUlocationID'])
print(pct_nan)

0.8303067639882438


In [19]:
df['PUlocationID_filled'] = df['PUlocationID'].fillna(value=-1)
df['DOlocationID_filled'] = df['DOlocationID'].fillna(value=-1)

## Q4 - one-hot encoding

* turn df into list of dicts
* fit dict vectorizer
* extract feature matrix

What is the dimensionality of the feature matrix?

In [24]:
categorical = ['PUlocationID_filled', 'DOlocationID_filled']
df[categorical] = df[categorical].astype(str)
df['PU_DO'] = df['PUlocationID_filled'] + '_' + df['DOlocationID_filled']

# numeric = ['trip_distance']
dv = DictVectorizer()
train_dicts = df[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)


In [25]:
n_dim = X_train.shape
print(n_dim)

(1154112, 525)


## Q5 training the model

Use linear regression to predict duration and calculate RMSE (root mean squared error)

In [26]:
target = 'duration'
y_train = df[target].values

# default params
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)
rmse = mean_squared_error(y_train, y_pred, squared=False)

In [27]:
print(rmse)

398.5442107796931


In [36]:
def read_dataframe(filename):
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)

        df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
        df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)

    df['duration'] = df['dropOff_datetime'] - df['pickup_datetime']
    # the lambda and apply lets us use the .total_seconds() method of the
    # datetime object
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds() / 60)

    # remove outliers
    df = df[(df.duration >= 1) & (df.duration <= 60)]

    # handle nans
    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].astype(str)
    df['PUlocationID_filled'] = df['PUlocationID'].fillna(value=-1).astype(str)
    df['DOlocationID_filled'] = df['DOlocationID'].fillna(value=-1).astype(str)

    df['PU_DO'] = df['PUlocationID_filled'] + '_' + df['DOlocationID_filled']
    
    use_columns = ['PU_DO', 'duration']
    df = df[use_columns]
    return df

In [37]:
df_train = read_dataframe('../data/fhv_tripdata_2021-01.parquet')
df_val = read_dataframe('../data/fhv_tripdata_2021-02.parquet')

df_train.head()

Unnamed: 0,PU_DO,duration
0,nan_nan,17.0
1,nan_nan,17.0
3,nan_72.0,8.283333
4,nan_61.0,15.216667
5,nan_71.0,9.05


In [39]:
feature = ['PU_DO']
dv = DictVectorizer()

train_dicts = df_train[feature].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[feature].to_dict(orient='records')
X_val = dv.transform(val_dicts)

y_train = df_train[target].values
y_val = df_val[target].values

In [40]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

10.596293717089068