In [1]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Download the fhv_tripdata parquet files for 2021-01 and 2021-02 into the
# current working directory.
# `-nc` (no-clobber) makes wget skip any file that already exists locally, so
# re-running this cell does not download the data again.
# NOTE(review): confirm this S3 location is still reachable before relying on
# a fresh download.
!wget -nc https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-01.parquet
!wget -nc https://nyc-tlc.s3.amazonaws.com/trip+data/fhv_tripdata_2021-02.parquet

File ‘fhv_tripdata_2021-01.parquet’ already there; not retrieving.

File ‘fhv_tripdata_2021-02.parquet’ already there; not retrieving.



In [4]:
# Read the data for January. How many records are there?
# The wget cell saves the file into the current working directory, so the
# path must not carry a "data/" prefix; otherwise a fresh run fails with
# FileNotFoundError.
parquet_path = "fhv_tripdata_2021-01.parquet"
df = pd.read_parquet(parquet_path)
print(f"# of Records {len(df)}")

# of Records 1154112


In [5]:
# What's the average trip duration in January?

# Duration = drop-off minus pickup timestamp, converted to minutes.
# `.dt.total_seconds()` is vectorized, so no Python lambda runs per row.
df["duration"] = (df.dropOff_datetime - df.pickup_datetime).dt.total_seconds() / 60
# The column holds seconds / 60, so the value is reported in minutes.
print(f"Average trip duration in January is {df.duration.mean():.2f} minutes")

average trip duration in January 19.17 secs


In [6]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` detaches the result from the unfiltered frame, so the column
# assignments in later cells write to an independent DataFrame rather than to
# a slice (which would trigger SettingWithCopyWarning).
df = df[df.duration.between(1, 60)].copy()

In [7]:
# The features used for the model are the pickup and dropoff location IDs.
# Both columns contain many missing values, which are replaced with -1.
# Question: what fraction of pickup location IDs is missing, i.e. the
# fraction of -1 values once the NAs have been filled?

# Count the NAs before filling; this equals the number of -1 values afterwards.
n_missing_pickups = df.PUlocationID.isna().sum()
print(f"Missing Values: %{n_missing_pickups / len(df) * 100:.2f}")

df[["PUlocationID", "DOlocationID"]] = df[["PUlocationID", "DOlocationID"]].fillna(-1)

Missing Values %83.53


In [8]:
# Model inputs: the two location-ID columns, both cast to the pandas
# "string" dtype in a single step.
features_df = df[["PUlocationID", "DOlocationID"]].astype("string")

# Target: trip duration, kept as a one-column frame, so train_y is 2-D (n, 1).
labels_df = df[["duration"]]
train_y = labels_df.to_numpy()

In [9]:
# Dimensionality after OHE
# Turn each row into a {column: value} dict, the input format DictVectorizer expects.
features_dict = features_df.to_dict(orient='records')

dv = DictVectorizer()
train_X = dv.fit(features_dict).transform(features_dict)

ohe_dim = train_X.shape[1]  # number of columns in the encoded matrix
print(f"Dimensionality after OHE: {ohe_dim}")

Dimensionality after OHE: 525


In [10]:
# Input files: the 2021-01 file is used for training, the 2021-02 file for testing.
train_parquet_path, test_parquet_path = (
    "fhv_tripdata_2021-01.parquet",
    "fhv_tripdata_2021-02.parquet",
)

def get_features_df(parquet_path):
    """Load a trip parquet file and return model features and labels.

    Steps:
      1. Read the file and compute ``duration`` in minutes as
         ``dropOff_datetime - pickup_datetime``.
      2. Keep only trips lasting between 1 and 60 minutes (inclusive).
      3. Replace missing ``PUlocationID`` / ``DOlocationID`` values with -1
         and cast both columns to the pandas ``"string"`` dtype.

    Parameters
    ----------
    parquet_path : str or path-like
        Location of the parquet file; passed straight to ``pd.read_parquet``.

    Returns
    -------
    features_df : pandas.DataFrame
        Columns ``PUlocationID`` and ``DOlocationID`` as strings.
    labels_df : pandas.DataFrame
        Single column ``duration`` (minutes), with the same index as
        ``features_df``.
    """
    trips = pd.read_parquet(parquet_path)

    # Vectorized timedelta -> minutes; avoids a Python lambda call per row.
    duration_min = (
        trips["dropOff_datetime"] - trips["pickup_datetime"]
    ).dt.total_seconds() / 60
    trips = trips.assign(duration=duration_min)

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60.
    trips = trips[trips["duration"].between(1, 60)]

    # Build new frames instead of assigning columns onto a filtered slice,
    # which would raise SettingWithCopyWarning.
    features_df = (
        trips[["PUlocationID", "DOlocationID"]]
        .fillna(-1)
        .astype("string")
    )
    labels_df = trips[["duration"]]

    return features_df, labels_df
    
# Apply identical preprocessing to the training and test files.
train_features_df, train_labels_df = get_features_df(train_parquet_path)
test_features_df, test_labels_df = get_features_df(test_parquet_path)

# DictVectorizer consumes one {column: value} dict per row.
train_features_dict = train_features_df.to_dict(orient='records')
test_features_dict = test_features_df.to_dict(orient='records')

# Fit the vectorizer on the training data only, so the test data is encoded
# with the vocabulary learned from training.
dv = DictVectorizer()
dv.fit(train_features_dict)

train_X = dv.transform(train_features_dict)
train_y = train_labels_df.values

test_X = dv.transform(test_features_dict)
test_y = test_labels_df.values

lr = LinearRegression()
lr.fit(train_X, train_y)

pred_train_y = lr.predict(train_X)
pred_test_y = lr.predict(test_X)

# RMSE = sqrt(MSE). The `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6; taking the square root
# explicitly gives the same value on every version.
rmse_train = mean_squared_error(train_y, pred_train_y) ** 0.5
print(f"RMSE Train: {rmse_train:.2f}")

rmse_test = mean_squared_error(test_y, pred_test_y) ** 0.5
print(f"RMSE Test: {rmse_test:.2f}")


RMSE Train: 10.53
RMSE Test: 11.01
