In [1]:
import typing as tp

import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
JAN_DATA = "data/fhv_tripdata_2021-01.parquet"
FEB_DATA = "data/fhv_tripdata_2021-02.parquet"


jan_data = pd.read_parquet(JAN_DATA)
feb_data = pd.read_parquet(FEB_DATA)

In [3]:
def prepare_dataframe(raw_data: pd.DataFrame) -> pd.DataFrame:
    data = raw_data.copy()
    
    # Translate data columns to datetimes.
    data["pickup_datetime"] = pd.to_datetime(data["pickup_datetime"])
    data["dropOff_datetime"] = pd.to_datetime(data["dropOff_datetime"])
    
    # Compute duration time in minutes.
    data["duration"] = (data["dropOff_datetime"] - data["pickup_datetime"]).dt.total_seconds() / 60
    
    # Filter data by duration.
    data = data.query("duration >= 1 and duration <= 60")
    assert (data["duration"] >= 1).all() and (data["duration"] <= 60).all()
    
    # Fill NaN's.
    data["PUlocationID"] = data["PUlocationID"].fillna(-1).astype(int).astype(str)
    data["DOlocationID"] = data["DOlocationID"].fillna(-1).astype(int).astype(str)
    assert not data["DOlocationID"].isna().all() and not data["PUlocationID"].isna().all()
    return data

In [4]:
jan_data = prepare_dataframe(jan_data)
feb_data = prepare_dataframe(feb_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["PUlocationID"] = data["PUlocationID"].fillna(-1).astype(int).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["DOlocationID"] = data["DOlocationID"].fillna(-1).astype(int).astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["PUlocationID"] = data["PUlocationID"].fil

In [5]:
from sklearn.feature_extraction import DictVectorizer


def fit_vectorizer(data: pd.DataFrame, categorical_features: list[str]) -> DictVectorizer:
    dv = DictVectorizer()
    train_dicts = data[categorical_features].to_dict(orient="records")
    dv.fit(train_dicts)
    return dv


def get_x_and_y(
    data: pd.DataFrame,
    categorical_features: list[str],
    vectorizer: DictVectorizer,
) -> tuple[np.ndarray, np.ndarray]:
    dicts = data[categorical_features].to_dict(orient="records")
    X = vectorizer.transform(dicts)
    print("Design matrix dims:", X.shape)
    y = data["duration"].values
    return X, y


vectorizer = fit_vectorizer(jan_data, categorical_features=["DOlocationID", "PUlocationID"])
X_train, y_train = get_x_and_y(jan_data, categorical_features=["DOlocationID", "PUlocationID"], vectorizer=vectorizer)
X_test, y_test = get_x_and_y(feb_data, categorical_features=["DOlocationID", "PUlocationID"], vectorizer=vectorizer)

Design matrix dims: (1109826, 525)
Design matrix dims: (990113, 525)


In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def train_and_validate(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray,
    y_val: np.ndarray
) -> tuple[float, float]:
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    train_score = mean_squared_error(y_train, lr.predict(X_train), squared=False)
    val_score = mean_squared_error(y_val, lr.predict(X_val), squared=False)
    return train_score, val_score


print("Train: {}, val: {}".format(*train_and_validate(X_train, y_train, X_test, y_test)))

Train: 10.52851942175949, val: 11.014286819428976
