In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

## Question 1

In [2]:
df_train = pd.read_parquet('../data/yellow_tripdata_2023-01.parquet')
df_val = pd.read_parquet('../data/yellow_tripdata_2023-02.parquet')

print(f"The data for January 2023 have {len(df_train.columns)} columns.")

The data for January 2023 have 19 columns.


## Question 2

In [3]:
def calculate_duration(df: pd.DataFrame) -> pd.DataFrame:
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
    return df

In [4]:
df_train = calculate_duration(df_train)
print(f"The standard deviation of the duration of the trips in January 2023 is {df_train.duration.std():.2f} minutes.")

The standard deviation of the duration of the trips in January 2023 is 42.59 minutes.


## Question 3

In [5]:
def clean_duration(df: pd.DataFrame) -> pd.DataFrame:
    return df[(df.duration >= 1) & (df.duration <= 60)]

In [6]:
df_train_clean = clean_duration(df_train)
rows_before_cleaning = df_train.shape[0]
rows_after_cleaning = df_train_clean.shape[0]
print(f"The fraction of the records left after cleaning is {rows_after_cleaning / rows_before_cleaning:.2f}.")

The fraction of the records left after cleaning is 0.98.


## Question 4

In [7]:
def prep_data_dict(df: pd.DataFrame, cat_cols: list[str]) -> pd.DataFrame:
    df.loc[:, cat_cols] = df[cat_cols].astype(str)
    data_dict = df[cat_cols].to_dict(orient='records')
    return data_dict

In [8]:
categorical = ['PULocationID', 'DOLocationID']
train_dicts = prep_data_dict(df=df_train_clean, cat_cols=categorical)

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
print(f"The number of features after one-hot encoding is {X_train.shape[1]}.")

  df.loc[:, cat_cols] = df[cat_cols].astype(str)
  df.loc[:, cat_cols] = df[cat_cols].astype(str)


The number of features after one-hot encoding is 515.


## Question 5

In [9]:
target = 'duration'
y_train = df_train_clean[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mse_train = root_mean_squared_error(y_train, y_pred)
print(f"The RMSE on the training data is {mse_train:.2f}")

The RMSE on the training data is 7.65


## Question 6

In [10]:
df_val = calculate_duration(df_val)
df_val_clean = clean_duration(df_val)
val_dicts = prep_data_dict(df=df_val_clean, cat_cols=categorical)

X_val = dv.transform(val_dicts)
y_val = df_val_clean[target].values
y_pred = lr.predict(X_val)

mse_val = root_mean_squared_error(y_val, y_pred)
print(f"The RMSE on the validation data is {mse_val:.2f}")

  df.loc[:, cat_cols] = df[cat_cols].astype(str)
  df.loc[:, cat_cols] = df[cat_cols].astype(str)


The RMSE on the validation data is 7.81
