In [1]:
import pathlib

import pandas as pd
import sklearn.feature_extraction
import sklearn.linear_model
import sklearn.metrics

## Q1. Downloading the data

In [2]:
data_dir = pathlib.Path('data')
data_dir.mkdir(exist_ok=True)

In [3]:
!wget -q https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -O {data_dir}/yellow_tripdata_2023-01.parquet
!wget -q https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet -O {data_dir}/yellow_tripdata_2023-02.parquet

In [4]:
jan_trips = pd.read_parquet(data_dir / 'yellow_tripdata_2023-01.parquet')
print(f'Number of columns in initial data: {len(jan_trips.columns)}')

Number of columns in initial data: 19


### Q2. Computing duration

In [5]:
def calc_duration(df: pd.DataFrame) -> None:
    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.seconds / 60

calc_duration(jan_trips)

print(f'Standard deviation of trip duration in January: {jan_trips['duration'].std():.2f} minutes')

Standard deviation of trip duration in January: 41.63 minutes


### Q3. Dropping outliers

In [6]:
def filter_outliers(df: pd.DataFrame) -> pd.DataFrame:
    return df[(1 <= df['duration']) & (df['duration'] <= 60)]

num_initial_entries = len(jan_trips)

jan_trips = filter_outliers(jan_trips)

num_remaining_entries = len(jan_trips)
print(f'Percentage of remaining entries: {100 * num_remaining_entries / num_initial_entries:.1f}%')

Percentage of remaining entries: 98.1%


### Q4. One-hot encoding

In [7]:
feature_cols = ['PULocationID', 'DOLocationID']
target_col = 'duration'
selected_cols = feature_cols + [target_col]


def prepare_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df[selected_cols]
    for col in feature_cols:
        df.loc[:, col] = df[col].astype(str)
    return df

jan_trips = prepare_features(jan_trips)
jan_features_dict = jan_trips[feature_cols].to_dict('records')
vectorizer = sklearn.feature_extraction.DictVectorizer()
vectorizer.fit(jan_features_dict)

jan_features = vectorizer.transform(jan_features_dict)
print(f'Feature matrix shape: {jan_features.shape}')

Feature matrix shape: (3009176, 515)


### Q5. Training a model

In [8]:
linear_model = sklearn.linear_model.LinearRegression()
linear_model.fit(jan_features, jan_trips[target_col])

jan_rmse = sklearn.metrics.root_mean_squared_error(jan_trips[target_col], linear_model.predict(jan_features))
print(f'RMSE on January data: {jan_rmse:.2f}')

RMSE on January data: 7.65


### Q6. Evaluating the model

In [9]:
feb_trips = pd.read_parquet(data_dir / 'yellow_tripdata_2023-02.parquet')

calc_duration(feb_trips)
feb_trips = filter_outliers(feb_trips)
feb_trips = prepare_features(feb_trips)

feb_features = vectorizer.transform(feb_trips[feature_cols].to_dict('records'))

feb_rmse = sklearn.metrics.root_mean_squared_error(feb_trips[target_col], linear_model.predict(feb_features))
print(f'RMSE on February data: {feb_rmse:.2f}')

RMSE on February data: 7.81
