In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

!python -V

Python 3.9.12


In [2]:
def load_data(filename):
    """Read a parquet file into a DataFrame.

    Parameters
    ----------
    filename : path of the parquet file, handed directly to ``pd.read_parquet``.

    Returns
    -------
    The DataFrame returned by ``pd.read_parquet``.
    """
    return pd.read_parquet(filename)

def compute_duration(df):
    """Add a ``duration`` column holding each ride's length in minutes.

    The duration is ``dropOff_datetime - pickup_datetime`` converted to minutes.

    Parameters
    ----------
    df : DataFrame with ``dropOff_datetime`` and ``pickup_datetime`` columns.

    Returns
    -------
    The same ``df`` object that was passed in. Note that the ``duration``
    column is written onto the input frame itself (it is modified in place).
    """
    elapsed = df['dropOff_datetime'] - df['pickup_datetime']
    # Vectorised ``.dt`` accessor instead of a per-row ``apply``: it avoids a
    # Python-level call per record and always yields a float column,
    # including when the frame is empty.
    df['duration'] = elapsed.dt.total_seconds() / 60

    return df

def preprocess_data(df_raw):
    """Filter out duration outliers and encode the location IDs as strings.

    Parameters
    ----------
    df_raw : DataFrame with ``duration``, ``PUlocationID`` and
        ``DOlocationID`` columns. It is not modified.

    Returns
    -------
    A new DataFrame containing only the records whose duration is between
    1 and 60 minutes (inclusive), with missing location IDs replaced by -1
    and both location columns converted to strings (categorical variables).
    """
    # Excluding outliers: keep only durations between 1 and 60 minutes (inclusive).
    in_range = (df_raw['duration'] >= 1) & (df_raw['duration'] <= 60)

    # Copy AFTER filtering: only the surviving rows are duplicated, and the
    # column assignments below act on an independent frame rather than on a
    # filtered slice (which triggers SettingWithCopyWarning).
    df = df_raw[in_range].copy()

    for col in ('PUlocationID', 'DOlocationID'):
        # Replace NA values with -1, then change the type to string.
        df[col] = df[col].fillna(-1).astype(str)

    return df

def prepare_features(df_train, df_val, features_cols, target_col):
    """Vectorize the feature columns and extract the target arrays.

    Parameters
    ----------
    df_train, df_val : DataFrames holding the training and validation records.
    features_cols : list of column names used as features.
    target_col : name of the target column.

    Returns
    -------
    ``(X_train, y_train, X_val, y_val, dv)`` where ``dv`` is the
    ``DictVectorizer`` fitted on the training records.
    """
    def as_records(frame):
        # One dict per row, restricted to the feature columns.
        return frame[features_cols].to_dict(orient='records')

    dv = DictVectorizer()

    # The vectorizer is fitted on the training records only; the validation
    # records are transformed with that same fitted vectorizer.
    X_train = dv.fit_transform(as_records(df_train))
    X_val = dv.transform(as_records(df_val))

    y_train = df_train[target_col].values
    y_val = df_val[target_col].values

    return X_train, y_train, X_val, y_val, dv

def train_model(X_train, y_train):
    """Fit a ``LinearRegression`` on the given features and targets.

    Returns
    -------
    The fitted ``LinearRegression`` instance.
    """
    # ``fit`` returns the estimator itself, so the fitted model can be
    # returned directly.
    return LinearRegression().fit(X_train, y_train)

def evaluate_model(model, X, y):
    """Return the root mean squared error (RMSE) of ``model`` on ``(X, y)``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix passed to ``model.predict``.
    y : true target values.

    Returns
    -------
    The RMSE between ``y`` and the model's predictions.
    """
    y_pred = model.predict(X)

    # The ``squared=False`` argument of ``mean_squared_error`` was deprecated
    # in scikit-learn 1.4 and removed in 1.6, so the root is taken here
    # instead. Taking it per output before averaging matches what
    # ``squared=False`` computed, and works on old and new releases alike.
    per_output_mse = mean_squared_error(y, y_pred, multioutput='raw_values')

    return (per_output_mse ** 0.5).mean()

def export_model(filename, dv, model):
    """Pickle the ``(dv, model)`` pair to ``filename``.

    Parameters
    ----------
    filename : destination path of the pickle file.
    dv : the vectorizer to store.
    model : the model to store.

    Returns
    -------
    None. The two objects are written as a single tuple, so loading the file
    yields ``(dv, model)``.
    """
    from pathlib import Path

    target = Path(filename)
    # Create the destination directory when it is missing (e.g. on a fresh
    # checkout) instead of failing with FileNotFoundError.
    target.parent.mkdir(parents=True, exist_ok=True)

    with target.open('wb') as f_out:
        pickle.dump((dv, model), f_out)

### PIPELINE

In [3]:
# Raw trip records: January 2021 is used for training, February 2021 for validation.
df_train_raw = load_data('../data/fhv_tripdata_2021-01.parquet')
df_val_raw = load_data('../data/fhv_tripdata_2021-02.parquet')

# Durations are computed as a separate step, before the outlier filter, so the
# homework questions below can look at the unfiltered values.
df_train_duration = compute_duration(df_train_raw)
df_val_duration = compute_duration(df_val_raw)

# Outlier removal and location-ID encoding.
df_train = preprocess_data(df_train_duration)
df_val = preprocess_data(df_val_duration)

X_train, y_train, X_val, y_val, dv = prepare_features(
    df_train,
    df_val,
    features_cols=['PUlocationID', 'DOlocationID'],
    target_col='duration',
)

model = train_model(X_train, y_train)
rmse_train = evaluate_model(model, X_train, y_train)
rmse_val = evaluate_model(model, X_val, y_val)

# Persist the fitted vectorizer together with the model.
export_model('../models/trip_duration.pkl', dv, model)

#### Q1: Read the data for January. How many records are there?

In [5]:
# Number of rows in the raw January frame, before the duration filter is applied.
len(df_train_raw)

1154112

#### Q2: What's the average trip duration in January?

In [6]:
# Mean over every January trip: df_train_duration has not been through
# preprocess_data, so durations outside 1-60 minutes are still included.
df_train_duration['duration'].mean()

19.1672240937939

#### Q3: What's the fraction of missing values for the pickup location ID? I.e. the fraction of "-1"s after you filled the NAs.

In [7]:
# Missing pickup IDs were filled with -1 and then cast to str, which is why
# they are matched here as the string '-1.0'.
n_missing_pickup = int((df_train['PUlocationID'] == '-1.0').sum())

# Share of filtered training records with a missing pickup ID, as a percentage.
pct_missing = n_missing_pickup / len(df_train) * 100
pct_missing

83.52732770722618

#### Q4: Apply one-hot encoding to the pickup and dropoff location IDs. What's the dimensionality of this matrix? (The number of columns).

In [8]:
# Number of columns in the training matrix produced by the DictVectorizer.
X_train.shape[1]

525

#### Q5: What's the RMSE on train?

In [9]:
# RMSE of the model on the January (training) data, in minutes.
rmse_train

10.528519107206316

#### Q6: What's the RMSE on validation?

In [10]:
# RMSE of the model on the February (validation) data, in minutes.
rmse_val

11.014283149347039