In [2]:
import pandas as pd
import pickle
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def read_dataset(path):
    """Load a trip-record parquet file and prepare it for modelling.

    Steps performed:
      1. read the parquet file at ``path``;
      2. add a ``duration`` column holding the trip length in minutes
         (``tpep_dropoff_datetime - tpep_pickup_datetime``);
      3. keep only trips whose duration is between 1 and 60 minutes,
         both bounds inclusive;
      4. cast ``PULocationID`` and ``DOLocationID`` to strings so they can
         be used as categorical features.

    Parameters
    ----------
    path : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus ``duration``; rows keep
        their original index labels.
    """
    categorical = ["PULocationID", "DOLocationID"]

    trips = pd.read_parquet(path)

    # Vectorised timedelta -> minutes conversion; avoids calling a Python
    # lambda once per row.
    duration = (
        trips.tpep_dropoff_datetime - trips.tpep_pickup_datetime
    ).dt.total_seconds() / 60
    trips = trips.assign(duration=duration)

    # remove outliers (between() is inclusive on both ends by default)
    keep = trips.duration.between(1, 60)

    # astype() with a mapping returns a new frame, so nothing is ever
    # assigned onto a filtered slice (no SettingWithCopyWarning).
    return trips.loc[keep].astype({col: str for col in categorical})


# February is loaded through read_dataset (duration added, outliers removed,
# location IDs cast to str); January is read raw and processed step by step
# in the cells below.
fdf = read_dataset("./data/yellow_tripdata_2022-02.parquet")
jdf = pd.read_parquet("./data/yellow_tripdata_2022-01.parquet")

#### Q1. How many columns are in the dataset?
19

In [3]:
# Number of columns in the raw January frame.
jdf.shape[1]

19

#### Q2. What is the standard deviation of the trip durations in January?
46.45

In [4]:
# Trip duration in minutes. The .dt accessor converts the whole timedelta
# column at once instead of calling a Python lambda for every row.
jdf["duration"] = (
    jdf.tpep_dropoff_datetime - jdf.tpep_pickup_datetime
).dt.total_seconds() / 60

In [5]:
# Standard deviation of the trip duration (minutes), before outlier removal.
jdf["duration"].std()

46.44530513776499

#### Q3. What fraction of the records is left after dropping the outliers?
98.27%

In [6]:
# Row count before the outlier filter is applied.
initial = len(jdf)

In [7]:
# Keep trips lasting between 1 and 60 minutes (inclusive).
# .copy() makes the result an independent frame: later cells assign columns
# on jdf, and doing that on a boolean-indexed slice raises
# SettingWithCopyWarning.
jdf = jdf[(jdf.duration >= 1) & (jdf.duration <= 60)].copy()
no_outliers = jdf.shape[0]

In [8]:
# Share of records that survived the outlier filter, as a percentage.
(no_outliers / initial) * 100

98.27547930522405

#### Q4. Dimensionality of the matrix
515

In [9]:
categorical = ["PULocationID", "DOLocationID"]
# Cast the location IDs to str so they are treated as categories.
# astype() with a mapping returns a new frame, so no columns are assigned
# onto the filtered frame (avoids SettingWithCopyWarning).
jdf = jdf.astype({col: str for col in categorical})

In [10]:
# Target vector: trip duration in minutes.
target = 'duration'
y_train = jdf[target].to_numpy()

# One dict per trip holding only the categorical columns; the vectorizer is
# fitted here and reused later for the validation data.
train_dicts = jdf[categorical].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [11]:
# Number of feature columns produced by the vectorizer.
X_train.shape[1]

515

#### Q5. RMSE on train
6.99

In [12]:
# Fit a plain linear regression and evaluate it on the data it was trained on.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

# RMSE = sqrt(MSE). The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated in
# scikit-learn 1.4 and removed in 1.6.
rmse = mean_squared_error(y_train, y_pred) ** 0.5
rmse

6.98619123059128

#### Q6. RMSE on validation
7.79

In [13]:
# Validation target and features. The vectorizer is only transform()-ed here,
# so the validation matrix has the same columns as the training matrix.
y_val = fdf[target].to_numpy()
val_dicts = fdf[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [14]:
# Evaluate the model fitted on January against the February data.
y_pred = lr.predict(X_val)
# RMSE = sqrt(MSE); `squared=False` is deprecated in scikit-learn 1.4 and
# removed in 1.6, so the root is taken explicitly.
rmse = mean_squared_error(y_val, y_pred) ** 0.5
rmse

7.786413522518241

In [24]:
# Persist the fitted vectorizer and model together as a single (dv, lr) tuple.
with open("./models/lin_reg.bin", mode="wb") as model_file:
    pickle.dump((dv, lr), model_file)