## Question 3. Creating a pipeline

In [1]:
for date in ['2023-03']:
  data_url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{date}.parquet'
  data_path = "/".join(["data", data_url.split('/')[-1]])
  !mkdir -p 'data'
  !curl -s -S $data_url -o $data_path

In [2]:
from pandas import read_parquet, to_datetime

# Load the March 2023 trip file from the local data directory.
df = read_parquet('./data/yellow_tripdata_2023-03.parquet')

# Number of rows in the raw frame.
df.shape[0]

3403766

## Question 4. Data preparation

In [3]:
# Columns that transform() casts to str and feeds to the DictVectorizer.
categorical = [
  'PULocationID',
  'DOLocationID',
]

In [4]:
def clean(df):
  """Add a trip ``duration`` column (in minutes) and keep trips of 1-60 minutes.

  The input frame is left untouched: all work happens on a new frame, so the
  raw data stays available and the calling cell can be re-run safely.

  Args:
    df: DataFrame with ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
      columns holding datetimes or values that ``to_datetime`` can parse.

  Returns:
    A new DataFrame with the two timestamp columns converted to datetimes,
    a float ``duration`` column (drop-off minus pick-up, in minutes), and only
    the rows whose duration lies between 1 and 60 minutes inclusive. The
    original index labels of the kept rows are preserved.
  """
  dropoff = to_datetime(df.tpep_dropoff_datetime)
  pickup = to_datetime(df.tpep_pickup_datetime)

  # assign() builds a new frame instead of writing into the caller's one.
  trips = df.assign(
    tpep_dropoff_datetime=dropoff,
    tpep_pickup_datetime=pickup,
    duration=(dropoff - pickup).dt.total_seconds() / 60,
  )

  in_range = (trips.duration >= 1) & (trips.duration <= 60)

  # Explicit copy so later column assignments on the result do not trigger
  # SettingWithCopyWarning.
  return trips[in_range].copy()

In [5]:
# Replace df with its cleaned version and show how many trips remain.
df = clean(df)
df.shape[0]

3316216

## Question 5. Train a model

Fit a dict vectorizer

In [6]:
from sklearn.feature_extraction import DictVectorizer

def transform(df):
  """Fit a DictVectorizer on the categorical columns of ``df``.

  The columns listed in the module-level ``categorical`` are cast to ``str``
  on a local selection, so the caller's DataFrame is not modified.

  Args:
    df: DataFrame containing every column named in ``categorical``.

  Returns:
    A ``(dv, matrix)`` tuple: the fitted DictVectorizer and the matrix that
    its ``fit_transform`` produced from the per-row feature dicts.
  """
  dv = DictVectorizer()

  # Cast a selection rather than assigning back into df: this avoids both the
  # hidden side effect on the caller's frame and SettingWithCopyWarning.
  features = df[categorical].astype(str)
  train_dicts = features.to_dict(orient='records')
  matrix = dv.fit_transform(train_dicts)

  return dv, matrix

Get a dict vectorizer and a matrix of shape (n_samples, n_features)

In [7]:
# Fit the vectorizer on df; x_train is the matrix returned alongside it.
dv, x_train = transform(df)

Get a vector - the "duration" column values

In [8]:
# Name of the column to predict, and its values as a NumPy array.
target = "duration"
y_train = df[target].to_numpy()

In [9]:
from sklearn.linear_model import LinearRegression

def train_model(x_matrix, y_vector):
  """Fit a LinearRegression with default parameters and return it.

  ``x_matrix`` (features) and ``y_vector`` (targets) are passed unchanged to
  ``LinearRegression.fit``.
  """
  # fit() returns the estimator itself, so the fitted model can be returned
  # directly.
  return LinearRegression().fit(x_matrix, y_vector)

Train a linear regression with default parameters and save the dict vectorizer

In [10]:
import pickle

lr = train_model(x_train, y_train)
# A notebook cell only echoes its last expression, so a bare `lr.intercept_`
# in the middle of the cell shows nothing; print it explicitly.
print(lr.intercept_)

# NOTE(review): despite the "lin_reg" file name, only the DictVectorizer is
# pickled here, not the fitted model. Confirm that this is intended.
local_artifact_path = 'lin_reg.bin'

with open(local_artifact_path, 'wb') as f_out:
  pickle.dump(dv, f_out)

## Question 6. MLFlow

In [11]:
import mlflow

# Point MLflow at the tracking server on localhost:5000 and select the
# experiment that the following run will be recorded under.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("my-experiment")
# Turn on scikit-learn autologging before the model is fitted inside the run.
mlflow.sklearn.autolog()
with mlflow.start_run():
  # Re-train inside the run context; this rebinds `lr` from the earlier cell.
  lr = train_model(x_train, y_train)
  mlflow.log_metric("intercept", lr.intercept_)
  # Attach the pickle written earlier (local_artifact_path) under
  # "models_pickle" in the run's artifacts.
  mlflow.log_artifact(local_path=local_artifact_path, artifact_path="models_pickle")



Get the model size from the artifacts

In [12]:
from mlflow import MlflowClient

mlclient = MlflowClient("sqlite:///backend.db")

# Resolve the experiment by name instead of hard-coding its numeric id, which
# depends on the order in which experiments were created in this backend.
# The name must match the one passed to mlflow.set_experiment().
experiment = mlclient.get_experiment_by_name("my-experiment")
if experiment is None:
  raise RuntimeError('MLflow experiment "my-experiment" not found in sqlite:///backend.db')

# Fetch only the most recent run; the ordering is spelled out so that
# "latest" does not rely on the backend's default sort.
runs = mlclient.search_runs(
    experiment_ids=[experiment.experiment_id],
    filter_string="",
    order_by=["attributes.start_time DESC"],
    max_results=1
)
if not runs:
  raise RuntimeError('No runs found for MLflow experiment "my-experiment"')

# List the files under the run's "model" artifact directory; the FileInfo
# entries include each file's size.
mlclient.list_artifacts(run_id=runs[0].info.run_id, path="model")

[<FileInfo: file_size=728, is_dir=False, path='model/MLmodel'>,
 <FileInfo: file_size=248, is_dir=False, path='model/conda.yaml'>,
 <FileInfo: file_size=None, is_dir=True, path='model/metadata'>,
 <FileInfo: file_size=4500, is_dir=False, path='model/model.pkl'>,
 <FileInfo: file_size=120, is_dir=False, path='model/python_env.yaml'>,
 <FileInfo: file_size=125, is_dir=False, path='model/requirements.txt'>]