In [1]:
from zenml import step, pipeline
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [2]:

@step
def ingest_data() -> pd.DataFrame:
    """Build one DataFrame from scikit-learn's diabetes dataset.

    Returns:
        A frame whose columns are the dataset's ``feature_names`` followed
        by a ``target`` column holding the regression target.
    """
    bunch = load_diabetes()
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    # assign() appends the label as the last column and returns a new frame.
    return features.assign(target=bunch.target)

In [3]:

@step
def clean_data(data: pd.DataFrame) -> pd.DataFrame:
    """Remove every row that contains at least one missing value.

    Args:
        data: Frame to clean; it is not modified.

    Returns:
        A new frame containing only the fully populated rows.
    """
    # axis=0 / how="any" are the dropna defaults, spelled out for readability.
    complete_rows = data.dropna(axis=0, how="any")
    return complete_rows

In [4]:

@step
def train_model(
    data: pd.DataFrame,
    test_size: float = 0.2,
    random_state: int = 42,
) -> LinearRegression:
    """Fit a linear regression on the training portion of ``data``.

    Args:
        data: Frame with the feature columns plus a ``target`` column.
        test_size: Fraction of rows withheld from training. Defaults to 0.2.
        random_state: Seed for the shuffled split. Defaults to 42.

    Returns:
        The fitted ``LinearRegression`` model.

    Note:
        ``evaluate_model`` re-creates the split independently, so it must be
        given the same ``test_size`` and ``random_state``; otherwise rows used
        for training can end up in the evaluation set.
    """
    features = data.drop(columns='target')
    labels = data['target']
    # Only the training portion is needed here; the held-out rows are discarded.
    X_train, _, y_train, _ = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

In [5]:

@step
def evaluate_model(
    model: LinearRegression,
    data: pd.DataFrame,
    test_size: float = 0.2,
    random_state: int = 42,
) -> None:
    """Score ``model`` on the held-out portion of ``data`` and print the metrics.

    Args:
        model: A fitted ``LinearRegression``.
        data: Frame with the feature columns plus a ``target`` column; it
            should be the same frame the model was trained from.
        test_size: Fraction of rows used as the evaluation set. Defaults to 0.2.
        random_state: Seed for the shuffled split. Defaults to 42.

    Returns:
        None. Mean squared error and R2 are written to stdout.

    Note:
        The split is recomputed here rather than passed in, so ``test_size``
        and ``random_state`` must equal the values ``train_model`` used;
        otherwise the evaluation rows overlap with the training rows.
    """
    features = data.drop(columns='target')
    labels = data['target']
    # Only the held-out portion is needed here; the training rows are discarded.
    _, X_test, _, y_test = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Mean Squared Error: {mse}")
    print(f"R2 Score: {r2}")

In [6]:

@pipeline
def basic_regression_pipeline():
    """Wire the steps together: ingest -> clean -> train -> evaluate.

    The cleaned frame is passed to both the training and the evaluation step.
    """
    cleaned = clean_data(ingest_data())
    fitted = train_model(cleaned)
    evaluate_model(fitted, cleaned)

In [7]:

if __name__ == "__main__":
    # Calling the @pipeline-decorated function starts a pipeline run
    # (see the run log captured below this cell).
    basic_regression_pipeline()

[1;35mInitiating a new run for the pipeline: [0m[1;36mbasic_regression_pipeline[1;35m.[0m
[1;35mRegistered new pipeline: [0m[1;36mbasic_regression_pipeline[1;35m.[0m
[1;35mUsing user: [0m[1;36mdefault[1;35m[0m
[1;35mUsing stack: [0m[1;36mdefault[1;35m[0m
[1;35m  orchestrator: [0m[1;36mdefault[1;35m[0m
[1;35m  artifact_store: [0m[1;36mdefault[1;35m[0m
[1;35mYou can visualize your pipeline runs in the [0m[1;36mZenML Dashboard[1;35m. In order to try it locally, please run [0m[1;36mzenml login --local[1;35m.[0m
[1;35mStep [0m[1;36mingest_data[1;35m has started.[0m
[ingest_data] [33mBy default, the [0m[1;36mPandasMaterializer[33m stores data as a [0m[1;36m.csv[33m file. If you want to store data more efficiently, you can install [0m[1;36mpyarrow[33m by running '[0m[1;36mpip install pyarrow[33m'. This will allow [0m[1;36mPandasMaterializer[33m to automatically store the data as a [0m[1;36m.parquet[33m file instead.[0m
[1;35mStep

  df = pd.read_csv(f, index_col=0, parse_dates=True)


[1;35mStep [0m[1;36mclean_data[1;35m has finished in [0m[1;36m4.755s[1;35m.[0m
[1;35mStep [0m[1;36mtrain_model[1;35m has started.[0m
[train_model] [33mBy default, the [0m[1;36mPandasMaterializer[33m stores data as a [0m[1;36m.csv[33m file. If you want to store data more efficiently, you can install [0m[1;36mpyarrow[33m by running '[0m[1;36mpip install pyarrow[33m'. This will allow [0m[1;36mPandasMaterializer[33m to automatically store the data as a [0m[1;36m.parquet[33m file instead.[0m


  df = pd.read_csv(f, index_col=0, parse_dates=True)


[1;35mStep [0m[1;36mtrain_model[1;35m has finished in [0m[1;36m4.573s[1;35m.[0m
[1;35mStep [0m[1;36mevaluate_model[1;35m has started.[0m
[evaluate_model] [33mBy default, the [0m[1;36mPandasMaterializer[33m stores data as a [0m[1;36m.csv[33m file. If you want to store data more efficiently, you can install [0m[1;36mpyarrow[33m by running '[0m[1;36mpip install pyarrow[33m'. This will allow [0m[1;36mPandasMaterializer[33m to automatically store the data as a [0m[1;36m.parquet[33m file instead.[0m
[evaluate_model] Mean Squared Error: 2900.193628493483
[evaluate_model] R2 Score: 0.45260276297191915
[1;35mStep [0m[1;36mevaluate_model[1;35m has finished in [0m[1;36m0.204s[1;35m.[0m
[1;35mPipeline run has finished in [0m[1;36m18.709s[1;35m.[0m


  df = pd.read_csv(f, index_col=0, parse_dates=True)
