In [1]:
import os

import mlrun
# Show the installed MLRun client version so it can be compared with the
# "mlrun/mlrun:<version>" images the functions below are registered with.
print(mlrun.__version__)

1.6.1


In [3]:
# NOTE(review): the recorded outputs and the git source configured later in this
# notebook refer to 'gitlab-example' -- confirm the intended project name.
project_name = 'github-example'
project = mlrun.get_or_create_project(project_name, context="./")

# Never commit a real token in the notebook: read it from the environment instead.
# The secret is only pushed when a value is present, so re-running this cell does
# not store an empty GIT_TOKEN in the project's Kubernetes secrets.
git_token = os.environ.get("GIT_TOKEN", "")
if git_token:
    project.set_secrets(secrets={"GIT_TOKEN": git_token}, provider="kubernetes")

> 2023-08-30 04:25:21,122 [info] Project loaded successfully: {'project_name': 'gitlab-example'}


In [4]:
%%writefile coffeerating-data-generator.py
import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LinearRegression

def coffeerating_data_generator(context):
    """Fetch the coffee-ratings CSV, keep the scoring columns and log them as a dataset.

    Args:
        context: run context object; only ``log_dataset`` and ``artifact_subpath``
            are used here.

    Returns:
        A ``(coffee, 'outcome')`` tuple, where ``coffee`` is the DataFrame that was
        logged under the ``coffee_dataset`` key.
    """
    source_url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-07/coffee_ratings.csv"
    # Overall score first, followed by the individual sensory scores.
    selected_columns = [
        "total_cup_points",
        "aroma",
        "flavor",
        "sweetness",
        "acidity",
        "body",
        "uniformity",
        "balance",
    ]

    ratings = pd.read_csv(source_url)
    coffee = pd.DataFrame(ratings[selected_columns])

    # Store the reduced frame as a parquet dataset artifact under the
    # 'coffee-dataset' sub-path of the run's artifact location.
    target_path = context.artifact_subpath('coffee-dataset')
    context.log_dataset(
        "coffee_dataset",
        df=coffee,
        format='parquet',
        index=False,
        artifact_path=target_path,
    )

    # NOTE(review): the recorded run shows this tuple stringified under the
    # `return` result of the job -- confirm that returning the frame is intended.
    return coffee, 'outcome'

Overwriting coffeerating-data-generator.py


In [5]:
# Register the generator script written above as a "job" function on the project,
# using its `coffeerating_data_generator` handler as the entry point.
coffee_data_gen_fn = project.set_function(
    func='coffeerating-data-generator.py',
    name='coffeerating-data-function',
    handler='coffeerating_data_generator',
    kind="job",
    image="mlrun/mlrun:1.4.1",
    with_repo=False,
)

In [6]:
# Execute the generator function as a non-local run; the returned run object is
# reused later to look up the logged `coffee_dataset` output.
coffee_data_run = project.run_function(
    coffee_data_gen_fn,
    local=False,
)

> 2023-08-30 04:25:22,789 [info] Storing function: {'name': 'coffeerating-data-function-coffeerating-data-generator', 'uid': 'd533916fbf1f4a5aa62b2fa58d11fbcb', 'db': 'http://mlrun-api:8080'}
> 2023-08-30 04:25:23,177 [info] Job is running in the background, pod: coffeerating-data-function-coffeerating-data-generator-59gcf
> 2023-08-30 04:25:33,123 [info] Run execution finished: {'status': 'completed', 'name': 'coffeerating-data-function-coffeerating-data-generator'}
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
gitlab-example,...8d11fbcb,0,Aug 30 04:25:31,completed,coffeerating-data-function-coffeerating-data-generator,v3io_user=xingshengkind=jobowner=xingshengmlrun/client_version=1.4.1mlrun/client_python_version=3.9.16host=coffeerating-data-function-coffeerating-data-generator-59gcf,,,"return=( total_cup_points aroma flavor ... body uniformity balance\n0 90.58 8.67 8.83 ... 8.50 10.00 8.42\n1 89.92 8.75 8.67 ... 8.42 10.00 8.42\n2 89.75 8.42 8.50 ... 8.33 10.00 8.42\n3 89.00 8.17 8.58 ... 8.50 10.00 8.25\n4 88.83 8.25 8.50 ... 8.42 10.00 8.33\n... ... ... ... ... ... ... ...\n1334 78.75 7.75 7.58 ... 5.08 10.00 7.83\n1335 78.08 7.50 7.67 ... 5.17 10.00 5.25\n1336 77.17 7.33 7.33 ... 7.50 9.33 7.17\n1337 75.08 7.42 6.83 ... 7.25 9.33 7.00\n1338 73.75 6.75 6.67 ... 6.92 9.33 6.83\n\n[1339 rows x 8 columns], 'outcome')",coffee_dataset





> 2023-08-30 04:25:36,430 [info] Run execution finished: {'status': 'completed', 'name': 'coffeerating-data-function-coffeerating-data-generator'}


In [7]:
%%writefile trainer.py

import pandas as pd
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
import mlrun
from mlrun.frameworks.sklearn import apply_mlrun

@mlrun.handler()
def train(context,
    dataset: pd.DataFrame,
    model_name: str = "lr_fit",
    test_size: float = 0.2,
    random_state: int = 42,
    ):
    """Fit a linear regression that predicts ``total_cup_points`` from the other columns.

    Args:
        context: run context supplied by the caller (not used directly here).
        dataset: frame containing a ``total_cup_points`` column plus the feature columns.
        model_name: name passed to ``apply_mlrun`` for the model.
        test_size: fraction of rows held out for evaluation (previously fixed at 0.2).
        random_state: seed for the train/test split, so repeated runs evaluate on the
            same hold-out rows and report comparable metrics.
    """
    features = dataset.drop('total_cup_points', axis=1)
    target = dataset['total_cup_points']
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )

    model = LinearRegression()

    # apply_mlrun is called before fit() and receives the hold-out split;
    # presumably this is what makes fit() log the metrics and model artifacts seen
    # in the recorded run -- see the MLRun sklearn framework docs.
    apply_mlrun(model=model, model_name=model_name, x_test=X_test, y_test=y_test)

    model.fit(X_train, y_train)



Overwriting trainer.py


In [8]:
# Register trainer.py as a "job" function whose entry point is the `train` handler.
trainer = project.set_function(
    "trainer.py",
    name="coffeerating-trainer",
    handler="train",
    kind="job",
    image="mlrun/mlrun:1.4.1",
    with_repo=False,
)

In [9]:
# Run the registered trainer, feeding it the dataset output of the generator run.
trainer_run = project.run_function(
    "coffeerating-trainer",
    inputs={
        "dataset": coffee_data_run.outputs['coffee_dataset'],
    },
    local=False,
)
# Last expression: display the run's outputs (metrics and artifact URIs).
trainer_run.outputs

> 2023-08-30 04:25:36,606 [info] Storing function: {'name': 'coffeerating-trainer-train', 'uid': '7938a5cabfe043318ce34e7631d53545', 'db': 'http://mlrun-api:8080'}
> 2023-08-30 04:25:36,915 [info] Job is running in the background, pod: coffeerating-trainer-train-dslhk
> 2023-08-30 04:25:42,147 [info] downloading v3io:///projects/gitlab-example/artifacts/coffee-dataset/coffeerating-data-function-coffeerating-data-generator/0/coffee_dataset.parquet to local temp file
> 2023-08-30 04:25:43,028 [info] Run execution finished: {'status': 'completed', 'name': 'coffeerating-trainer-train'}
final state: completed


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
gitlab-example,...31d53545,0,Aug 30 04:25:41,completed,coffeerating-trainer-train,v3io_user=xingshengkind=jobowner=xingshengmlrun/client_version=1.4.1mlrun/client_python_version=3.9.16host=coffeerating-trainer-train-dslhk,dataset,,mean_absolute_error=0.36552623788055605r2_score=0.9380282286082497root_mean_squared_error=0.6505720930344696mean_squared_error=0.42324404823525064,feature-importancetest_setmodel





> 2023-08-30 04:25:45,024 [info] Run execution finished: {'status': 'completed', 'name': 'coffeerating-trainer-train'}


{'mean_absolute_error': 0.36552623788055605,
 'r2_score': 0.9380282286082497,
 'root_mean_squared_error': 0.6505720930344696,
 'mean_squared_error': 0.42324404823525064,
 'feature-importance': 'v3io:///projects/gitlab-example/artifacts/coffeerating-trainer-train/0/feature-importance.html',
 'test_set': 'store://artifacts/gitlab-example/coffeerating-trainer-train_test_set:7938a5cabfe043318ce34e7631d53545',
 'model': 'store://artifacts/gitlab-example/lr_fit:7938a5cabfe043318ce34e7631d53545'}

In [10]:
%%writefile './workflow.py'

from kfp import dsl
import mlrun

# Create a Kubeflow Pipelines pipeline
@dsl.pipeline(
    name="Coffee Rating Pipeline"
)
def pipeline():
    """Two-step workflow: generate the coffee dataset, then train a model on it."""

    # Run the data creation function. The name must match the one the function
    # was registered under on the project ("coffeerating-data-function", with
    # dashes), otherwise the step cannot be resolved.
    ingest = mlrun.run_function(
        "coffeerating-data-function",
        name="coffee-data-generation-step",
        outputs=['coffee_dataset'],
        local=False,
    )

    # Train a model using the trainer function, consuming the dataset produced
    # by the ingest step.
    mlrun.run_function(
        "coffeerating-trainer",
        inputs={"dataset": ingest.outputs['coffee_dataset']},
        outputs=["model"],
        local=False
    )

Overwriting ./workflow.py


In [11]:
# Attach the workflow file written above to the project under the name 'main'.
project.set_workflow(
    name='main',
    workflow_path='workflow.py',
)

In [12]:
# Point the project at its git repository (main branch) without pulling the
# source at runtime, then dump the resulting project spec for inspection.
project.set_source(
    source="git://gitlab.com/xsqian/mlrun-gitlab-example.git#main",
    pull_at_runtime=False,
)
print(project.to_yaml())

kind: project
metadata:
  name: gitlab-example
  created: '2023-08-30T00:17:24.504000'
spec:
  functions:
  - url: my_job.py
    name: greetings
    kind: job
    image: mlrun/mlrun:1.4.1
    handler: handler
    with_repo: true
  - url: coffeerating-data-generator.py
    name: coffeerating-data-function
    kind: job
    image: mlrun/mlrun:1.4.1
    handler: coffeerating_data_generator
  - url: trainer.py
    name: coffeerating-trainer
    kind: job
    image: mlrun/mlrun:1.4.1
    handler: train
  workflows:
  - path: workflow.py
    name: main
  artifacts: []
  conda: ''
  source: git://gitlab.com/xsqian/mlrun-gitlab-example.git#main
  origin_url: git://gitlab.com/xsqian/mlrun-gitlab-example.git#refs/heads/main
  load_source_on_run: false
  desired_state: online
  owner: xingsheng
  build:
    commands: []
    requirements: []
  custom_packagers: []
status:
  state: online



In [17]:
# Save the project; the call's return value (the project object) is shown as
# this cell's output.
project.save()

<mlrun.projects.project.MlrunProject at 0x7fb4d2d40940>

In [13]:
# Schedule the 'main' workflow with a cron expression (every 10 minutes).
# NOTE(review): dirty=True presumably allows running with uncommitted local
# changes -- confirm against the MLRun project.run documentation.
run_id = project.run(
    'main',
    schedule='*/10 * * * *',
    dirty=True,
)

> 2023-08-30 04:26:48,086 [info] executing workflow scheduling 'workflow-runner-main' remotely with kfp engine
> 2023-08-30 04:26:48,089 [info] Storing function: {'name': 'main', 'uid': '5f0d6f7b583349e2ad98349cb4f13381', 'db': None}
> 2023-08-30 04:26:48,698 [info] task schedule modified: {'schedule': '*/10 * * * *', 'project': 'gitlab-example', 'name': 'main'}
