# Train a Model on Flyte

First, install the necessary packages:

In [None]:
%pip install "flytekit==1.14.4" union "pydantic>2" pandas pyarrow scikit-learn joblib

In [1]:
import os

# Object-store credentials picked up from the environment by the client.
# NOTE(review): these look like the default MinIO credentials of a local
# Flyte sandbox — confirm against your deployment.
os.environ.update(
    {
        "AWS_ACCESS_KEY_ID": "minio",
        "AWS_SECRET_ACCESS_KEY": "miniostorage",
    }
)

## Create a Dataset

The following code creates a dataset for us to train a model on:

In [2]:
import flytekit as fl
import pandas as pd
import sys


# Container image used by the tasks in this notebook. The Python version is
# pinned to the notebook interpreter's major.minor version (presumably so
# code shipped from the notebook loads cleanly in the container).
image = fl.ImageSpec(
    name="jupyter-notebook-workshop",
    registry="localhost:30000",
    python_version=".".join(map(str, sys.version_info[:2])),
    packages=[
        "pandas",
        "pyarrow",
        "flytekit==1.14.4",
    ],
)

# Task decorator pre-configured to run in the image defined above.
task = fl.task(container_image=image)


@task
def get_df() -> pd.DataFrame:
    """Build the small student dataset used throughout this notebook.

    Returns:
        A five-row DataFrame with the columns ``Name``, ``Age``, ``Grade``
        and ``PassedTest``.
    """
    columns = {
        "Name": ["Alice", "Bob", "Charlie", "David", "Eva"],
        "Age": [23, 25, 22, 24, 23],
        "Grade": ["A", "B", "A", "C", "B"],
        "PassedTest": [True, False, True, False, True],
    }
    return pd.DataFrame(columns)

Next we create a remote client that can execute tasks on the local Flyte sandbox cluster:

In [3]:
# Client for the Flyte sandbox. Executions launched through it default to the
# "flytesnacks" project and "development" domain.
# NOTE(review): interactive mode is presumably what allows tasks defined in
# notebook cells to be shipped to the cluster — see the FlyteRemote docs.
remote = fl.FlyteRemote.for_sandbox(
    interactive_mode_enabled=True,
    default_domain="development",
    default_project="flytesnacks",
)

Create the dataframe:

In [4]:
# Reuse the `remote` client created in the previous cell; constructing a
# second, identical client here is redundant.
# get_df takes no arguments, so the inputs mapping is empty.
exe = remote.execute(get_df, inputs={})
exe

[34mImage localhost:30000/jupyter-notebook-workshop:tsRbQIdw3EzEA_W_MitThA found. Skip building.[0m


Wait for the execution to complete, then load the dataframe into memory:

In [5]:
# Block until the remote execution finishes, polling with poll_interval=1
# (presumably seconds).
exe.wait(poll_interval=1)
# get_df declares a single unnamed return value; 'o0' is presumably the
# default name Flyte assigns to that first output.
dataframe = exe.outputs['o0']
# Trailing expression so the notebook displays the dataframe.
dataframe

Unnamed: 0,Name,Age,Grade,PassedTest
0,Alice,23,A,True
1,Bob,25,B,False
2,Charlie,22,A,True
3,David,24,C,False
4,Eva,23,B,True


You can now play around with the dataframe directly in the jupyter runtime:

In [6]:
def local_function(dataframe: pd.DataFrame):
    """Return the sum of the ``Age`` column for each distinct ``Grade``.

    Args:
        dataframe: Table containing at least the ``Grade`` and ``Age`` columns.

    Returns:
        A Series named ``Age``, indexed by grade, holding the summed ages.
    """
    ages_by_grade = dataframe.groupby("Grade")["Age"]
    return ages_by_grade.sum()

# Runs in the notebook process on the dataframe loaded from the remote execution.
local_function(dataframe)

Grade
A    45
B    48
C    24
Name: Age, dtype: int64

## Train a Model

Next we define a task that trains a model:

In [9]:
from flytekit.types.file import FlyteFile


# Task decorator whose image extends the base image with the training-only
# dependencies.
training_task = fl.task(
    container_image=image.with_packages(["scikit-learn", "joblib"]),
)


@training_task
def train_model(dataframe: pd.DataFrame) -> FlyteFile:
    """Fit a logistic regression predicting ``PassedTest`` from ``Age``.

    Args:
        dataframe: Table containing the ``Age`` and ``PassedTest`` columns.

    Returns:
        A FlyteFile pointing at ``model.pkl``, the fitted model serialized
        with joblib.
    """
    # Imported inside the task: these packages are only added to the
    # training image, not to the base image.
    import joblib
    from sklearn.linear_model import LogisticRegression

    features = dataframe[["Age"]]
    target = dataframe["PassedTest"]
    classifier = LogisticRegression().fit(features, target)

    model_path = "model.pkl"
    with open(model_path, "wb") as model_out:
        joblib.dump(classifier, model_out)

    return FlyteFile(path=model_path)


@fl.workflow
def train_wf() -> FlyteFile:
    """Create the dataset with ``get_df`` and train a model on it.

    Returns:
        The model file produced by ``train_model``.
    """
    return train_model(dataframe=get_df())

Execute the training run:

In [10]:
# train_wf declares no parameters (it builds its own dataset via get_df), so
# no inputs are passed; supplying a "dataframe" key would not match the
# workflow's interface.
model_exe = remote.execute(train_wf, inputs={})
model_exe

Now let's load the model into the Jupyter runtime:

In [None]:
import joblib

# Block until the training workflow finishes, then take its single output.
model_exe.wait(poll_interval=1)
model_file = model_exe.outputs["o0"]

# Deserialize the model that train_model wrote with joblib.dump.
with open(model_file, "rb") as model_in:
    model = joblib.load(model_in)

# Trailing expression so the notebook displays the loaded model.
model

Finally, we generate some predictions with the model:

In [None]:
# Single-column frame matching the "Age" feature the model was fitted on.
ages = [23, 25, 22, 24, 23]
prediction_data = pd.DataFrame({"Age": ages})

model.predict(prediction_data)