# Kubeflow pipeline demo

### Load data

#### Traditional way to write

In [10]:
"""
Load data from target resources
"""

import pandas as pd
from sklearn.datasets import fetch_california_housing


# Function to return the original data
def load_my_data():
    """
    Fetch the California housing dataset and return it as a DataFrame.

    Returns:
        The ``frame`` attribute of the object returned by
        ``fetch_california_housing(as_frame=True)``.
    """
    # A production loader would typically issue requests or pull a table
    # from a data warehouse instead, e.g.
    # pd.read_csv(f"{loc}/{tablename}").
    housing_bunch = fetch_california_housing(as_frame=True)
    return housing_bunch.frame


In [11]:
# Load the dataset into memory via the plain-Python loader defined above.
df = load_my_data()

In [14]:
# Write the frame to a local CSV: column names are kept as a header row
# (header=True) and the row index is left out of the file (index=False).
df.to_csv('to_csv_test.csv', index=False, header=True)

In [17]:
# Read the CSV written in the previous cell back into a DataFrame.
df = pd.read_csv('to_csv_test.csv')

In [18]:
# Display the reloaded frame (the notebook renders it as the cell output).
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


#### Kubeflow pipeline

In [22]:
import kfp
import kfp.components as comp


"""
Load data from target resources
"""
# Function to return the original data
def load_my_data(output_csv: comp.OutputPath('CSV')):
    """
    Fetch the California housing dataset and write it to ``output_csv``.

    Args:
        output_csv: Destination path of the CSV file; declared as a
            ``comp.OutputPath('CSV')``.
    """
    # Imports are kept inside the function body, next to the code that
    # uses them, so the function carries its own dependencies.
    import pandas as pd
    from sklearn.datasets import fetch_california_housing

    # A production loader would typically issue requests or pull a table
    # from a data warehouse instead, e.g.
    # pd.read_csv(f"{loc}/{tablename}").
    housing_frame = fetch_california_housing(as_frame=True).frame

    # Persist the frame with a header row and without the row index.
    housing_frame.to_csv(output_csv, header=True, index=False)


#### Create a pipeline component from the function

In [23]:
# Turn load_my_data into a pipeline component that runs on the
# python:3.10 image with the pinned packages installed.
load_data_from_web = comp.create_component_from_func(
    load_my_data,
    base_image='python:3.10',
    packages_to_install=['pandas==1.5.3', 'scikit-learn==1.2.2'],
    # Optional: saves the component spec to a file for future use.
    output_component_file='data_load.yaml',
)

### Preprocess the data

#### Traditional way to write

In [3]:
"""
Preprocess data for ml model
"""
import pandas as pd

def preprocess_my_data(df) -> pd.DataFrame:
    """
    Add a binned version of ``AveRooms`` to the frame.

    The ``AveRooms`` column is cut into the intervals (0, 3], (3, 5],
    (5, 7], (7, 10] and (10, 300], labelled 1 to 5, and the numeric label
    is stored in a new ``AveRooms_bin`` column.

    Args:
        df: Frame containing an ``AveRooms`` column. It is modified in
            place.

    Returns:
        The same ``df`` object, with ``AveRooms_bin`` added.
    """
    bin_edges = [0, 3, 5, 7, 10, 300]
    bin_labels = [1, 2, 3, 4, 5]

    # pd.cut yields a categorical; to_numeric turns the labels into numbers.
    rooms_category = pd.cut(x=df['AveRooms'], bins=bin_edges, labels=bin_labels)
    df['AveRooms_bin'] = pd.to_numeric(rooms_category)

    return df


#### Kubeflow pipeline

In [24]:
def preprocess_my_data(input_csv: comp.InputPath('CSV'), output_csv: comp.OutputPath('CSV')):
    """
    Read a CSV, add a binned ``AveRooms`` feature, and write the result.

    The ``AveRooms`` column is cut into the intervals (0, 3], (3, 5],
    (5, 7], (7, 10] and (10, 300], labelled 1 to 5, and the numeric label
    is stored in a new ``AveRooms_bin`` column.

    Args:
        input_csv: Path of the CSV to read; it must have a header row that
            includes an ``AveRooms`` column.
        output_csv: Path the preprocessed CSV is written to.
    """
    import pandas as pd

    df = pd.read_csv(input_csv)

    # Create a new feature
    df['AveRooms_bin'] = pd.to_numeric(pd.cut(x=df['AveRooms'],
                                              bins=[0, 3, 5, 7, 10, 300],
                                              labels=[1, 2, 3, 4, 5]))

    # Keep the header row: without it the column names (including the new
    # AveRooms_bin) are lost and a default pd.read_csv on this file would
    # treat the first data row as the header.
    df.to_csv(output_csv, index=False, header=True)

#### Create a pipeline component from the function

In [25]:
# Turn preprocess_my_data into a pipeline component that runs on the
# python:3.10 image with pandas installed.
preprocess_data = comp.create_component_from_func(
    preprocess_my_data,
    base_image='python:3.10',
    packages_to_install=['pandas==1.5.3'],
    # Optional: saves the component spec to a file for future use.
    output_component_file='data_preprocess.yaml',
)

## Create a pipeline and compile

In [32]:
# Define a pipeline and create a task from a component:
def my_ml_pipeline():
    # Step 1: create the data-loading task from its component.
    loading_step = load_data_from_web()

    # Step 2: feed the loader's 'output_csv' output into the preprocessing
    # component as its 'input_csv' input.
    preprocessing_step = preprocess_data(
        input_csv=loading_step.outputs['output_csv'],
    )
    # The preprocessed CSV can be referenced by a later task through
    # preprocessing_step.outputs['output_csv'].

In [33]:
# Compile the pipeline function into a package file on disk.
kfp.compiler.Compiler().compile(
    package_path='ml_pipeline_demo.yaml',
    pipeline_func=my_ml_pipeline,
)