# CSV to LibSVM data conversion

## Install dependencies

In [None]:
%%capture
!pip3 install --upgrade https://storage.googleapis.com/ml-pipeline/release/0.1.12/kfp.tar.gz

## Test dataset

For this example we use a [sample CSV](https://storage.googleapis.com/kf-pipeline-contrib-public/ai-hub-assets/shared-data/covtype/train.csv) data file.

In [1]:
print('The first 3 lines of the sample CSV file:')
!gsutil cat gs://kf-pipeline-contrib-public/release-0.1.1/kfp-components/data_converter/csv_libsvm/data/test.csv | head -n 3

The first 3 lines of the sample CSV file:
a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14,a15,a16,a17,a18,a19,a20,a21,a22,a23,a24,a25,a26,a27,a28,label
2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5


## Create a pipeline

The cell below compiles the pipeline into a `pipeline.tar.gz` file. You can upload that file through the Kubeflow Pipelines UI or run it programmatically.

In [None]:
import kfp
import kfp.compiler
import kfp.components
import kfp.dsl as dsl
import kfp.gcp as gcp

# Load the CSV -> LibSVM converter from its published component.yaml.
# The result is called like a function inside the pipeline definition.
CsvLibSVM = kfp.components.load_component_from_url(
    'https://storage.googleapis.com/kf-pipeline-contrib-public/release-0.1.1/kfp-components/data_converter/csv_libsvm/component.yaml'
)

@dsl.pipeline(name='Csv To Libsvm', description='One-Step pipeline')
def csv_libsvm_pipeline(
    input_file=dsl.PipelineParam(name="input-file"),
    output_file=dsl.PipelineParam(name="output-file"),
    label_name=dsl.PipelineParam(name="label-name")):
    """Single-step pipeline that runs the CsvLibSVM component.

    All three pipeline parameters are forwarded unchanged to the component's
    inputs of the same name.

    Args:
        input_file: Forwarded as the component's ``input_file``; the run cell
            of this notebook passes a ``gs://`` URI of a CSV file.
        output_file: Forwarded as the component's ``output_file``; presumably
            the destination of the converted data (TODO confirm against the
            component definition).
        label_name: Forwarded as the component's ``label_name``; the run cell
            passes ``'label'``, which matches a column in the sample CSV header.
    """
    # The step is not bound to a local name: no other step consumes it.
    # use_gcp_secret('user-gcp-sa') is applied to the step; presumably this
    # supplies the GCP credentials needed for gs:// access (TODO confirm).
    CsvLibSVM(
        input_file=input_file,
        output_file=output_file,
        label_name=label_name,
    ).apply(gcp.use_gcp_secret('user-gcp-sa'))

# Compile the pipeline function into the package file that the run cell
# (or a manual upload) consumes. `kfp.compiler` is a submodule and must be
# imported explicitly in this cell's import block; `import kfp` alone is not
# guaranteed to load it.
kfp.compiler.Compiler().compile(
    pipeline_func=csv_libsvm_pipeline,
    package_path='pipeline.tar.gz',
)

## Run an experiment with your parameters programmatically

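The cell below must be executed from within the cluster. Before running it, replace the `output_file` placeholder with a Cloud Storage location you can write to.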

In [None]:
# Names given to the experiment and to the run created below.
experiment_name = 'csv_libsvm_pipeline-experiment'
job_name = 'csv_libsvm_pipeline-job'

# Arguments for the pipeline, keyed by the parameter names of
# csv_libsvm_pipeline.
params = {
    'input_file': 'gs://kf-pipeline-contrib-public/release-0.1.1/kfp-components/data_converter/csv_libsvm/data/test.csv',
    # Placeholder: replace with a location you can write to before running.
    'output_file': 'gs://CHANGE-TO-WRITABLE-LOCATION',
    'label_name': 'label'
}

# Create the experiment, then submit the compiled package as a run in it.
client = kfp.Client()
exp = client.create_experiment(name=experiment_name)
run = client.run_pipeline(
    experiment_id=exp.id,
    job_name=job_name,
    pipeline_package_path='pipeline.tar.gz',
    params=params,
)