# Managing files in GCS from within a component

This notebook demonstrates two approaches to reading a file stored in Google Cloud Storage (GCS) from within a Kubeflow Pipelines component:
1. Using `gsutil` to copy the file to a local temporary file and then opening that copy.
2. Using `tf.gfile` to directly open the file.

In [None]:
!pip3 install pip --upgrade

In [None]:
import kfp.components as components

def read_and_print_gsutil(filename: str) -> str:
    """Copy a file to a local temporary file with ``gsutil`` and print it.

    The file is copied with ``gsutil cp`` and the local copy is then printed
    line by line, with trailing whitespace stripped from every line.

    Args:
        filename: Source path handed to ``gsutil cp`` (e.g. a ``gs://`` URI).

    Raises:
        subprocess.CalledProcessError: If ``gsutil cp`` exits with a non-zero
            status.

    NOTE(review): the signature is annotated ``-> str`` but the function does
    not return a value. The annotation is left untouched because this
    function is passed to ``func_to_container_op`` -- confirm what the
    component output is meant to be.
    """
    # Imports stay inside the function, as in the original.
    import subprocess
    import tempfile

    # The context manager closes (and thereby deletes) the temporary file
    # even when the copy or the read fails.
    with tempfile.NamedTemporaryFile(delete=True) as tmp:
        # check=True surfaces a failed copy instead of silently printing an
        # empty temporary file.
        subprocess.run(['gsutil', 'cp', filename, tmp.name], check=True)
        with open(tmp.name) as f:
            print("Printing file: %s" % f.name)
            for line in f:
                print(line.rstrip())

# Wrap the gsutil-based reader as a container op; the Cloud SDK image is
# used as the base image (presumably because it ships `gsutil`).
read_and_print_gsutil_op = components.func_to_container_op(
    func=read_and_print_gsutil,
    base_image='google/cloud-sdk:latest',
)

In [None]:
def read_and_print_tf_gfile(filename: str) -> str:
    """Open a file through ``tf.gfile.GFile`` and print it line by line.

    Args:
        filename: Path passed to ``tf.gfile.GFile`` (e.g. a ``gs://`` URI).

    Trailing whitespace is stripped from every printed line.

    NOTE(review): annotated ``-> str`` although nothing is returned --
    confirm the intended component output.
    """
    import tensorflow as tf

    with tf.gfile.GFile(filename) as gcs_file:
        header = "Printing file: %s" % gcs_file.name
        print(header)
        for raw_line in gcs_file:
            stripped = raw_line.rstrip()
            print(stripped)

# Wrap the tf.gfile-based reader as a container op on a TensorFlow base
# image, so that `import tensorflow` inside the function can succeed.
read_and_print_tf_gfile_op = components.func_to_container_op(
    func=read_and_print_tf_gfile,
    base_image='tensorflow/tensorflow:1.12.0-py3',
)

In [None]:
import kfp.dsl as dsl
# Pipeline with two steps that each read and print the same file: one via
# the gsutil-based op and one via the tf.gfile-based op.
@dsl.pipeline(
    description='A toy pipeline that performs read and print from GCS.',
    name='Read-and-print pipeline',
)
def pipeline(filename):
    # The op return values are not needed, so they are not bound to a name.
    read_and_print_gsutil_op(filename)
    read_and_print_tf_gfile_op(filename)

# Compile the pipeline function into a package file named after the function.
import kfp.compiler as compiler

pipeline_func = pipeline
pipeline_filename = pipeline_func.__name__ + '.pipeline.tar.gz'

compiler.Compiler().compile(
    pipeline_func=pipeline_func,
    package_path=pipeline_filename,
)

In [None]:
from kfp_experiment.rest import ApiException

# Get or create an experiment, then submit the compiled pipeline as a run.
import kfp

EXPERIMENT_NAME = 'Read File in GCS'
FILENAME = 'gs://ml-pipeline-playground/tfx/taxi-cab-classification/train.csv'

client = kfp.Client()

# Try to create the experiment first; if the API rejects the call
# (presumably because the experiment already exists), look it up by name.
try:
    experiment = client.create_experiment(EXPERIMENT_NAME)
except ApiException:
    experiment = client.get_experiment(experiment_name=EXPERIMENT_NAME)

run_name = pipeline_func.__name__ + ' Run'
run_result = client.run_pipeline(
    experiment_id=experiment.id,
    job_name=run_name,
    pipeline_package_path=pipeline_filename,
    params={'filename': FILENAME},
)