In [76]:
# STEP 1: Install required libraries (if not installed already)
!pip install boto3 pandas sentence-transformers



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [94]:
# Imported in this cell so it runs on a fresh kernel: the decorator and the
# OutputPath annotation below are evaluated as soon as the cell executes.
from kfp.dsl import component, OutputPath


@component(
    base_image="python:3.9",
    packages_to_install=["boto3"]
)
def fetch_from_minio(
    bucket_name: str,
    file_key: str,
    minio_endpoint: str,
    minio_access_key: str,
    minio_secret_key: str,
    output_file: OutputPath()
):
    """Download a single object from an S3-compatible endpoint to a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        file_key: Key of the object inside the bucket.
        minio_endpoint: URL passed to boto3 as ``endpoint_url``.
        minio_access_key: Access key passed to boto3 as ``aws_access_key_id``.
        minio_secret_key: Secret key passed to boto3 as
            ``aws_secret_access_key``.
        output_file: Local path the object is written to.

    Raises:
        Any exception raised by boto3 while creating the client or
        downloading the object is propagated unchanged.
    """
    # Imports stay inside the function body, as in the original component.
    import boto3
    import os

    s3 = boto3.client(
        "s3",
        endpoint_url=minio_endpoint,
        aws_access_key_id=minio_access_key,
        aws_secret_access_key=minio_secret_key
    )

    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    target_dir = os.path.dirname(output_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    s3.download_file(bucket_name, file_key, output_file)
    print(f"File downloaded to: {output_file}")


In [95]:
from kfp.dsl import component, InputPath, OutputPath

@component(
    base_image="python:3.9",
    packages_to_install=["pandas", "sentence-transformers"]
)
def chunk_and_embed(
    input_file: InputPath(),
    embeddings_output: OutputPath()
):
    """Embed every row of a CSV file and save the result as JSON.

    Each CSV row is turned into one text by casting all of its cells to
    ``str`` and joining them with a single space; no further splitting of the
    text is done here. The texts are encoded with the ``all-MiniLM-L6-v2``
    SentenceTransformer model, and a JSON list of
    ``{"text": ..., "embedding": [...]}`` objects is written out.

    Args:
        input_file: Path of the CSV file to read.
        embeddings_output: Path the JSON list is written to.

    Raises:
        ValueError: If the parsed DataFrame is empty.
    """
    import pandas as pd
    from sentence_transformers import SentenceTransformer
    import json
    import os

    print("Reading CSV from:", input_file)
    df = pd.read_csv(input_file)

    if df.empty:
        raise ValueError("CSV is empty!")

    # One text per row. Every cell is stringified first, so missing values are
    # included in the text as well (pandas renders NaN as "nan").
    texts = df.astype(str).agg(" ".join, axis=1).tolist()

    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(texts)

    # .tolist() converts each embedding into plain Python numbers that the
    # json module can serialize.
    result = [
        {"text": txt, "embedding": emb.tolist()}
        for txt, emb in zip(texts, embeddings)
    ]

    # Create the parent directory before writing, mirroring fetch_from_minio.
    # dirname is "" for a bare file name, which os.makedirs would reject.
    output_dir = os.path.dirname(embeddings_output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(embeddings_output, "w") as f:
        json.dump(result, f)

    print(f"Saved {len(result)} embeddings to {embeddings_output}")


In [96]:
from kfp.dsl import pipeline
from kfp.v2 import compiler

@pipeline(name="pipeline-fetch-chunk-embed")
def full_pipeline(
    bucket_name: str = "llama",
    file_key: str = "austinHousingData.csv",
    minio_endpoint: str = "https://minio-api-minio.apps.ai-dev02.kni.syseng.devcluster.openshift.com",
    minio_access_key: str = "minio",
    minio_secret_key: str = "minio123",
):
    """Fetch a CSV object from MinIO, then embed its rows.

    All arguments are forwarded unchanged to ``fetch_from_minio``. The
    defaults reproduce the values that used to be hard-coded in the body, so
    compiling and running without arguments behaves as before, while a run
    can now override any of them.

    Args:
        bucket_name: Bucket that holds the CSV object.
        file_key: Key of the CSV object inside the bucket.
        minio_endpoint: URL of the S3-compatible endpoint.
        minio_access_key: Access key for the endpoint.
        minio_secret_key: Secret key for the endpoint.
    """
    # NOTE(review): the credential defaults are still plain text and end up in
    # the compiled pipeline package. Override them per run, or move them to a
    # secret store, before sharing this notebook or the generated YAML.
    fetch_step = fetch_from_minio(
        bucket_name=bucket_name,
        file_key=file_key,
        minio_endpoint=minio_endpoint,
        minio_access_key=minio_access_key,
        minio_secret_key=minio_secret_key
    )

    # The downloaded file is handed to the embedding step as its input.
    chunk_and_embed(
        input_file=fetch_step.outputs["output_file"]
    )


In [97]:
# Compile the pipeline definition into a YAML package file.
compiler.Compiler().compile(
    package_path="fetch_chunk_embed_pipeline.yaml",
    pipeline_func=full_pipeline,
)
