In [1]:
!python3 -m pip install --upgrade pymilvus minio

Collecting pymilvus
  Obtaining dependency information for pymilvus from https://files.pythonhosted.org/packages/5a/c8/fed45f4b9473c07f36647ec83b9e60df241d61c419c34ebc785ee4b57cd7/pymilvus-2.3.5-py3-none-any.whl.metadata
  Downloading pymilvus-2.3.5-py3-none-any.whl.metadata (4.4 kB)
Collecting minio
  Obtaining dependency information for minio from https://files.pythonhosted.org/packages/8a/78/4b0fb944cb3f71e6637d8e716593683ea1b41ca5c75ed6a98699d7e31381/minio-7.2.3-py3-none-any.whl.metadata
  Using cached minio-7.2.3-py3-none-any.whl.metadata (6.4 kB)
Collecting grpcio<=1.58.0,>=1.49.1 (from pymilvus)
  Obtaining dependency information for grpcio<=1.58.0,>=1.49.1 from https://files.pythonhosted.org/packages/a1/9c/ef89aae6948949a891a50e19bb951aac2f7ceb9561fdfdcd07c9b890ed6c/grpcio-1.58.0-cp311-cp311-macosx_10_10_universal2.whl.metadata
  Using cached grpcio-1.58.0-cp311-cp311-macosx_10_10_universal2.whl.metadata (4.0 kB)
Collecting argon2-cffi (from minio)
  Obtaining dependency inform

In [1]:
# Configs for Zilliz Cloud cluster
# Fill in every empty value before running the cells below.
CLUSTER_ENDPOINT=""   # passed as `uri` to connections.connect
API_KEY=""            # passed as `api_key` to the bulk-import REST helpers
TOKEN=""              # passed as `token` to connections.connect
CLUSTER_ID=""
CLOUD_REGION=""
# This value is handed directly to bulk_import / get_import_progress /
# list_import_jobs as `url`, so it has to be a complete URL including the
# scheme, not just a host name.
CLOUD_API_ENDPOINT="https://controller.api.{0}.zillizcloud.com".format(CLOUD_REGION)
COLLECTION_NAME=""

# Configs for remote bucket
ACCESS_KEY=""
SECRET_KEY=""
BUCKET_NAME=""

In [2]:
!curl https://assets.zilliz.com/doc-assets/medium_articles_partial_a13e0f2a.csv \
        --output medium_articles_partial.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5133k  100 5133k    0     0   436k      0  0:00:11  0:00:11 --:--:--  648k   0      0      0 --:--:--  0:00:01 --:--:--     0


In [3]:
from pymilvus import FieldSchema, CollectionSchema, DataType

# Fields of the target collection: a caller-supplied INT64 primary key, a
# 768-dimensional float vector, and two VARCHAR fields capped at 512 characters.
article_fields = [
    FieldSchema(name='id', dtype=DataType.INT64, is_primary=True),
    FieldSchema(name='title_vector', dtype=DataType.FLOAT_VECTOR, dim=768),
    FieldSchema(name='title', dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name='link', dtype=DataType.VARCHAR, max_length=512),
]

# auto_id is off, so every row must carry its own `id`; the dynamic field is
# enabled on the schema.
schema = CollectionSchema(
    fields=article_fields,
    description="A series of articles from medium.com",
    auto_id=False,
    enable_dynamic_field=True,
)

In [4]:
from pymilvus import connections, Collection

# Step 1: connect to the cluster with the endpoint and token from the config cell.
connections.connect(uri=CLUSTER_ENDPOINT, token=TOKEN)

# Step 2: create the collection from the schema defined above.
collection = Collection(name=COLLECTION_NAME, schema=schema)

# Step 3: describe the index for the vector field (AUTOINDEX, inner product).
index_params = dict(index_type="AUTOINDEX", metric_type="IP", params={})

# Step 4: build the index on the vector field.
collection.create_index(field_name="title_vector", index_params=index_params)

# Step 5: load the collection.
collection.load()

In [5]:
from pymilvus import RemoteBulkWriter, BulkFileType

# How to reach the remote bucket that receives the prepared data.
# The bucket should be hosted in the same cloud as the target cluster.
conn = RemoteBulkWriter.ConnectParam(
    bucket_name=BUCKET_NAME,
    access_key=ACCESS_KEY,
    secret_key=SECRET_KEY,
    endpoint="storage.googleapis.com",  # Use "s3.amazonaws.com" for AWS S3
    secure=True,
)

In [6]:
# Writer that turns appended rows into files in the remote bucket:
#   schema        - schema of the target collection
#   remote_path   - output directory, relative to the bucket root
#   segment_size  - maximum segment size (512 MiB) when segmenting the raw data
#   connect_param - bucket connection parameters defined above
#   file_type     - type of the generated files
writer = RemoteBulkWriter(
    schema=schema,
    remote_path="/",
    segment_size=512 * 1024 ** 2,
    connect_param=conn,
    file_type=BulkFileType.JSON_RB,
)

In [7]:
import pandas as pd

# Point this at the actual location of the dataset if it was saved elsewhere.
df = pd.read_csv("medium_articles_partial.csv")

# Hand the rows to the writer one at a time. The CSV stores each vector as a
# bracketed, comma-separated string, so it is converted to a list of floats
# before the row is appended.
for position in range(df.shape[0]):
    row = df.iloc[position].to_dict()
    components = row["title_vector"][1:-1].split(",")
    row["title_vector"] = list(map(float, components))
    writer.append_row(row)

In [8]:
# Commit the rows appended above; the following cell prints the path the
# writer produced for them.
writer.commit()

In [9]:
# Show the writer's output path. It is combined with BUCKET_NAME further down
# to build the object URL passed to bulk_import.
print(writer.data_path)

/54859c6c-145d-444d-a596-48a7ee940e00


In [10]:
from pymilvus import bulk_import

# Publicly accessible URL for the prepared data in the remote bucket.
# The first character of data_path is dropped before it is joined to the
# bucket name. Change `gs` to `s3` for AWS S3.
object_url = f"gs://{BUCKET_NAME}/{str(writer.data_path)[1:]}/"

# Start bulk-import: the first group of arguments identifies the Zilliz Cloud
# target, the second grants access to the bucket.
res = bulk_import(
    # highlight-next-line
    url=CLOUD_API_ENDPOINT,
    api_key=API_KEY,
    cluster_id=CLUSTER_ID,
    collection_name=COLLECTION_NAME,
    object_url=object_url,
    access_key=ACCESS_KEY,
    secret_key=SECRET_KEY,
)

print(res.json())

{'code': 200, 'data': {'jobId': '61c0d0ae-46f8-4eef-a182-8d4e6b52144d'}}


In [11]:
import time
from pymilvus import get_import_progress

job_id = res.json()['data']['jobId']

POLL_INTERVAL_SECONDS = 5
# Upper bound on how long to wait, so a job that never reaches 100% (for
# example one that failed) cannot keep this cell polling forever.
MAX_WAIT_SECONDS = 60 * 60


def fetch_import_progress():
    """Query the cloud API once for the current state of import job `job_id`.

    Returns the HTTP response object produced by get_import_progress.
    """
    return get_import_progress(
        # highlight-next-line
        url=CLOUD_API_ENDPOINT,
        api_key=API_KEY,
        job_id=job_id,
        cluster_id=CLUSTER_ID
    )


deadline = time.monotonic() + MAX_WAIT_SECONDS

# check the bulk-import progress
while True:
    res = fetch_import_progress()
    progress = res.json()["data"]
    print(progress["readyPercentage"])

    if progress["readyPercentage"] >= 1:
        break

    # NOTE(review): assumes a failed job reports a non-empty `errorMessage`
    # in the progress payload -- confirm against the import-progress API.
    if progress.get("errorMessage"):
        raise RuntimeError(
            "Import job {0} failed: {1}".format(job_id, progress["errorMessage"])
        )

    if time.monotonic() >= deadline:
        raise TimeoutError(
            "Import job {0} did not finish within {1} seconds".format(
                job_id, MAX_WAIT_SECONDS
            )
        )

    time.sleep(POLL_INTERVAL_SECONDS)

# 0.01   -- import progress 1%
# 0.5    -- import progress 50%
# 0.5
# 1      -- finished

0.01
0.5
0.5
1


In [13]:
from pymilvus import list_import_jobs

# Request the first page of this cluster's import jobs, ten entries per page.
res = list_import_jobs(
    # highlight-next-line
    url=CLOUD_API_ENDPOINT,
    api_key=API_KEY,
    cluster_id=CLUSTER_ID,
    current_page=1,
    page_size=10,
)

print(res.json())

{'code': 200, 'data': {'tasks': [{'collectionName': 'medium_articles', 'jobId': '61c0d0ae-46f8-4eef-a182-8d4e6b52144d', 'state': 'ImportCompleted'}, {'collectionName': 'medium_articles', 'jobId': 'aec225ca-c8c3-48fe-9605-8507cd0ea3af', 'state': 'ImportFailed'}], 'count': 2, 'currentPage': 1, 'pageSize': 10}}
