In [33]:
import json

import pandas as pd
from google.cloud import storage
from google.cloud.storage.blob import Blob

# Cloud Storage location of the project's data: bucket name and the
# object-name prefix under which all files of this project live.
# NOTE: the "TEST schema" cell further down rebinds BUCKET_NAME to a different
# bucket, so cells re-run after it will target that bucket instead.
BUCKET_NAME = 'data-and-predictions'
PROJECT_PREFIX = 'fish-species-classification'

# Single client instance shared by every cell that talks to Cloud Storage.
storage_client = storage.Client()

In [3]:
# TRAIN schema
# ###########

# List every object stored under the project's training prefix.
blobs = storage_client.list_blobs(
    BUCKET_NAME,
    prefix=f'{PROJECT_PREFIX}/train',
)

# Build one gs:// URI per listed object. Object names ending in '/' are
# "folder" placeholders rather than images; they are skipped because they
# would otherwise show up as bogus rows (with a wrong label) in the schema.
img_paths = [
    f'gs://{BUCKET_NAME}/{blob.name}'
    for blob in blobs
    if not blob.name.endswith('/')
]

schema = pd.DataFrame(img_paths, columns=['img_path'])
# Extract label only once data is in DataFrame so it can be vectorized.
# The label is the second-to-last path component, i.e. the name of the
# directory that directly contains the image.
schema = schema.assign(
    label=schema.img_path
        .str.split('/')
        .str.get(-2)
)

schema.head()

Unnamed: 0,img_path,label
0,gs://data-and-predictions/fish-species-classif...,ALB
1,gs://data-and-predictions/fish-species-classif...,ALB
2,gs://data-and-predictions/fish-species-classif...,ALB
3,gs://data-and-predictions/fish-species-classif...,ALB
4,gs://data-and-predictions/fish-species-classif...,ALB


In [None]:
# Serialize the schema DataFrame as header-less, index-less CSV and upload it
# to <bucket>/<project prefix>/schema.csv.
train_schema_uri = f'gs://{BUCKET_NAME}/{PROJECT_PREFIX}/schema.csv'
train_schema_csv = schema.to_csv(index=False, header=False)

train_schema_blob = Blob.from_string(
    uri=train_schema_uri,
    client=storage_client,
)
train_schema_blob.upload_from_string(
    data=train_schema_csv,
    content_type='text/csv',
)

In [72]:
# blob.download_as_string()

In [40]:
# TEST schema
# ###########

# NOTE: this rebinds the notebook-wide BUCKET_NAME. The test images live in a
# different bucket, and the upload cell that follows relies on this new value.
BUCKET_NAME = 'data-and-predictions-usc1'  # Different bucket
blobs = storage_client.list_blobs(
    BUCKET_NAME,
    prefix=f'{PROJECT_PREFIX}/test',
)

# For predictions, we need to use JSON-line format: one JSON object per image.
# json.dumps is used instead of hand-built strings so that object names
# containing quotes or backslashes still produce valid JSON; for ordinary
# names the output is identical to the hand-built form.
# Object names ending in '/' are "folder" placeholders, not images, and are
# skipped so they do not become prediction inputs.
schema = [
    json.dumps({
        'content': f'gs://{BUCKET_NAME}/{blob.name}',
        'mimeType': 'image/jpeg',
    })
    for blob in blobs
    if not blob.name.endswith('/')
]

# One JSON document per line; preview only the first 500 characters.
schema_json_lines = '\n'.join(schema)
schema_json_lines[:500]

'{"content": "gs://data-and-predictions-usc1/fish-species-classification/test/img_00005.jpg", "mimeType": "image/jpeg"}\n{"content": "gs://data-and-predictions-usc1/fish-species-classification/test/img_00007.jpg", "mimeType": "image/jpeg"}\n{"content": "gs://data-and-predictions-usc1/fish-species-classification/test/img_00009.jpg", "mimeType": "image/jpeg"}\n{"content": "gs://data-and-predictions-usc1/fish-species-classification/test/img_00018.jpg", "mimeType": "image/jpeg"}\n{"content": "gs://data-a'

In [42]:
# Upload the JSON-lines text to <bucket>/<project prefix>/schema-test.jsonl.
# BUCKET_NAME is read when this cell runs, so the "TEST schema" cell (which
# sets it to the test bucket and builds schema_json_lines) must run first.
test_schema_uri = f'gs://{BUCKET_NAME}/{PROJECT_PREFIX}/schema-test.jsonl'

test_schema_blob = Blob.from_string(
    uri=test_schema_uri,
    client=storage_client,
)
# No content_type is passed, so the library default applies; the explicit
# value below was left disabled by the author.
test_schema_blob.upload_from_string(
    data=schema_json_lines,
    # content_type='text/json',
)