# Preparing the Dataset for Modeling

In [1]:
import os
import numpy as np
import pandas as pd
from google.cloud import storage

In [2]:
# Create the Cloud Storage client used by the rest of the notebook.
# No arguments are passed, so the library's default project/credential
# lookup applies.
storage_client = storage.Client()

In [42]:
# Name of the GCS bucket that holds the dataset; replace with your own bucket.
bucket_name = 'ekabasandbox-vcm'

In [52]:
# Look up the bucket by name through the client. The returned bucket object
# is reused later for listing the dataset objects and for uploading the CSV.
bucket = storage_client.get_bucket(bucket_name)

In [53]:
# List the objects whose names start with the train split's path prefix.
blobs = bucket.list_blobs(prefix='chest_xray/chest_xray/train/')

In [54]:
# Collect the name of every listed object into a plain list of strings.
blob_list = [item.name for item in blobs]

In [55]:
# Number of object names collected (this count still includes the
# .DS_Store entries, which are filtered out further down).
len(blob_list)

5219

In [56]:
# Peek at the first two object names to see what the listing contains.
print(blob_list[0])
print(blob_list[1])

chest_xray/chest_xray/train/.DS_Store
chest_xray/chest_xray/train/NORMAL/.DS_Store


In [61]:
# Build one [gcs_uri, label] row per image in the train split.
#
# .DS_Store entries are macOS Finder metadata that were uploaded along with
# the dataset; they are not images, so they are skipped here. Filtering in
# Python is a single O(n) pass, which is fine because the list size is
# manageable. For larger datasets, it may be better to simply delete those
# objects from Google Cloud Storage.

# Derive the URI prefix from bucket_name (instead of repeating the bucket as a
# literal) so that changing the bucket in one place keeps the paths correct.
bucket_prefix = 'gs://{}/'.format(bucket_name)

# The prefix is prepended to each object name to form the full gs:// URI.
# Object names look like chest_xray/chest_xray/train/<LABEL>/<file>, so the
# path component at index 3 is the class label.
data = [
    [bucket_prefix + name, name.split('/')[3]]
    for name in blob_list
    if '.DS_Store' not in name
]

In [62]:
# Sanity-check the first two [gcs_uri, label] rows.
print(data[0])
print(data[1])

['gs://ekabasandbox-vcm/chest_xray/chest_xray/train/NORMAL/IM-0115-0001.jpeg', 'NORMAL']
['gs://ekabasandbox-vcm/chest_xray/chest_xray/train/NORMAL/IM-0117-0001.jpeg', 'NORMAL']


In [63]:
# Convert the [gcs_uri, label] rows to a pandas DataFrame.
# The list of rows is handed straight to the constructor; routing it through
# np.array() first only adds an extra copy of the data.
data_pd = pd.DataFrame(data)

In [64]:
# Write the rows to a local CSV with no header row and no index column, so
# each line is just "<gcs_uri>,<label>". header/index are boolean options;
# pass False explicitly rather than relying on None being falsy.
data_pd.to_csv("data.csv", header=False, index=False)

In [65]:
# Create the blob object for the destination name 'data.csv' in the bucket.
# The actual upload is performed by the upload_from_filename call that follows.
output_blob = bucket.blob('data.csv')

In [66]:
# Upload the local data.csv file written earlier to the destination blob.
output_blob.upload_from_filename('data.csv')

# Finish