In [56]:
# pip install fsspec gcsfs pandas pyarrow fs-gcsfs

In [241]:
import ast

import fsspec

def tree(fs, path, prefix=""):
    """Print the contents of ``path`` on filesystem ``fs`` as an indented tree.

    The root ``path`` is printed once on its own line; every entry below it is
    printed with box-drawing connectors, and directories are descended into
    recursively.

    Args:
        fs: Object exposing ``ls(path, detail=True)`` that returns a list of
            dicts with at least the keys ``'name'`` and ``'type'``.
        path: Directory to list.
        prefix: Text placed in front of every entry line of this listing
            (used to indent nested levels).
    """
    print(path)
    _print_tree_entries(fs, path, prefix)


def _print_tree_entries(fs, path, prefix):
    """Print the entries of ``path`` (without the root line), recursing into directories."""
    entries = fs.ls(path, detail=True)
    last_index = len(entries) - 1
    for index, entry in enumerate(entries):
        is_last = index == last_index
        connector = '└── ' if is_last else '├── '
        # Strip a trailing slash first so a name such as 'bucket/dir/' still
        # yields 'dir' instead of an empty label.
        label = entry['name'].rstrip('/').split('/')[-1]
        print(f"{prefix}{connector}{label}")
        if entry['type'] == 'directory':
            # Below the last entry nothing continues, so indent with spaces;
            # otherwise keep the vertical guide line going.
            child_prefix = prefix + ('    ' if is_last else '│   ')
            _print_tree_entries(fs, entry['name'], child_prefix)

# Location of the token file passed as `token=` by the (currently commented-out) GCS example.
# A plain literal is enough: os.path.join() with a single argument returns it unchanged,
# and `os` is not imported anywhere in this notebook.
gcs_token_path = '../_env/dsi310-2023-teacher.json'

In [242]:
# Filesystem handle for the local disk (fsspec 'file' protocol, auto_mkdir enabled)
# and the folder that acts as the data catalog.
fs = fsspec.filesystem('file', auto_mkdir=True)
catalog_path = './catalog/'

# Drop a minimal readme into the catalog, then list what the catalog contains.
with fs.open(catalog_path + 'readme.md', 'wb') as f:
    f.write(b'# Hello')

tree(fs, catalog_path)

./catalog/
└── readme.md


In [243]:
# catalog_path = 'gcs://dsi310_bucket/'
# fs=fsspec.filesystem('gcs', token=gcs_token_path)
# with fs.open(catalog_path+'readme.md','wb') as f:
#     f.write(b'# Hello')
# tree(fs,catalog_path)

In [244]:
import pandas as pd
import fsspec
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime

# Five days of toy sales records, one row per product.
df = pd.DataFrame(
    {
        'date': pd.date_range(start='2023-01-01', periods=5, freq='D'),
        'product_id': range(1, 6),
        'quantity': [5, 3, 6, 2, 7],
        'price': [20.5, 10.0, 15.5, 25.0, 30.0],
    }
)

# Dataset-level provenance, plus a one-line description of every column.
metadata = dict(source='Sales System', creation_date=datetime.now().isoformat())
data_dictionary = dict(
    date='Transaction date',
    product_id='Product identifier',
    quantity='Quantity sold',
    price='Sale price',
)

# Both dicts are attached to the Arrow schema in their str() form, under the
# keys 'metadata' and 'dictionary'.
table = pa.Table.from_pandas(df).replace_schema_metadata(
    {'metadata': str(metadata), 'dictionary': str(data_dictionary)}
)

table.schema

date: timestamp[ns]
product_id: int64
quantity: int64
price: double
-- schema metadata --
metadata: '{'source': 'Sales System', 'creation_date': '2023-11-23T02:24:' + 11
dictionary: '{'date': 'Transaction date', 'product_id': 'Product identifi' + 56

In [245]:
# from fsspec.implementations.local import LocalFileSystem
# fs = LocalFileSystem()

In [246]:
# Store the Arrow table as one Parquet file inside the catalog folder, writing
# through the fsspec handle `fs`, then list the catalog to confirm the result.
with fs.open(catalog_path + 'sale.parquet', 'wb') as f:
    pq.write_table(table, f)

tree(fs, catalog_path)

./catalog/
├── sale.parquet
└── readme.md


In [247]:
import pandas as pd
import fsspec
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime

# Same toy sales records as before, but with the timestamp column reduced to
# plain calendar dates.
df = pd.DataFrame(
    {
        'date': pd.date_range(start='2023-01-01', periods=5, freq='D'),
        'product_id': range(1, 6),
        'quantity': [5, 3, 6, 2, 7],
        'price': [20.5, 10.0, 15.5, 25.0, 30.0],
    }
).assign(date=lambda frame: frame['date'].dt.date)

# Dataset-level provenance, plus a one-line description of every column.
metadata = dict(source='Sales System', creation_date=datetime.now().isoformat())
data_dictionary = dict(
    date='Transaction date',
    product_id='Product identifier',
    quantity='Quantity sold',
    price='Sale price',
)

# Both dicts are attached to the Arrow schema in their str() form, under the
# keys 'metadata' and 'dictionary'.
table = pa.Table.from_pandas(df).replace_schema_metadata(
    {'metadata': str(metadata), 'dictionary': str(data_dictionary)}
)



In [248]:
# Root folder of the partitioned dataset: the catalog location followed by the
# dataset name. The path string is used directly; no file handle is opened here.
dataset_name = 'sale'
path = f"{catalog_path}{dataset_name}"
path

'./catalog/sale'

In [249]:
# Write the table under `path`, split into one folder per distinct 'date' value.
# existing_data_behavior='delete_matching' empties each partition folder that is
# about to be written, so re-running this cell replaces the files instead of
# adding a second copy of every row under a new file name.
pq.write_to_dataset(
    table,
    root_path=path,
    partition_cols=['date'],
    filesystem=fs,
    existing_data_behavior='delete_matching',
)

tree(fs, catalog_path)

./catalog/
├── sale.parquet
├── sale
│   ├── date=2023-01-03
│   │   └── a7baa69219054e50bf4679a052093531-0.parquet
│   ├── date=2023-01-02
│   │   └── a7baa69219054e50bf4679a052093531-0.parquet
│   ├── date=2023-01-04
│   │   └── a7baa69219054e50bf4679a052093531-0.parquet
│   ├── date=2023-01-05
│   │   └── a7baa69219054e50bf4679a052093531-0.parquet
│   └── date=2023-01-01
│       └── a7baa69219054e50bf4679a052093531-0.parquet
└── readme.md


In [250]:

import pyarrow.dataset as ds
# dataset = ds.dataset('/dsi310_bucket/sale/', format="parquet", filesystem=fs)


# Open the 'sale' folder of the catalog as a Parquet dataset and load it into
# a single Arrow table.
dataset = pq.ParquetDataset(catalog_path + 'sale', filesystem=fs)
table = dataset.read()

# Inspect the resulting schema, including the schema-level metadata.
table.schema

product_id: int64
quantity: int64
price: double
date: dictionary<values=string, indices=int32, ordered=0>
-- schema metadata --
metadata: '{'source': 'Sales System', 'creation_date': '2023-11-23T02:24:' + 11
dictionary: '{'date': 'Transaction date', 'product_id': 'Product identifi' + 56

In [251]:
# Recover the dataset metadata and the data dictionary stored on the Arrow schema.
# schema.metadata is None when a schema carries no metadata at all, so fall back
# to an empty mapping rather than failing on the membership tests below.
schema_metadata = table.schema.metadata or {}

# The values were written with str(dict). ast.literal_eval parses such a literal
# without executing code, whereas eval() would run any expression found in the file.
# literal_eval needs text, hence the explicit decode of the stored bytes.
metadata = (
    ast.literal_eval(schema_metadata[b'metadata'].decode('utf-8'))
    if b'metadata' in schema_metadata else None
)
data_dictionary = (
    ast.literal_eval(schema_metadata[b'dictionary'].decode('utf-8'))
    if b'dictionary' in schema_metadata else None
)

# Show the recovered metadata and data dictionary.
print("Metadata:", metadata)
print("Data Dictionary:", data_dictionary)

Metadata: {'source': 'Sales System', 'creation_date': '2023-11-23T02:24:06.543639'}
Data Dictionary: {'date': 'Transaction date', 'product_id': 'Product identifier', 'quantity': 'Quantity sold', 'price': 'Sale price'}


In [252]:
# Turn the Arrow table into a pandas DataFrame and print its first five rows.
df = table.to_pandas()
print(df.head(n=5))

   product_id  quantity  price        date
0           1         5   20.5  2023-01-01
1           2         3   10.0  2023-01-02
2           3         6   15.5  2023-01-03
3           4         2   25.0  2023-01-04
4           5         7   30.0  2023-01-05
