In [3]:
import glob

In [9]:
# Collect every file named part-* that sits exactly one directory below
# data/retail_db, then display the resulting list.
src_file_names = glob.glob('data/retail_db/*/part-*')
src_file_names


['data/retail_db\\categories\\part-00000',
 'data/retail_db\\customers\\part-00000',
 'data/retail_db\\departments\\part-00000',
 'data/retail_db\\orders\\part-00000',
 'data/retail_db\\order_items\\part-00000',
 'data/retail_db\\products\\part-00000']

In [10]:
import re

In [12]:
# The globbed paths mix '/' and '\' separators (see the listing above), so
# split on either one. The pattern is a raw string so the backslash escape is
# interpreted by the regex engine only, not by the Python string literal.
for file in src_file_names:
    file_path_list = re.split(r'[/\\]', file)
    print(file_path_list)

['data', 'retail_db', 'categories', 'part-00000']
['data', 'retail_db', 'customers', 'part-00000']
['data', 'retail_db', 'departments', 'part-00000']
['data', 'retail_db', 'orders', 'part-00000']
['data', 'retail_db', 'order_items', 'part-00000']
['data', 'retail_db', 'products', 'part-00000']


In [25]:
# Base directory under which the converted JSON files will be written.
tgt_base_dir = 'data/retail_db_json'

In [26]:

# Work with the first source file only while building up the path logic.
file = src_file_names[0]

In [19]:
# Split the path on either '/' or '\'; the raw string keeps the backslash
# escape valid for the regex engine without an invalid string escape.
file_path_list = re.split(r'[/\\]', file)

In [23]:
# The last path component is the part file's own name.
file_name = file_path_list[-1]
file_name

'part-00000'

In [24]:
# The second-to-last path component is the directory that holds the part
# file; it is used as the data set name.
ds_name = file_path_list[-2]
ds_name

'categories'

In [27]:
# Preview the target path: same data set directory and file name, but rooted
# at tgt_base_dir.
f'{tgt_base_dir}/{ds_name}/{file_name}'

'data/retail_db_json/categories/part-00000'

In [28]:
# Derive the target JSON path for every source file. The separator pattern is
# a raw string so that it contains no invalid string escape.
for file in src_file_names:
    file_path_list = re.split(r'[/\\]', file)
    ds_name = file_path_list[-2]
    file_name = file_path_list[-1]
    json_file_path = f'{tgt_base_dir}/{ds_name}/{file_name}'
    print(json_file_path)


data/retail_db_json/categories/part-00000
data/retail_db_json/customers/part-00000
data/retail_db_json/departments/part-00000
data/retail_db_json/orders/part-00000
data/retail_db_json/order_items/part-00000
data/retail_db_json/products/part-00000


#### Write pandas DataFrames to JSON Files

In [38]:
def get_column_names(schemas, ds_name, sorting_key='column_position'):
    """Return the column names of one data set in schema order.

    Args:
        schemas: Mapping from data set name to an iterable of column
            descriptions; each description is indexed by 'column_name' and by
            ``sorting_key``.
        ds_name: Key of the data set to look up in ``schemas``.
        sorting_key: Key of each column description used to order the columns.

    Returns:
        List of 'column_name' values, ordered ascending by ``sorting_key``.

    Raises:
        KeyError: If ``ds_name`` is not in ``schemas`` or a column description
            lacks one of the required keys.
    """
    ordered = list(schemas[ds_name])
    ordered.sort(key=lambda detail: detail[sorting_key])
    names = []
    for detail in ordered:
        names.append(detail['column_name'])
    return names

In [39]:
import json
# Load the schema definitions. Forward slashes keep the path usable on every
# platform (the other cells in this notebook use them too), and the context
# manager closes the file handle as soon as parsing is done.
with open('data/retail_db/schemas.json') as schemas_file:
    schemas = json.load(schemas_file)

In [40]:
# Collect the source part files (one directory level below data/retail_db)
# that the conversion loop will iterate over.
src_file_names = glob.glob('data/retail_db/*/part-*')

In [41]:
import pandas as pd
import os

In [44]:
# Convert every source part file to a JSON-lines file under tgt_base_dir,
# mirroring the <data set>/<file name> layout of the source tree.
tgt_base_dir = 'data/retail_db_json'
for file in src_file_names:
    print(f'processing {file}')
    # Split on either separator; the raw string avoids an invalid string escape.
    file_path_list = re.split(r'[/\\]', file)
    print(file_path_list)
    ds_name = file_path_list[-2]
    file_name = file_path_list[-1]
    tgt_dir = f'{tgt_base_dir}/{ds_name}'
    json_file_path = f'{tgt_dir}/{file_name}'
    # Column names come from the schema, ordered by column position.
    columns = get_column_names(schemas, ds_name)
    df = pd.read_csv(file, names=columns)
    # to_json does not create missing directories, so make sure it exists.
    os.makedirs(tgt_dir, exist_ok=True)
    df.to_json(
        json_file_path,
        orient='records',
        lines=True
    )

processing data/retail_db\categories\part-00000
['data', 'retail_db', 'categories', 'part-00000']
processing data/retail_db\customers\part-00000
['data', 'retail_db', 'customers', 'part-00000']
processing data/retail_db\departments\part-00000
['data', 'retail_db', 'departments', 'part-00000']
processing data/retail_db\orders\part-00000
['data', 'retail_db', 'orders', 'part-00000']
processing data/retail_db\order_items\part-00000
['data', 'retail_db', 'order_items', 'part-00000']
processing data/retail_db\products\part-00000
['data', 'retail_db', 'products', 'part-00000']


##### Wrapper to Process All Data Sets

In [45]:
import glob
import os
import json
import re
import pandas as pd

In [51]:
def get_column_names(schemas, ds_name, sorting_key='column_position'):
    """List the column names of ``ds_name`` ordered by ``sorting_key``.

    ``schemas[ds_name]`` is expected to be an iterable of column descriptions,
    each indexed by 'column_name' and by ``sorting_key``. A missing data set or
    a missing key raises KeyError.
    """
    def sort_value(column):
        return column[sorting_key]

    return [
        column['column_name']
        for column in sorted(schemas[ds_name], key=sort_value)
    ]

In [52]:
def read_csv(file, schemas):
    """Read one part file into a DataFrame whose columns come from the schema.

    The data set name is taken from the directory that directly contains
    ``file`` (the second-to-last path component).

    Args:
        file: Path to the CSV part file; '/' and '\\' are both accepted as
            separators.
        schemas: Mapping from data set name to column descriptions, in the
            form accepted by ``get_column_names``.

    Returns:
        pandas.DataFrame with the schema's column names applied.

    Raises:
        IndexError: If ``file`` has no parent directory component.
        KeyError: If the derived data set name is not in ``schemas``.
    """
    # Raw string: the regex engine sees an escaped backslash inside the
    # character class, and the literal contains no invalid string escape.
    file_path_list = re.split(r'[/\\]', file)
    ds_name = file_path_list[-2]
    columns = get_column_names(schemas, ds_name)
    return pd.read_csv(file, names=columns)

In [53]:
def to_json(df, tgt_base_dir, ds_name, file_name):
    """Write ``df`` as JSON lines to ``<tgt_base_dir>/<ds_name>/<file_name>``.

    The data set directory is created first if it does not exist yet. Each
    row is written as one JSON record per line. Returns None.
    """
    # Ensure the data set directory exists before pandas opens the file.
    os.makedirs(f'{tgt_base_dir}/{ds_name}', exist_ok=True)
    destination = f'{tgt_base_dir}/{ds_name}/{file_name}'
    df.to_json(destination, orient='records', lines=True)

In [55]:
def file_converter(src_base_dir, tgt_base_dir, ds_name):
    """Convert every part file of one data set from CSV to JSON lines.

    Args:
        src_base_dir: Directory containing ``schemas.json`` and one
            sub-directory per data set.
        tgt_base_dir: Base directory passed to ``to_json`` for the output.
        ds_name: Name of the data set (sub-directory of ``src_base_dir``)
            whose ``part-*`` files are converted.

    Raises:
        FileNotFoundError: If ``<src_base_dir>/schemas.json`` does not exist.
    """
    # Context manager so the schema file handle is closed after parsing.
    with open(f'{src_base_dir}/schemas.json') as schemas_file:
        schemas = json.load(schemas_file)
    files = glob.glob(f'{src_base_dir}/{ds_name}/part-*')

    for file in files:
        df = read_csv(file, schemas)
        # Keep only the file name; split on either path separator (raw string
        # so the literal contains no invalid escape sequence).
        file_name = re.split(r'[/\\]', file)[-1]
        to_json(df, tgt_base_dir, ds_name, file_name)


In [56]:
def process_files(ds_names=None, src_base_dir='data/retail_db', tgt_base_dir='data/retail_db_json'):
    """Convert the requested data sets from CSV part files to JSON lines.

    Args:
        ds_names: Iterable of data set names to convert. When omitted or
            empty, every data set listed in ``schemas.json`` is converted.
        src_base_dir: Directory containing ``schemas.json`` and the source
            data set sub-directories.
        tgt_base_dir: Base directory that receives the converted files.

    Raises:
        FileNotFoundError: If ``<src_base_dir>/schemas.json`` does not exist.
    """
    # Context manager so the schema file handle is closed after parsing.
    with open(f'{src_base_dir}/schemas.json') as schemas_file:
        schemas = json.load(schemas_file)
    if not ds_names:
        ds_names = schemas.keys()
    for ds_name in ds_names:
        print(f'processing {ds_name}')
        file_converter(src_base_dir, tgt_base_dir, ds_name)

In [59]:
# Convert every data set listed in schemas.json. To limit the run, pass a
# list of data set names instead, e.g. ['orders'].
process_files()


processing departments
processing categories
processing orders
processing products
processing customers
processing order_items


In [67]:
# Reload the schema definitions and show which data sets they describe. The
# context manager closes the file handle once parsing is done.
with open('data/retail_db/schemas.json') as schemas_file:
    schemas = json.load(schemas_file)
schemas.keys()

dict_keys(['departments', 'categories', 'orders', 'products', 'customers', 'order_items'])