In [31]:
import os
import glob
import json
import pandas as pd

In [32]:
def get_column_names(schemas, ds_name, sorting_key='column_position'):
    column_details = schemas[ds_name]
    columns = sorted(column_details, key=lambda col: col[sorting_key])
    return [col['column_name'] for col in columns]

In [33]:
def read_csv(file, schemas):
    ds_name = os.path.basename(os.path.dirname(file))
    columns = get_column_names(schemas, ds_name)
    df = pd.read_csv(file, names=columns)
    return df

In [34]:
def to_json(df, tgt_base_dir, ds_name, file_name):
    output_dir = os.path.join(tgt_base_dir, ds_name)
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, file_name.replace('.csv', '.json'))
    df.to_json(output_file, orient='records', lines=True)

In [35]:
def file_converter(src_base_dir,tgt_base_dir,ds_name):
    #src_base_dir = 'data/retail_db'
    #tgt_base_dir = 'data/retail_json'
    schemas = json.load(open(f'{src_base_dir}/schemas.json'))
    files = glob.glob(f'{src_base_dir}/{ds_name}/part-*')

    for file in files:
        try:
            df = read_csv(file, schemas)
            file_name = os.path.basename(file)
            to_json(df, tgt_base_dir, ds_name, file_name)
        except Exception as e:
            print(f"[ERROR] Failed to process {file}: {e}")



In [42]:
def process_files(ds_names=None):
    src_base_dir = 'data/retail_db'
    tgt_base_dir = 'data/retail_json'

    schemas = json.load(open(f'{src_base_dir}/schemas.json'))
    if not ds_names:
        ds_name = schemas.keys()
    for ds_name in ds_names:
        print(f'Processing: {ds_name}')
        file_converter(src_base_dir,tgt_base_dir,ds_name)
    



In [43]:
process_files()

TypeError: 'NoneType' object is not iterable

In [None]:
schemas = json.load(open('data/retail_db/schemas.json'))

In [None]:
schemas.keys()