In [7]:
import glob
import json
import re
import pandas as pd
import os

In [8]:
# Define a function to get column names from schemas.json
def get_column_names(schemas: dict, ds_name: str, sorting_key='column_position'):
    """Return the column names of one dataset, ordered by a schema field.

    Args:
        schemas: Mapping from dataset name to a collection of
            column-description dicts. Each description must provide
            ``'column_name'`` and the field named by ``sorting_key``.
        ds_name: Key of the dataset to look up in ``schemas``.
        sorting_key: Field of each column description used to order the
            columns (ascending). Defaults to ``'column_position'``.

    Returns:
        list: The ``'column_name'`` values in ascending ``sorting_key`` order.

    Raises:
        KeyError: If ``ds_name`` is missing from ``schemas`` or a column
            description lacks ``sorting_key`` or ``'column_name'``.
    """
    # Sort a copy so the caller's schema list is left untouched.
    ordered = list(schemas[ds_name])
    ordered.sort(key=lambda entry: entry[sorting_key])

    names = []
    for entry in ordered:
        names.append(entry['column_name'])
    return names

In [9]:
# read csv file by using the column names in schema
def read_csv(file, schemas):
    """Read a CSV file into a DataFrame, naming columns from the schema.

    The dataset name is the name of the directory that directly contains
    ``file`` (for example ``.../orders/part-00000`` -> ``orders``); it is
    used to look up the column names via ``get_column_names``.

    Args:
        file: Path to the CSV file. The name of its parent directory must
            be a key of ``schemas``.
        schemas: Mapping from dataset name to column descriptions, in the
            form accepted by ``get_column_names``.

    Returns:
        pandas.DataFrame: The file contents with the schema's column names.

    Raises:
        KeyError: If the parent directory name is not a key of ``schemas``.
    """
    # Use os.path rather than splitting on a literal '/': glob joins the
    # matched file name with the platform separator, so on Windows the path
    # looks like '.../orders\\part-00000' and a '/'-split would return the
    # wrong component. os.path also tolerates doubled separators.
    ds_name = os.path.basename(os.path.dirname(file))
    columns = get_column_names(schemas, ds_name)
    # Column names are passed explicitly via `names`, so pandas does not
    # infer a header and reads the first row as data.
    return pd.read_csv(file, names=columns)

In [10]:
# Write a pandas DataFrame to a JSON Lines file (one record per line)
def to_json(df, trg_base_dir, ds_name, file_name):
    """Write ``df`` as JSON Lines to ``<trg_base_dir>/<ds_name>/<file_name>``.

    Args:
        df: DataFrame to serialize.
        trg_base_dir: Root directory for the converted output.
        ds_name: Dataset name; used as the sub-directory under
            ``trg_base_dir``.
        file_name: Name of the file to create inside the dataset directory.

    Returns:
        None. The result is the file written to disk.
    """
    # Create the dataset directory first; an existing one is not an error.
    os.makedirs(f'{trg_base_dir}/{ds_name}', exist_ok=True)
    json_file_path = f'{trg_base_dir}/{ds_name}/{file_name}'
    # orient='records' with lines=True emits one JSON object per row.
    df.to_json(json_file_path, orient='records', lines=True)

In [11]:
# Convert every part-* file of a dataset from CSV to JSON Lines
def file_converter(ds_name, src_base_dir='../data/retail_db', trg_base_dir='../data/retail_db_json'):
    """Convert every ``part-*`` CSV file of a dataset to JSON Lines.

    Loads ``<src_base_dir>/schemas.json``, reads each file matching
    ``<src_base_dir>/<ds_name>/part-*`` with ``read_csv`` and writes it with
    ``to_json`` under ``<trg_base_dir>/<ds_name>/``, keeping the file name.

    Args:
        ds_name: Name of the dataset (sub-directory of ``src_base_dir``).
        src_base_dir: Directory holding ``schemas.json`` and the dataset
            folders. Defaults to ``'../data/retail_db'``.
        trg_base_dir: Directory that receives the converted files.
            Defaults to ``'../data/retail_db_json'``.

    Returns:
        None. If no file matches the pattern, nothing is written.

    Raises:
        FileNotFoundError: If ``schemas.json`` does not exist.
    """
    # Open the schema inside a context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(f'{src_base_dir}/schemas.json') as schema_file:
        schemas = json.load(schema_file)

    files = glob.glob(f'{src_base_dir}/{ds_name}/part-*')

    for file in files:
        df = read_csv(file, schemas)
        # basename handles the platform separator that glob uses when it
        # joins the directory and the matched file name.
        file_name = os.path.basename(file)
        to_json(df, trg_base_dir, ds_name, file_name)


In [12]:
# Run the conversion for the 'orders' dataset.
ds_name = 'orders'
file_converter(ds_name)