# Using [filesplit library](https://pypi.org/project/filesplit/)

# Split files

In [1]:
from filesplit import merge, split
import os

In [2]:
# Absolute path of the directory the notebook runs from (the project root).
basepath = os.path.abspath(os.curdir)
basepath

'/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset'

In [3]:
# Display the entries of the project root.
os.listdir(basepath)

['.DS_Store',
 'yelp_data',
 'Untitled Notebook 2023-07-06 17_10_37.py',
 'README.md',
 'FileSplit.ipynb',
 '.gitignore',
 '.gitattributes',
 '.ipynb_checkpoints',
 'venv',
 '.git',
 '.vscode']

In [4]:
# Absolute paths of every file directly inside ./yelp_data whose name ends
# with '.json'.
yelp_data_dir = os.path.abspath("./yelp_data")
data_files = [
    os.path.join(yelp_data_dir, name)
    for name in os.listdir("./yelp_data")
    if name.endswith('.json')
]
data_files

['/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset/yelp_data/yelp_academic_dataset_checkin.json',
 '/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset/yelp_data/yelp_academic_dataset_tip.json',
 '/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset/yelp_data/yelp_academic_dataset_review.json',
 '/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset/yelp_data/yelp_academic_dataset_business.json',
 '/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset/yelp_data/yelp_academic_dataset_user.json']

In [5]:
def split_datafile(filepath, defaultsize=2000000000):
    """Split one file into size-limited chunks stored in a sibling directory.

    The chunks are written to a directory named after the file with its
    final extension removed (``.../review.json`` -> ``.../review/``). The
    directory is created when it does not exist yet.

    Args:
        filepath: Path of the file to split.
        defaultsize: Maximum chunk size handed to ``Split.bysize`` (bytes,
            according to the filesplit documentation).

    Raises:
        ValueError: If ``filepath`` has no extension, because the output
            directory would then have the same path as the file itself.
    """
    # splitext strips only the final extension of the last path component.
    # Splitting the whole path on '.' cut it at the first dot found anywhere,
    # e.g. in a dotted directory name or in a relative path like './data/x.json'.
    output_dir, extension = os.path.splitext(filepath)
    if not extension:
        raise ValueError(
            "cannot derive an output directory from a path without an "
            "extension: {!r}".format(filepath)
        )
    # exist_ok makes re-running the cell safe without a separate exists() check.
    os.makedirs(output_dir, exist_ok=True)
    splitter = split.Split(filepath, output_dir)
    splitter.bysize(size=defaultsize)

In [6]:
# Split every dataset file, then display the contents of yelp_data.
for data_file in data_files:
    split_datafile(data_file)
os.listdir(os.path.join(basepath, "yelp_data/"))

['yelp_academic_dataset_user_out',
 'yelp_academic_dataset_user',
 '.DS_Store',
 'yelp_academic_dataset_review_out',
 'yelp_academic_dataset_checkin.json',
 'yelp_academic_dataset_checkin_out',
 'yelp_academic_dataset_tip.json',
 'yelp_academic_dataset_tip_out',
 'yelp_academic_dataset_business',
 'yelp_academic_dataset_review',
 'yelp_academic_dataset_business_out',
 'yelp_academic_dataset_review.json',
 'yelp_academic_dataset_checkin',
 'yelp_academic_dataset_tip',
 'yelp_academic_dataset_business.json',
 'yelp_academic_dataset_user.json']

# Merge files

In [7]:
from filesplit import merge, split
import os

In [8]:
# Absolute path of the directory the notebook runs from (the project root).
basepath = os.path.abspath(os.curdir)
basepath

'/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset'

In [9]:
# Display the entries of the project root.
os.listdir(basepath)

['.DS_Store',
 'yelp_data',
 'Untitled Notebook 2023-07-06 17_10_37.py',
 'README.md',
 'FileSplit.ipynb',
 '.gitignore',
 '.gitattributes',
 '.ipynb_checkpoints',
 'venv',
 '.git',
 '.vscode']

In [13]:
# Absolute paths of the directories in ./yelp_data that are merge inputs:
# hidden entries, anything with ".json" in its name and the "*_out" result
# directories are skipped.
yelp_data_dir = os.path.abspath("./yelp_data")
input_folders = [
    os.path.join(yelp_data_dir, d)
    for d in os.listdir("./yelp_data")
    if d[0] != '.'
    and ".json" not in d
    and "_out" not in d
    # merge_datafile calls os.listdir on each entry, so a stray regular file
    # (README, archive, ...) would make it fail; keep directories only.
    and os.path.isdir(os.path.join(yelp_data_dir, d))
]
input_folders

['/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset/yelp_data/yelp_academic_dataset_user',
 '/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset/yelp_data/yelp_academic_dataset_business',
 '/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset/yelp_data/yelp_academic_dataset_review',
 '/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset/yelp_data/yelp_academic_dataset_checkin',
 '/Users/henryliu/Temp00/exp_Databricks_YelpAcademicDataset/yelp_data/yelp_academic_dataset_tip']

In [14]:
def merge_datafile(inputdir, outputdir="*_out", outputfilename=None):
    """Merge the chunk files in ``inputdir`` back into a single file.

    Args:
        inputdir: Directory holding the chunk files and their ``manifest``.
        outputdir: Directory that receives the merged file. Every ``*`` in it
            is replaced by ``inputdir``, so the default is ``<inputdir>_out``.
            The directory is created when it does not exist yet.
        outputfilename: Name of the merged file. When omitted it is the name
            of ``inputdir`` plus the extension of the chunk files.

    Raises:
        FileNotFoundError: If ``outputfilename`` is omitted and ``inputdir``
            contains no chunk file to take the extension from.
    """
    # Drop a trailing separator first: otherwise '*_out' would expand to a
    # directory *inside* inputdir and the folder name below would be empty.
    inputdir = os.path.normpath(inputdir)
    outputdir = outputdir.replace('*', inputdir)
    os.makedirs(outputdir, exist_ok=True)

    if outputfilename is None:
        # os.listdir order is arbitrary, and hidden files such as '.DS_Store'
        # are not chunks; filter them and sort so the choice is deterministic.
        chunk_names = sorted(
            name for name in os.listdir(inputdir)
            if name != 'manifest' and not name.startswith('.')
        )
        if not chunk_names:
            raise FileNotFoundError(
                "no chunk files found in {!r}".format(inputdir)
            )
        # splitext keeps the leading dot and yields '' for extension-less
        # chunks, so no stray '.' or whole file name ends up in the result.
        extension = os.path.splitext(chunk_names[0])[1]
        outputfilename = os.path.basename(inputdir) + extension

    merger = merge.Merge(inputdir, outputdir, outputfilename=outputfilename)
    merger.merge()

In [15]:
# Merge the contents of every collected input folder.
for chunk_folder in input_folders:
    merge_datafile(chunk_folder)