In [1]:
import json
import pandas as pd
import os
from tqdm.notebook import tqdm

In [2]:
def get_filepaths(rootdir='.', filetypes=['txt']):
    """
    Return the paths of all files under `rootdir` whose extension is in `filetypes`.

    The directory tree is walked recursively with `os.walk`. Every returned
    path is `os.path.join(dirpath, filename)`, so it starts with `rootdir`
    as it was passed in (minus any trailing path separators).

    Usage:
        from utils import get_filepaths

        get_filepaths(rootdir='/path/to/dir', filetypes=['txt'])

    Parameters:
        rootdir:
              The directory where the files are read from.
              The default is '.'
              Note: '~' is not expanded; call os.path.expanduser() first if needed.

        filetypes:
              A list containing the string form of the file types to read.
              Matching is case-insensitive and a leading dot is optional
              ('txt', 'TXT' and '.txt' are equivalent).
              Default: ['txt']  reads all text files
              pass an empty list [] for reading all file types

    Returns:
        A list of path strings, in the order produced by `os.walk`.

    Raises:
        FileNotFoundError (a subclass of Exception) if `rootdir` does not
        exist in the local system. This includes the empty string.
    """
    # Drop trailing separators, but never reduce a root path such as '/' to ''.
    rootdir = rootdir.rstrip(os.sep) or rootdir

    # If the path does not exist on the user system, fail loudly.
    if not os.path.exists(rootdir):
        raise FileNotFoundError(f"Directory `{rootdir}` does not exist.")

    # Normalise the requested extensions: lower-case and without a leading dot.
    wanted = {ftype.lower().lstrip('.') for ftype in filetypes}

    filepaths = []
    for dirpath, _dirnames, filenames in os.walk(rootdir):
        for filename in filenames:
            # An empty `filetypes` means "accept every file".
            if wanted:
                # Only the text after the last '.' counts as the extension;
                # a name without any '.' has no extension and never matches.
                _stem, dot, extension = filename.rpartition('.')
                if not dot or extension.lower() not in wanted:
                    continue
            filepaths.append(os.path.join(dirpath, filename))

    return filepaths

In [3]:
# Collect the text-file paths of both dataset folders, dataset1 first.
filepaths = [
    path
    for dataset_dir in ('./dataset1', './dataset2')
    for path in get_filepaths(dataset_dir)
]

In [4]:
# Accumulator that the loading loop extends with the contents of each file.
all_data = list()

In [5]:
# Start from an empty accumulator so that re-running this cell does not
# append every record a second time.
all_data = []

for fp in tqdm(filepaths):
    # Each file is parsed as JSON; the context manager closes the handle
    # right after parsing instead of leaving it to the garbage collector.
    with open(fp, 'r') as fh:
        data = json.load(fh)
    # `extend` adds the parsed value element by element
    # (presumably each file holds a JSON array of records — verify against the data).
    all_data.extend(data)

In [6]:
# Build the raw frame from the loaded records. The following cells read `df`,
# so it must be created here for a fresh "Restart & Run All" to succeed.
df = pd.DataFrame(all_data)

In [10]:
# One row per entry of 'Flights'; rows containing any missing value are
# dropped and the index is renumbered from 0.
flights_long = df.explode(column='Flights').dropna().reset_index(drop=True)

# Expand every 'Flights' entry into its own columns
# (presumably each entry is a dict-like record — verify against the input JSON).
flight_columns = flights_long['Flights'].apply(pd.Series)

# Swap the nested 'Flights' column for the expanded columns, side by side.
df = pd.concat([flights_long.drop(columns=['Flights']), flight_columns], axis=1)

In [None]:
# Write the flattened table to disk; the row index is written as the first,
# unnamed column (pandas' default, spelled out here).
df.to_csv(path_or_buf='airtickets.csv', index=True)