In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
def filter_is_csv(item):
    """
    Returns true if the item is a csv file
    """
    return item.endswith(".csv")

def filter_is_json(item):
    """
    Returns true if the item is a csv file
    """
    return item.endswith(".json")
        

In [3]:
def get_data(input_folder):
    """
    Get the aggregated dataframe, consisting of all data inside 'input_folder'
    
    Args:
       input_folder (str): specifying the folder which stores the data
    Returns:
       pd.DataFrame: aggregated data frame in a dictionary and list
       with the name of the matadata json file (can be an empty list) 
    """
    
    list_of_files = []

    

    for (dirpath, dirnames, filenames) in os.walk(input_folder):
        list_of_files += [os.path.join(dirpath, file) for file in filenames]

    print(f"loaded {len(list_of_files)} files")

    # Make relative paths from absolute file paths
    rel_paths = [os.path.relpath(item, input_folder) for item in list_of_files]
    print(rel_paths)
    # Filter out csv files
    json_path = list(filter(filter_is_json, rel_paths))[0]
    # Filter out non-csv files
    rel_paths = list(filter(filter_is_csv, rel_paths))
    print(rel_paths)
    # Get absolute paths
    abs_paths = [os.path.join(input_folder, item) for item in rel_paths]
    # Concatenate all files in a dictionary
    dfs = {}
    for item,file in zip(rel_paths,abs_paths):
        dfs[item[:-4]] = pd.read_csv(file, encoding = 'ISO-8859-1')
    print(dfs.keys())
    return dfs,json_path

In [4]:
dfs,metadata_path = get_data('../data/proj_demo_adcubumMT/airflow_data/data_synth')

loaded 6 files
['demo_metadata.json', 'tariftyp.csv', 'tariftyp.pkl', 'tarifziffer.csv', 'tarifziffer.pkl', '.ipynb_checkpoints\\tariftyp-checkpoint.csv']
['tariftyp.csv', 'tarifziffer.csv', '.ipynb_checkpoints\\tariftyp-checkpoint.csv']
dict_keys(['tariftyp', 'tarifziffer', '.ipynb_checkpoints\\tariftyp-checkpoint'])


In [6]:
dfs.keys()

dict_keys(['tariftyp', 'tarifziffer', '.ipynb_checkpoints\\tariftyp-checkpoint'])

In [7]:
primary_keys = set(dfs['tariftyp']['BOID'])
foreign_keys = set(dfs['tarifziffer']['ITSTARIFTYP'])

unknown_references = foreign_keys - primary_keys

print('Number of unknown references:', len(unknown_references))
print('Unknown reference values:', unknown_references)

Number of unknown references: 0
Unknown reference values: set()
