In [40]:
import pandas as pd
from os.path import isfile, join, isdir, getsize
from os import listdir
import numpy as np

In [41]:
def fetch_directories(path: str) -> [str]:
    '''
    Fetches all directories in the given path.

    Returns the immediate sub-directories of ``path``, each one joined onto
    ``path`` (so the results can be passed straight back to ``listdir``).

    If ``path`` cannot be listed (it does not exist, is not a directory, or
    is not readable) a message is printed and an empty list is returned.
    Any other error, such as a wrong argument type, is raised to the caller.
    '''
    try:
        entries = listdir(path)
    except OSError:
        # Best-effort: an unreadable directory simply contributes nothing.
        # Only OSError is caught so programming errors and KeyboardInterrupt
        # are not silently hidden.
        print("Trouble opening a directory")
        return []

    candidates = (join(path, name) for name in entries)
    return [candidate for candidate in candidates if isdir(candidate)]

def fetch_files(path: str) -> [str]:
    '''
    Fetches all files in the given path.

    Returns the bare names (not joined onto ``path``) of the regular files
    located directly inside ``path``.

    If ``path`` cannot be listed (it does not exist, is not a directory, or
    is not readable) a message is printed and an empty list is returned.
    Any other error, such as a wrong argument type, is raised to the caller.
    '''
    try:
        entries = listdir(path)
    except OSError:
        # Best-effort: an unreadable directory simply contributes nothing.
        # Only OSError is caught so programming errors and KeyboardInterrupt
        # are not silently hidden.
        print("Trouble opening a directory")
        return []

    return [name for name in entries if isfile(join(path, name))]

In [42]:
def is_float(x):
    '''
    Tells whether ``x`` can be converted with ``float()``.

    Returns True when ``float(x)`` succeeds and False otherwise. Values of
    an unsupported type (e.g. ``None`` or a list) yield False instead of
    raising, so the function is safe to use as a filter predicate.
    '''
    try:
        float(x)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: not a string or number.
        return False
    return True

In [43]:
def clean_csv(filename):
    '''
    Loads a ';'-separated file and returns one cleaned row per id.

    Steps:
      * an empty file (size 0) yields ``None``;
      * the first line is read as the header to learn the column labels,
        then the file is loaded as strings with malformed lines skipped
        silently;
      * columns are renamed to id / demand / supply / timestamp / none;
      * rows whose id is not numeric or whose timestamp is not
        float-convertible are dropped;
      * rows are sorted by id and timestamp, the 'none' column is removed,
        rows with missing values are dropped, and only the first row of
        each id is kept.

    Returns a DataFrame with columns id, demand, supply, timestamp (all
    strings), or ``None`` for an empty file.

    Raises ValueError when the file does not have exactly five columns.

    Requires pandas >= 1.3 (``on_bad_lines``); the former
    ``error_bad_lines`` / ``warn_bad_lines`` flags no longer exist in
    pandas 2.x.

    NOTE(review): values are loaded as strings, so the sort is
    lexicographic rather than numeric -- confirm that is intended.
    NOTE(review): the first line is always consumed as a header; if the
    input has no header line its first record is lost -- confirm.
    '''
    if getsize(filename) == 0:
        return None

    # Restrict parsing to the columns announced by the first line.
    cols = pd.read_csv(filename, nrows=1, delimiter=';').columns
    df = pd.read_csv(filename, usecols=cols, delimiter=';', skip_blank_lines=True,
                     on_bad_lines='skip', dtype=str)
    df.columns = ['id', 'demand', 'supply', 'timestamp', 'none']

    # Missing ids are NaN (a float) and have no .isnumeric(); treat them as
    # invalid. The explicit bool cast keeps the mask a row selector even
    # when the frame is empty.
    valid_id = df['id'].map(lambda v: isinstance(v, str) and v.isnumeric()).astype(bool)
    df = df[valid_id]

    valid_timestamp = df['timestamp'].map(is_float).astype(bool)
    df = df[valid_timestamp]

    df = df.sort_values(['id', 'timestamp'])
    df = df.drop(['none'], axis=1)
    df = df.dropna()
    # Keeps the first row per id in the sorted order above.
    df = df.drop_duplicates('id')
    return df

In [1]:
def cleanup(path: str, output_path: str, children_directories: bool):
    '''
    Cleans the files of one or more directories and writes the results
    below ``output_path``.

    path: directory to process.
    output_path: root directory receiving the cleaned files.
    children_directories: when True, the files of every immediate
        sub-directory of ``path`` are processed; when False, the files
        located directly in ``path`` are processed.

    Every processed directory gets a folder of the same name under
    ``output_path`` (created on demand); each cleaned file keeps its name
    and is written with ';' as separator, without header or index.
    Files for which ``clean_csv`` returns ``None`` are not written.
    The path of every file handed to ``clean_csv`` is printed.
    '''
    # Function-scope import: the file only imports selected names from os.
    import os

    if children_directories:
        directories = fetch_directories(path)
    else:
        # A single directory; iterating the string itself would walk over
        # its characters.
        directories = [path]

    for directory in directories:
        # abspath drops trailing separators and resolves '.' / '..', so
        # basename is the directory's own name and the output can never
        # escape output_path back onto the input.
        directory_name = os.path.basename(os.path.abspath(directory))
        target_directory = join(output_path, directory_name)

        for f in fetch_files(directory):
            # Names shorter than 10 characters are skipped -- presumably not
            # data files; TODO confirm the naming rule.
            if len(f) < 10:
                continue

            source = join(directory, f)
            print(source)
            df = clean_csv(source)
            if df is None:
                continue

            os.makedirs(target_directory, exist_ok=True)
            df.to_csv(join(target_directory, f), sep=';', header=False, index=False)

In [None]:
# Root directory handed to every cleanup() call as the output location.
output_path = '../out/'

# Input directories; cleanup() is invoked once for each entry, in order.
dirs = [
    '../data_02-18=21/',
    '../data_02-20=23/',
    '../data_02-23=26/',
    '../data_02-26=01/',
    '../data_03-01=06/',
    '../data_03-11=15/',
]

for path in dirs:
    cleanup(path=path, output_path=output_path, children_directories=False)