In [54]:
import pandas as pd
from os.path import isfile, join, isdir, getsize
from os import listdir
import numpy as np
from pathlib import Path
from time import time

In [55]:
def fetch_directories(path: str) -> [str]:
    '''
    Fetches all directories in the given path.

    Returns a list with one entry per immediate sub-directory of `path`;
    each entry is the sub-directory name joined onto `path`. Regular files
    are ignored. If `path` cannot be listed, a message is printed and an
    empty list is returned.
    '''
    try:
        entries = listdir(path)
    except (OSError, ValueError):
        # Only "cannot open this path" failures are treated as best-effort
        # (missing directory, permissions, malformed path). A bare except
        # would also swallow KeyboardInterrupt and genuine programming errors.
        print("Trouble opening a directory")
        return []
    candidates = (join(path, name) for name in entries)
    return [candidate for candidate in candidates if isdir(candidate)]

def fetch_files(path: str) -> [str]:
    '''
    Fetches all files in the given path.

    Returns the bare file names (not joined with `path`) of the regular
    files directly inside `path`; sub-directories are ignored. If `path`
    cannot be listed, a message is printed and an empty list is returned.
    '''
    try:
        entries = listdir(path)
    except (OSError, ValueError):
        # Only "cannot open this path" failures are treated as best-effort
        # (missing directory, permissions, malformed path). A bare except
        # would also swallow KeyboardInterrupt and genuine programming errors.
        print("Trouble opening a directory")
        return []
    return [name for name in entries if isfile(join(path, name))]

In [72]:
def is_relevant_timestamp(x: any) -> bool:
    '''
    Checks if the value x is a relevant timestamp.

    A value is relevant when it can be converted to a float and lies
    strictly between the fixed lower bound 1500000000 and the current
    time as returned by time(). Both bounds are Unix timestamps in
    seconds, so x is assumed to be in seconds as well.

    Returns False for anything that cannot be converted to a float
    (non-numeric strings, None, ...), for NaN, and for values outside
    the range.
    '''
    try:
        seconds = float(x)
    except (TypeError, ValueError):
        # TypeError covers None and other non-convertible objects,
        # ValueError covers non-numeric strings.
        return False
    # NaN compares False against both bounds, so it is rejected here too.
    return 1500000000 < seconds < time()

In [69]:
def clean_csv(filename: str) -> pd.DataFrame:
    '''
    Cleans a specific file.

    Reads `filename` as a ';'-separated file with every value kept as a
    string, then:
      * keeps only rows whose id is a numeric string,
      * keeps only rows whose timestamp passes is_relevant_timestamp,
      * sorts by id and timestamp (string comparison, values are str),
      * drops the fifth ('none') column and any row with a missing value,
      * keeps the first remaining row for every id.

    Returns the cleaned DataFrame with columns id, demand, supply and
    timestamp, or None when the file is empty or does not have exactly
    five columns.

    NOTE(review): read_csv is called without header=None, so the first
    line of the file is consumed as the header row - confirm the input
    files really start with a header line.
    '''
    if getsize(filename) == 0:
        return None
    try:
        # The first line decides which columns are read; fields beyond
        # those columns on later lines are ignored via usecols below.
        cols = pd.read_csv(filename, nrows=1, delimiter=';').columns
    except pd.errors.EmptyDataError:
        # Non-zero size but nothing parseable (e.g. only blank lines):
        # treat it like an empty file.
        return None
    read_options = dict(usecols=cols, delimiter=';', skip_blank_lines=True, dtype=str)
    try:
        # pandas >= 1.3 spelling; the legacy keywords were removed in 2.0.
        df = pd.read_csv(filename, on_bad_lines='skip', **read_options)
    except TypeError:
        # Older pandas does not know on_bad_lines; use the legacy keywords.
        df = pd.read_csv(filename, error_bad_lines=False, warn_bad_lines=False, **read_options)
    if len(df.columns) != 5:
        return None
    df.columns = ['id', 'demand', 'supply', 'timestamp', 'none']
    # Missing fields stay float NaN even with dtype=str, so guard the
    # str-only isnumeric() call; astype(bool) keeps the mask boolean even
    # when the frame is empty.
    numeric_id = df['id'].map(lambda value: isinstance(value, str) and value.isnumeric())
    df = df[numeric_id.astype(bool)]
    relevant = df['timestamp'].map(is_relevant_timestamp)
    df = df[relevant.astype(bool)]
    df = df.sort_values(['id', 'timestamp'])
    # Drop the unused column first so its missing values do not cause
    # otherwise complete rows to be removed by dropna().
    df = df.drop(['none'], axis=1)
    df = df.dropna()
    return df.drop_duplicates('id')

In [70]:
def _clean_directory(directory: str, target_dir: str) -> None:
    '''
    Cleans every file found directly in `directory` and writes each
    cleaned result, under the same file name, into `target_dir`.
    '''
    for f in fetch_files(directory):
        # File names shorter than 10 characters are skipped.
        if len(f) < 10:
            continue
        source_file = directory + '/' + f
        print(source_file)
        df = clean_csv(source_file)
        if df is None:
            # Nothing usable in this file (empty or unexpected layout).
            continue
        # Created lazily so directories without any usable file leave
        # no empty output directory behind.
        Path(target_dir).mkdir(parents=True, exist_ok=True)
        df.to_csv(target_dir + '/' + f, sep=';', header=False, index=False)


def cleanup(path: str, output_path: str, children_directories: bool):
    '''
    Goes through all the files in a specific directory, cleans them with
    clean_csv and writes the results below `output_path`.

    path: directory to process.
    output_path: prefix for the output location; it is concatenated as a
        plain string, so it is expected to end with a path separator.
    children_directories: when True, the files of every immediate
        sub-directory of `path` are processed and written to
        `output_path` + <sub-directory path with the `path` prefix cut off>;
        when False, the files directly inside `path` are processed and
        written to `output_path` + `path`.

    The path of every processed file is printed. Output files are written
    with ';' as separator and without header or index.
    '''
    if children_directories:
        for directory in fetch_directories(path):
            _clean_directory(directory, output_path + directory[len(path):])
    else:
        _clean_directory(path, output_path + path)

In [71]:
# Input roots to clean; each one is scanned through its immediate
# sub-directories (children_directories=True).
dirs = ['./test/']
# Prefix under which the cleaned copies are written.
output_path = './out/'
for path in dirs:
    cleanup(path, output_path, children_directories=True)

./test/data_05-02=05/BCHBNB-2020-05-02
./test/data_05-02=05/BCHBNB-2020-05-03
./test/data_05-02=05/BCHBNB-2020-05-04
./test/data_05-02=05/BCHBNB-2020-05-05
./test/data_05-02=05/BCHBTC-2020-05-02
./test/data_05-02=05/BCHBTC-2020-05-03
./test/data_05-02=05/BCHBTC-2020-05-04
./test/data_05-02=05/BCHBTC-2020-05-05
./test/data_05-02=05/BCHTUSD-2020-05-02
./test/data_05-02=05/BCHTUSD-2020-05-03
./test/data_05-02=05/BCHTUSD-2020-05-04
./test/data_05-02=05/BCHTUSD-2020-05-05


KeyboardInterrupt: 