# Utils

> Utility functions used throughout the project.


In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp utils

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
# | export
import tarfile
import json
import pandas as pd
from fastcore.all import Path

from nbdev.config import get_config
from pathlib import Path
from io import TextIOWrapper

import warnings
warnings.filterwarnings("ignore")

In [None]:
#| export
def get_repository_directory():
    "Return the directory that contains the notebooks folder named in the nbdev config."
    # The config exposes the notebooks folder as `nbs_path`; its parent is
    # what this project treats as the repository root.
    notebooks_dir = Path(get_config().nbs_path)
    return notebooks_dir.parent

In [None]:
#| hide
#| eval: false
# Resolve the repository root; the bare name on the last line shows it as the cell output.
repo_dir = get_repository_directory()
repo_dir

In [None]:
#| export
def get_data_directory():
    "Return the `data` folder under the repository directory, creating it when it is missing."
    target = get_repository_directory().joinpath('data')
    # Create on demand (including parents) and tolerate an existing folder,
    # so the returned path always exists.
    target.mkdir(exist_ok=True, parents=True)
    return target

In [None]:
#| hide
#| eval: false
# Resolve (and create if needed) the data folder; the bare name shows it as the cell output.
data_dir = get_data_directory()
data_dir

In [None]:
# | export
def get_latest_directory(directory: Path):
    """Return the latest subdirectory of `directory`, ordering by name.

    Only directories are considered; regular files and entries whose name
    starts with '.' are ignored. Returns `None` when nothing qualifies.
    """
    # `iterdir` is plain pathlib, so this works on any path object without
    # relying on the non-stdlib `.ls()` extension method.
    candidates = (
        entry
        for entry in Path(directory).iterdir()
        if not entry.name.startswith('.') and entry.is_dir()
    )
    # Names are unique within one directory, so the maximum by name is the
    # same entry a full name-sort would place last -- without the sort.
    return max(candidates, key=lambda entry: entry.name, default=None)

In [None]:
#| hide
#| eval: false

# Point at the raw route-points folder and print its entries in sorted order.
directory = data_dir / 'bmtc' / 'raw' / 'route_points'
for item in sorted(directory.ls()):
    print(item)

In [None]:
#| hide
#| eval: false

# Show which subdirectory of `directory` sorts last by name.
print(get_latest_directory(directory))

In [None]:
# | export
def get_latest_file(directory: Path):
    """Return the latest file in `directory`, ordering by name.

    Only regular files are considered; subdirectories and entries whose name
    starts with '.' are ignored. Returns `None` when nothing qualifies.
    """
    # `iterdir` is plain pathlib, so this works on any path object without
    # relying on the non-stdlib `.ls()` extension method.
    candidates = (
        entry
        for entry in Path(directory).iterdir()
        if not entry.name.startswith('.') and entry.is_file()
    )
    # Names are unique within one directory, so the maximum by name is the
    # same entry a full name-sort would place last -- without the sort.
    return max(candidates, key=lambda entry: entry.name, default=None)

In [None]:
#| hide
#| eval: false

# Point at the raw vehicles folder and print its entries in sorted order.
directory = data_dir / 'bmtc' / 'raw' / 'vehicles'
for item in sorted(directory.ls()):
    print(item)

# Pick the file that sorts last by name and report it.
filepath = get_latest_file(directory)
print(f'\nLatest file in the directory: \n{filepath}')

In [None]:
# | export
def extract_file(filepath: Path):
    """Unpack the single file held in a gzipped tar archive and return its path.

    A path whose suffix is not ".gz" is returned unchanged. Otherwise the
    archive is extracted into the folder that contains it, members whose base
    name starts with "._" are skipped, and every "._*" entry found in that
    folder afterwards is deleted.

    Raises `AssertionError` when the archive does not contain exactly one
    regular file (directory entries and skipped "._" members are not counted).
    """
    if filepath.suffix != ".gz":
        return filepath

    extract_dir = filepath.parent
    extracted_files = []

    # Where the runtime supports extraction filters, opt in to the "data"
    # filter so a hostile archive cannot write outside `extract_dir`; older
    # interpreters without the feature keep the plain call.
    extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

    with tarfile.open(filepath, "r:gz") as tar:
        for member in tar.getmembers():
            if Path(member.name).name.startswith("._"):
                continue
            tar.extract(member, path=extract_dir, **extract_kwargs)
            # Directory entries are unpacked but not counted: the result must
            # be a file path, and "dir/" + "dir/file" is still one file.
            if member.isfile():
                extracted_files.append(extract_dir / member.name)

    for du_file in extract_dir.glob("._*"):
        du_file.unlink()

    if len(extracted_files) != 1:
        # Raised explicitly: an `assert` statement is stripped under
        # `python -O`, which would let a bad archive slip through.
        raise AssertionError(f"Expected 1 file, got {len(extracted_files)}")
    return extracted_files[0]

In [None]:
#| hide
#| eval: false

# Unpack the latest vehicles file if it is a .gz archive; the returned path is the cell output.
extract_file(filepath)

In [None]:
#| export
def read_file(filepath: Path, format: str = 'json'):
    """Read a JSON or CSV file and return its contents.

    `format` selects the parser: 'json' returns whatever `json.load` yields,
    'csv' returns a pandas DataFrame. The file is decoded as UTF-8.

    Raises `ValueError` for any other `format`.
    """
    # Reject a bad format before touching the filesystem, so the caller gets
    # the ValueError about the format rather than an unrelated I/O error.
    if format not in ('json', 'csv'):
        raise ValueError(f"Unsupported format: {format}")

    # Decode explicitly as UTF-8 instead of the platform default, so results
    # do not depend on the machine's locale.
    with open(filepath, encoding='utf-8') as f:
        if format == 'json':
            return json.load(f)
        return pd.read_csv(f)

In [None]:
#| hide
#| eval: false

# End-to-end check: latest file in `directory` -> extract if archived -> parse with the default 'json' format.
read_file(extract_file(get_latest_file(directory)))

In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the module named by `default_exp` -- confirm against nbdev docs).
import nbdev; nbdev.nbdev_export()