In [1]:
import hashlib
import multiprocessing
import random
import re
import string
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Root folder that is searched recursively for the raw input CSV files.
RAW_DATA_DIR = Path("./data/raw")
# Root folder for the cleaned CSVs; the sub-folder layout of the raw tree is mirrored beneath it.
PROCESSED_DATA_DIR = Path("./data/processed")
# Not referenced in the visible cells -- presumably the output folder for figures (TODO confirm).
IMAGES_DIR = Path("./images")

In [3]:
def cleaned_dataframe(
    csv_path: Path,
    skiprows: int = 16,
    engine: str = "pyarrow",
) -> pd.DataFrame:
    """Load one raw CSV and return only TIME + CH1–CH4 as numeric columns.

    The first ``skiprows`` lines of the file are discarded. The next line is
    used as the column header (upper-cased and stripped); every following
    line is data. Only the first seven columns of the file are read, and when
    a header name occurs more than once the first occurrence wins.

    Parameters
    ----------
    csv_path : Path
        File to read.
    skiprows : int, default 16
        Number of leading lines to skip before the header line.
    engine : str, default "pyarrow"
        Parser engine forwarded to ``pandas.read_csv``.

    Returns
    -------
    pd.DataFrame
        Columns ``TIME, CH1, CH2, CH3, CH4`` (in that order) with numeric
        dtypes and a fresh 0..n-1 index. Cells that cannot be parsed as a
        number become NaN.

    Raises
    ------
    KeyError
        If any of the five expected columns is absent from the header line.
    """
    wanted = ["TIME", "CH1", "CH2", "CH3", "CH4"]

    raw = pd.read_csv(
        csv_path,
        skiprows=skiprows,
        header=None,
        usecols=range(7),
        engine=engine,
    )

    # The header line was parsed as an ordinary data row (header=None), so
    # promote it to column names and drop it from the data.
    header = raw.iloc[0].astype(str).str.upper().str.strip().tolist()
    body = raw.iloc[1:]
    body.columns = header

    # Keep only the first column for each repeated header name.
    body = body.loc[:, ~body.columns.duplicated()]

    missing = [name for name in wanted if name not in body.columns]
    if missing:
        raise KeyError(
            f"{csv_path}: missing expected column(s) {missing}; "
            f"found {list(body.columns)}"
        )

    # Because the header row shared the columns with the data, every value was
    # read as text; convert so the result really is numeric as documented.
    numeric = body[wanted].apply(pd.to_numeric, errors="coerce")
    return numeric.reset_index(drop=True)

def new_filename(old_name: str) -> str:
    """Build the processed-file name for a raw capture file.

    'ges1343-dev-e11-0.8kohm-10nF-300K-1.050V_after_cycles_002_ALL.csv'
        → '<TAG>-e11-0.8kOhm-10nF-300K-1.050V.csv'

    The hyphen-separated fields 3 to 7 of the stem are kept, anything from
    the first underscore onward in the last kept field is dropped, and
    'kohm' is normalised to 'kOhm' (case-insensitive).

    ``<TAG>`` is four characters drawn from A-Z and 0-9. It is derived from a
    hash of the original file name rather than from a random generator, so
    the same input always maps to the same output name (re-running the
    conversion overwrites instead of piling up duplicates) while different
    source files with identical kept fields still get different names.

    Raises
    ------
    IndexError
        If the stem has fewer than three hyphen-separated fields.
    """
    source_name = Path(old_name).name
    parts = Path(source_name).stem.split("-")

    core = parts[2:7]
    core[-1] = core[-1].split("_", 1)[0]
    core = [re.sub(r"kohm", "kOhm", field, flags=re.I) for field in core]

    # Turn the leading hash bytes into four base-36 "digits" of the alphabet.
    alphabet = string.ascii_uppercase + string.digits
    digest = hashlib.sha256(source_name.encode("utf-8")).digest()
    remainder = int.from_bytes(digest[:8], "big")
    tag_chars = []
    for _ in range(4):
        remainder, index = divmod(remainder, len(alphabet))
        tag_chars.append(alphabet[index])
    tag = "".join(tag_chars)

    return f"{tag}-{'-'.join(core)}.csv"


In [4]:
# Convert every CSV found under RAW_DATA_DIR and write the cleaned copy to the
# matching sub-folder of PROCESSED_DATA_DIR. Failures are collected instead of
# aborting the run, then reported once the loop has finished.
errors = []  # (path, exception) pairs for files that could not be converted
all_csvs = [*RAW_DATA_DIR.rglob("*.csv")]

for raw_csv in tqdm(all_csvs, desc="Processing CSVs", unit="file"):
    try:
        df = cleaned_dataframe(raw_csv, 16)

        # Recreate the raw file's folder (relative to the raw root) on the
        # processed side before writing into it.
        rel_path = raw_csv.relative_to(RAW_DATA_DIR)
        out_folder = PROCESSED_DATA_DIR.joinpath(rel_path.parent)
        out_folder.mkdir(parents=True, exist_ok=True)

        out_file = out_folder.joinpath(new_filename(raw_csv.name))
        df.to_csv(out_file, index=False)
    except Exception as err:
        # Remember the failure and carry on with the next file.
        errors.append((raw_csv, err))

if not errors:
    print("All files processed without errors")
else:
    for raw_csv, err in errors:
        print(f"{raw_csv}  →  {err}")

Processing CSVs: 100%|██████████| 180/180 [05:36<00:00,  1.87s/file]

All files processed without errors



