In [33]:
import pandas as pd
import os 
import re
from pathlib import Path

In [34]:
# Mapeamento de canais (exemplo); para cada "data_{part}" podemos definir canais específicos.
# Neste exemplo, estou supondo que cada parte (por exemplo, "motor", "gearbox", etc.)
# possua seu próprio sub-mapeamento de canais.
# Se os canais forem os mesmos para todos, você pode usar um único mapa.
CHANNEL_MAP = {
    "motor": {
        "CH1": ("Motor", "Motor (drive end)", "Tri-axial acceleration"),
        "CH2": ("Motor", "Motor (drive end)", "Tri-axial acceleration"),
        "CH3": ("Motor", "Motor (drive end)", "Tri-axial acceleration"),
        "CH4": ("Motor", "Motor (fan end)", "Tri-axial acceleration"),
        "CH5": ("Motor", "Motor (fan end)", "Tri-axial acceleration"),
        "CH6": ("Motor", "Motor (fan end)", "Tri-axial acceleration"),
        "CH7": ("Motor", "Motor (cable)", "Three-phase current"),
        "CH8": ("Motor", "Motor (cable)", "Three-phase current"),
        "CH9": ("Motor", "Motor (cable)", "Three-phase current"),
    },
    "gearbox": {
        "CH10": ("Gearbox", "Gearbox (input axle)", "Tri-axial acceleration"),
        "CH11": ("Gearbox", "Gearbox (input axle)", "Tri-axial acceleration"),
        "CH12": ("Gearbox", "Gearbox (input axle)", "Tri-axial acceleration"),
        "CH13": ("Gearbox", "Gearbox (output axle)", "Tri-axial acceleration"),
        "CH14": ("Gearbox", "Gearbox (output axle)", "Tri-axial acceleration"),
        "CH15": ("Gearbox", "Gearbox (output axle)", "Tri-axial acceleration"),
    },
    "leftaxlebox": {
        "CH16": ("Axle box (left)", "Axle box (end cover)", "Tri-axial acceleration"),
        "CH17": ("Axle box (left)", "Axle box (end cover)", "Tri-axial acceleration"),
        "CH18": ("Axle box (left)", "Axle box (end cover)", "Tri-axial acceleration"),
    },
    "leftaxlebox_alt": {},
    "rightaxlebox": {
        "CH19": ("Axle box (right)", "Axle box (end cover)", "Tri-axial acceleration"),
        "CH20": ("Axle box (right)", "Axle box (end cover)", "Tri-axial acceleration"),
        "CH21": ("Axle box (right)", "Axle box (end cover)", "Tri-axial acceleration"),
    },
}

# Mapeamento de rótulos do tipo "TYPE X" para o formato composto "Mx_Gx_LAx_RAx"
TYPE_MAP = {
    "Normal": "M0_G0_LA0_RA0",
    "TYPE1": "M1_G0_LA0_RA0",
    "TYPE2": "M2_G0_LA0_RA0",
    "TYPE3": "M3_G0_LA0_RA0",
    "TYPE4": "M4_G0_LA0_RA0",
    "TYPE5": "M0_G1_LA0_RA0",
    "TYPE6": "M0_G2_LA0_RA0",
    "TYPE7": "M0_G3_LA0_RA0",
    "TYPE8": "M0_G4_LA0_RA0",
    "TYPE9": "M0_G5_LA0_RA0",
    "TYPE10": "M0_G6_LA0_RA0",
    "TYPE11": "M0_G7_LA0_RA0",
    "TYPE12": "M0_G8_LA0_RA0",
    "TYPE13": "M0_G0_LA1_RA0",
    "TYPE14": "M0_G0_LA2_RA0",
    "TYPE15": "M0_G0_LA3_RA0",
    "TYPE16": "M0_G0_LA4_RA0",
}

In [35]:
# Função para padronizar o caminho dos arquivos
def padronizar_caminho(file_path: str) -> dict:
    """
    Processa o path e retorna um dicionário com:
      - stage: partes[3] (Ex.: "Final" ou "Preliminar")
      - split: partes[4] (Ex.: "Training" ou "Test")
      - label: se houver, extraído conforme a estrutura; pode ser None se ausente
      - sample: extraído da pasta Sample (ex.: "Sample110" ou "Sample_3")
      - part: nome do arquivo sem extensão, ex.: "data_motor"

    Exemplos:
      1) ..\\data\\raw\\Final\\Test\\Sample110\\data_motor.csv
         -> {"stage": "Final", "split": "Test", "label": None, "sample": "Sample110", "part": "data_motor"}
      2) ..\\data\\raw\\Final\\Training\\M4_G0_LA0_RA0\\Sample_3\\data_gearbox.csv
         -> {"stage": "Final", "split": "Training", "label": "M4_G0_LA0_RA0", "sample": "Sample_3", "part": "data_gearbox"}
      3) ..\\data\\raw\\Preliminar\\Training\\训练集\\TYPE9\\Sample3\\data_gearbox.csv
         -> {"stage": "Preliminar", "split": "Training", "label": "TYPE9", "sample": "Sample3", "part": "data_gearbox"}
      4) ..\\data\\raw\\Preliminar\\Test\\Test\\测试集\\TYPE10\\Sample2\\data_leftaxlebox.csv
         -> {"stage": "Preliminar", "split": "Test", "label": "TYPE10", "sample": "Sample2", "part": "data_leftaxlebox"}
    """
    # Divide o caminho em partes (suporta / e \)
    parts = re.split(r"[\\/]", file_path)
    parts = [p for p in parts if p]  # remove partes vazias

    # Assume que a estrutura inicia em "data/raw"
    if "raw" in parts:
        raw_idx = parts.index("raw")
    else:
        raise ValueError("Caminho inválido: 'raw' não encontrado.")

    stage = parts[raw_idx + 1] if raw_idx + 1 < len(parts) else None
    split = parts[raw_idx + 2] if raw_idx + 2 < len(parts) else None

    # A partir de raw, as partes seguintes podem ser:
    # Se houver duplicata da pasta split (por exemplo, "Test"), pula-a
    idx = raw_idx + 3
    if idx < len(parts) and parts[idx] == split:
        idx += 1

    # Se a parte iniciar com "Sample", não há label
    if idx < len(parts) and parts[idx].lower().startswith("sample"):
        label = None
        sample = parts[idx]
        idx += 1
    else:
        # Se houver uma pasta extra (por exemplo, "训练集" ou "测试集")
        if idx < len(parts) and parts[idx] in ("训练集", "测试集"):
            idx += 1
        # A próxima parte é o label
        label = parts[idx] if idx < len(parts) else None
        idx += 1
        # A próxima parte é a amostra
        sample = parts[idx] if idx < len(parts) else None
        idx += 1

    # A parte final é o nome do arquivo sem extensão
    part = parts[idx].rsplit(".", 1)[0] if idx < len(parts) else None

    return {
        "stage": stage,
        "split": split,
        "label": label,
        "sample": sample,
        "part": part,
    }


# Teste da função padronizar_caminho:
test_paths = [
    r"..\data\raw\Final\Test\Sample110\data_motor.csv",
    r"..\data\raw\Final\Training\M4_G0_LA0_RA0\Sample_3\data_gearbox.csv",
    r"..\data\raw\Preliminar\Training\训练集\TYPE9\Sample3\data_gearbox.csv",
    r"..\data\raw\Preliminar\Test\Test\测试集\TYPE10\Sample2\data_leftaxlebox.csv",
]

for p in test_paths:
    print(padronizar_caminho(p))

{'stage': 'Final', 'split': 'Test', 'label': None, 'sample': 'Sample110', 'part': 'data_motor'}
{'stage': 'Final', 'split': 'Training', 'label': 'M4_G0_LA0_RA0', 'sample': 'Sample_3', 'part': 'data_gearbox'}
{'stage': 'Preliminar', 'split': 'Training', 'label': 'TYPE9', 'sample': 'Sample3', 'part': 'data_gearbox'}
{'stage': 'Preliminar', 'split': 'Test', 'label': 'TYPE10', 'sample': 'Sample2', 'part': 'data_leftaxlebox'}


In [36]:
def parse_label(label_str: str):
    """
    Converte uma string de rótulo em quatro colunas:
      motor_label, gearbox_label, axle_box_left_label, axle_box_right_label.

    Lógica:
      - Se label_str já estiver no formato "M<number>_G<number>_LA<number>_RA<number>",
        retorna o split desse rótulo.
      - Se label_str contiver "TYPE" (independentemente de caixa), utiliza TYPE_MAP para convertê-lo.
      - Caso contrário, retorna (None, None, None, None).
    """
    if not label_str or label_str.strip() == "":
        return None, None, None, None

    label_str = label_str.strip()
    pattern = r"^M\d+_G\d+_LA\d+_RA\d+$"
    if re.match(pattern, label_str):
        parts = label_str.split("_")
        if len(parts) == 4:
            return parts[0], parts[1], parts[2], parts[3]
        else:
            return None, None, None, None

    if "TYPE" in label_str.upper():
        for key in TYPE_MAP:
            if key.upper() == label_str.upper():
                label_str = TYPE_MAP[key]
                break
        else:
            return None, None, None, None
        parts = label_str.split("_")
        if len(parts) == 4:
            return parts[0], parts[1], parts[2], parts[3]
        else:
            return None, None, None, None

    return None, None, None, None


# Teste parse_label:
print("Ex1:", parse_label("M0_G0_LA0_RA0"))
print("Ex2:", parse_label("TYPE2"))
print("Ex3:", parse_label(""))
print("Ex4:", parse_label("XYZ"))

Ex1: ('M0', 'G0', 'LA0', 'RA0')
Ex2: ('M2', 'G0', 'LA0', 'RA0')
Ex3: (None, None, None, None)
Ex4: (None, None, None, None)


In [37]:
def extract_channels(csv_file: str):
    """
    Lê o cabeçalho do arquivo CSV (primeira linha) e retorna as colunas que começam com "CH".
    """
    try:
        df_temp = pd.read_csv(csv_file, nrows=0)
        cols = df_temp.columns.tolist()
        channels = [col for col in cols if col.upper().startswith("CH")]
        return channels
    except Exception as e:
        print(f"Erro ao ler {csv_file}: {e}")
        return []
    
# Teste extract_channels:
print(extract_channels("../data/raw/Final/Test/Sample110/data_motor.csv"))

['CH1', 'CH2', 'CH3', 'CH4', 'CH5', 'CH6', 'CH7', 'CH8', 'CH9']


In [38]:
def gather_csv_files(base_dir: str) -> list:
    """
    Percorre recursivamente o diretório base_dir e retorna uma lista de caminhos normalizados para os arquivos CSV.
    """
    csv_files = []
    for root, _, files in os.walk(base_dir):
        for f in files:
            if f.endswith(".csv"):
                full_path = os.path.join(root, f)
                norm_path = os.path.normpath(full_path)
                csv_files.append(norm_path)
    return csv_files


# Exemplo de uso:
# all_csv_files = gather_csv_files("../data/raw")
# print(all_csv_files)

In [39]:
def gerar_metadata_df(file_paths: list) -> pd.DataFrame:
    metadata_records = []
    for file_path in file_paths:
        # Verifica se o arquivo existe
        if not os.path.isfile(file_path):
            continue

        info = padronizar_caminho(file_path)
        stage = info.get("stage")
        split = info.get("split")
        label = info.get("label")
        sample = info.get("sample")
        part = info.get("part")

        # Processa o label para obter os 4 sub-rótulos
        motor_lbl, gear_lbl, axl_left_lbl, axl_right_lbl = parse_label(label)

        # Extrai os canais do arquivo CSV
        channels = extract_channels(file_path)
        if channels:
            # Para cada canal, procura no mapeamento específico pela parte.
            # Se não encontrar, tenta buscar em todo o CHANNEL_MAP.
            for ch in channels:
                mapping = None
                if part and part.lower() in CHANNEL_MAP:
                    mapping = CHANNEL_MAP[part.lower()].get(ch)

                # Fallback: se não houve mapeamento específico usando a parte, pesquisa em todos os mapeamentos.
                if not mapping:
                    for sub_map in CHANNEL_MAP.values():
                        if ch in sub_map:
                            mapping = sub_map[ch]
                            break

                if mapping:
                    comp, deploc, sigtype = mapping
                else:
                    comp = deploc = sigtype = None

                metadata_records.append(
                    {
                        "file_path": file_path,
                        "stage": stage,
                        "split": split,
                        "label": label,
                        "motor_label": motor_lbl,
                        "gearbox_label": gear_lbl,
                        "axle_box_left_label": axl_left_lbl,
                        "axle_box_right_label": axl_right_lbl,
                        "sample": sample,
                        "part": part,
                        "channel": ch,
                        "component": comp,
                        "deployment_location": deploc,
                        "signal_type": sigtype,
                    }
                )
        else:
            # Se o CSV não apresentar canais, cria uma única entrada com channel = None
            metadata_records.append(
                {
                    "file_path": file_path,
                    "stage": stage,
                    "split": split,
                    "label": label,
                    "motor_label": motor_lbl,
                    "gearbox_label": gear_lbl,
                    "axle_box_left_label": axl_left_lbl,
                    "axle_box_right_label": axl_right_lbl,
                    "sample": sample,
                    "part": part,
                    "channel": None,
                    "component": None,
                    "deployment_location": None,
                    "signal_type": None,
                }
            )

    return pd.DataFrame(metadata_records)


# Coleta dos caminhos a partir da base raw
all_csv_files = gather_csv_files("../data/raw")
metadata_df = gerar_metadata_df(all_csv_files)

In [40]:
print(metadata_df.head())

                                         file_path  stage split label  \
0  ..\data\raw\Final\Test\Sample1\data_gearbox.csv  Final  Test  None   
1  ..\data\raw\Final\Test\Sample1\data_gearbox.csv  Final  Test  None   
2  ..\data\raw\Final\Test\Sample1\data_gearbox.csv  Final  Test  None   
3  ..\data\raw\Final\Test\Sample1\data_gearbox.csv  Final  Test  None   
4  ..\data\raw\Final\Test\Sample1\data_gearbox.csv  Final  Test  None   

  motor_label gearbox_label axle_box_left_label axle_box_right_label   sample  \
0        None          None                None                 None  Sample1   
1        None          None                None                 None  Sample1   
2        None          None                None                 None  Sample1   
3        None          None                None                 None  Sample1   
4        None          None                None                 None  Sample1   

           part channel component    deployment_location  \
0  data_gearbo

In [41]:
"""
- ..\data\raw\Final\Test\Sample110\data_motor.csv
- ..\data\raw\Final\Training\M4_G0_LA0_RA0\Sample_3\data_gearbox.csv
- ..\data\raw\Preliminar\Training\训练集\TYPE9\Sample3\data_gearbox.csv
-..\data\raw\Preliminar\Test\测试集\TYPE10\Sample2\data_leftaxlebox.csv
"""

  """


'\n- ..\\data\raw\\Final\\Test\\Sample110\\data_motor.csv\n- ..\\data\raw\\Final\\Training\\M4_G0_LA0_RA0\\Sample_3\\data_gearbox.csv\n- ..\\data\raw\\Preliminar\\Training\\训练集\\TYPE9\\Sample3\\data_gearbox.csv\n-..\\data\raw\\Preliminar\\Test\\测试集\\TYPE10\\Sample2\\data_leftaxlebox.csv\n'

In [42]:
def visualizar_metadata(
    df: pd.DataFrame, stage: str = None, split: str = None, label: str = None
) -> pd.DataFrame:
    df_filtrado = df.copy()
    if stage:
        df_filtrado = df_filtrado[df_filtrado["stage"] == stage]
    if split:
        df_filtrado = df_filtrado[df_filtrado["split"] == split]
    if label is not None:
        df_filtrado = df_filtrado[df_filtrado["label"] == label]
    return df_filtrado


# Visualizações de exemplo:

'''# Tipo 1: ..\data\raw\Final\Test\Sample110\data_motor.csv
print("Tipo 1 (Final, Test):")
print(visualizar_metadata(metadata_df, stage="Final", split="Test").head(5))

# Tipo 2: ..\data\raw\Final\Training\M4_G0_LA0_RA0\Sample_3\data_gearbox.csv
print("\nTipo 2 (Final, Training, label M4_G0_LA0_RA0):")
print(
    visualizar_metadata(
        metadata_df, stage="Final", split="Training", label="M4_G0_LA0_RA0"
    ).head(5)
)'''

# Tipo 3: ..\data\raw\Preliminar\Training\训练集\TYPE9\Sample3\data_gearbox.csv
print("\nTipo 3 (Preliminar, Training, label TYPE9):")
print(
    visualizar_metadata(
        metadata_df, stage="Preliminar", split="Training", label="TYPE9"
    ).head(5)
)

# Tipo 4: ..\data\raw\Preliminar\Test\测试集\TYPE10\Sample2\data_leftaxlebox.csv
print("\nTipo 4 (Preliminar, Test, label TYPE10):")
print(
    visualizar_metadata(
        metadata_df, stage="Preliminar", split="Test", label="TYPE10"
    ).head(5)
)


Tipo 3 (Preliminar, Training, label TYPE9):
                                               file_path       stage  \
10962  ..\data\raw\Preliminar\Training\训练集\TYPE9\Samp...  Preliminar   
10963  ..\data\raw\Preliminar\Training\训练集\TYPE9\Samp...  Preliminar   
10964  ..\data\raw\Preliminar\Training\训练集\TYPE9\Samp...  Preliminar   
10965  ..\data\raw\Preliminar\Training\训练集\TYPE9\Samp...  Preliminar   
10966  ..\data\raw\Preliminar\Training\训练集\TYPE9\Samp...  Preliminar   

          split  label motor_label gearbox_label axle_box_left_label  \
10962  Training  TYPE9          M0            G5                 LA0   
10963  Training  TYPE9          M0            G5                 LA0   
10964  Training  TYPE9          M0            G5                 LA0   
10965  Training  TYPE9          M0            G5                 LA0   
10966  Training  TYPE9          M0            G5                 LA0   

      axle_box_right_label   sample          part channel component  \
10962             

  '''# Tipo 1: ..\data\raw\Final\Test\Sample110\data_motor.csv
