In [1]:
import pandas as pd

import os

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Report the installed pandas version so the environment is recorded in the notebook output.
print(f"Pandas: {pd.__version__}")

Pandas: 1.2.4


In [3]:
# Load the list of audio file paths from csv/filenames.csv.
filenames_df = pd.read_csv(
    filepath_or_buffer=os.path.join("csv", "filenames.csv"),
)

In [4]:
# Preview five random rows. The fixed seed makes the preview reproducible:
# without it, every "Restart & Run All" displays a different set of rows.
filenames_df.sample(n=5, random_state=0)

Unnamed: 0,filename
3217,pump\train\normal_id_04_00000550.wav
2027,pump\train\normal_id_02_00000265.wav
1859,pump\train\normal_id_02_00000097.wav
3789,pump\train\normal_id_06_00000520.wav
2918,pump\train\normal_id_04_00000251.wav


In [5]:
# Split every file path into its metadata fields.
#
# Named capture groups label the resulting columns directly. The earlier
# positional rename also mapped a sixth group (5 -> "ext") that the pattern
# never captured, so that entry was dead and has been dropped.
#
# Pattern notes:
# - the separator between path components is matched as "\" or "/" rather
#   than by an any-character wildcard;
# - the clip number has eight digits, of which only the last four are kept
#   as audio_id.
machines_df = filenames_df["filename"].str.extract(
    r"(?P<machine_type>pump)[\\/]"
    r"(?P<split>train|test)[\\/]"
    r"(?P<label>normal|anomaly)_id_(?P<machine_id>\d{2})_\d{4}(?P<audio_id>\d{4})",
    expand=True,
)

# str.extract yields an all-NaN row for any filename that does not match;
# fail loudly here instead of letting missing values flow into later cells.
assert not machines_df.isna().to_numpy().any(), (
    "some filenames did not match the expected "
    "<machine>/<split>/<label>_id_<NN>_<NNNNNNNN> pattern"
)

In [6]:
# Preview five random rows of the extracted fields. The fixed seed keeps the
# preview stable across re-runs instead of showing different rows each time.
machines_df.sample(n=5, random_state=0)

Unnamed: 0,machine_type,split,label,machine_id,audio_id
3558,pump,train,normal,6,289
3766,pump,train,normal,6,497
4074,pump,train,normal,6,805
1561,pump,train,normal,0,705
1176,pump,train,normal,0,320


In [7]:
# Print a summary of the frame: column names, non-null counts and dtypes.
machines_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4205 entries, 0 to 4204
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   machine_type  4205 non-null   object
 1   split         4205 non-null   object
 2   label         4205 non-null   object
 3   machine_id    4205 non-null   object
 4   audio_id      4205 non-null   object
dtypes: object(5)
memory usage: 164.4+ KB


In [8]:
# Per-column memory footprint in bytes (index included); deep=True inspects
# the contents of object columns rather than counting only the pointers.
machines_df.memory_usage(index=True, deep=True)

Index              128
machine_type    256505
split           259854
label           265371
machine_id      248095
audio_id        256505
dtype: int64

In [9]:
# Cast every extracted column to the pandas "category" dtype.
dct_types = dict.fromkeys(
    ["machine_type", "split", "label", "machine_id", "audio_id"],
    "category",
)
machines_df = machines_df.astype(dct_types)

In [10]:
# Show the dtype of each column to confirm the cast to category took effect.
machines_df.dtypes

machine_type    category
split           category
label           category
machine_id      category
audio_id        category
dtype: object

In [11]:
# Preview five random rows after the dtype cast. A fixed seed makes the
# preview reproducible across re-runs; an unseeded sample shows different
# rows every time the notebook is executed.
machines_df.sample(n=5, random_state=0)

Unnamed: 0,machine_type,split,label,machine_id,audio_id
2861,pump,train,normal,4,194
1178,pump,train,normal,0,322
2627,pump,train,normal,2,865
2016,pump,train,normal,2,254
2172,pump,train,normal,2,410


In [12]:
# Per-column memory footprint in bytes (index included), measured with deep
# introspection.
machines_df.memory_usage(index=True, deep=True)

Index             128
machine_type     4374
split            4436
label            4440
machine_id       4613
audio_id        98570
dtype: int64

In [13]:
# NOTE(review): this cell repeats the memory_usage(deep=True) call of the
# preceding cell with no change to machines_df in between — candidate for removal.
machines_df.memory_usage(deep=True)

Index             128
machine_type     4374
split            4436
label            4440
machine_id       4613
audio_id        98570
dtype: int64

In [14]:
# Count recordings per machine_id, broken down by (split, label).
# observed=True keeps only category combinations that actually occur;
# fill_value=0 replaces missing cells with zero.
pd.pivot_table(
    machines_df[["machine_id", "split", "label", "audio_id"]],
    values="audio_id",
    index=["machine_id"],
    columns=["split", "label"],
    aggfunc="count",
    fill_value=0,
    observed=True,
)

split,test,test,train
label,anomaly,normal,normal
machine_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,143,100,906
2,111,100,905
4,100,100,602
6,102,100,936


In [15]:
# Write the parsed metadata to csv/machines.csv, omitting the row index.
machines_df.to_csv(
    path_or_buf=os.path.join("csv", "machines.csv"),
    index=False,
)