# converting signals into images, bin files(.npz)

このノートブックではディレクトリを指定して、その中にある生波形データ(.mat)を、一括して画像(.png)及びその値（.npz）に変換し保存するという処理を行っています。

## シミュレーションデータ変換  
 シミュレーションで生成した`.mat`のファイルを統一形式である`.npz`に変換します。以下は、フォルダを指定するとその配下のファイルをすべて一括で変換し、`/processed`ディレクトリに保存するという処理を行うものです。

In [2]:
from src import mat2npz_sim,npz2png
import os
import glob

# Define input and output directories
# Define the case name as a variable for clarity and reusability
case_name = "case1"

# Define base data directory for clarity
base_data_dir = "/home/smatsubara/documents/airlift/data/simulation"
visualize_dir = "/home/smatsubara/documents/airlift/data/visualize"
def convert_all_simulation_mat_to_npz(case_name, base_data_dir):
    """
    Convert all simulation .mat files in the specified case directory to .npz format.

    Parameters
    ----------
    case_name : str
        The name of the simulation case (e.g., "case5").
    base_data_dir : str
        The base directory where simulation data is stored.
    """
    # Define input directory for raw simulation signals (relative to base_data_dir and case_name)
    mat_dir = os.path.join(base_data_dir, f"rawsignal/{case_name}/data")

    # Define config file path (relative to base_data_dir and case_name)
    config_path = os.path.join(base_data_dir, f"rawsignal/{case_name}/config.json")

    # Define output directory for processed files (relative to base_data_dir and case_name)
    output_dir = os.path.join(base_data_dir, f"processed/{case_name}")
    # Create the output directory if it does not exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")
    # Save a copy of the config.json file to the output directory for reference
    import shutil
    config_copy_path = os.path.join(output_dir, "config.json")
    shutil.copy2(config_path, config_copy_path)
    print(f"Copied config.json to: {config_copy_path}")

    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)
    mat_files_list = glob.glob(os.path.join(mat_dir, "*.mat"))
    for mat_file in mat_files_list:
        print(f"Processing: {mat_file}")
        mat2npz_sim(mat_file, config_path, output_dir)

# 関数の呼び出し例
convert_all_simulation_mat_to_npz(case_name, base_data_dir)

npz2png(file_path="/home/smatsubara/documents/airlift/data/simulation/processed/case26/solid_liquid_reflector3_processed.npz",save_path=visualize_dir,full=False,pulse_index=0)
npz2png(file_path="/home/smatsubara/documents/airlift/data/simulation/processed/case9/solid_liquid_reflector3_processed.npz",save_path=visualize_dir,full=True,pulse_index=0)

Created output directory: /home/smatsubara/documents/airlift/data/simulation/processed/case1
Copied config.json to: /home/smatsubara/documents/airlift/data/simulation/processed/case1/config.json
Processing: /home/smatsubara/documents/airlift/data/simulation/rawsignal/case1/data/solid_liquid_reflector1.mat
<KeysViewHDF5 ['#refs#', '#subsystem#', 'kgrid', 'sensor_data']>
['Nt', 'Nx', 'Ny', 'Nz', 'dim', 'dt', 'dx', 'dxudxn', 'dxudxn_sgx', 'dy', 'dyudyn', 'dyudyn_sgy', 'dz', 'dzudzn', 'dzudzn_sgz', 'k', 'k_max', 'kx_max', 'kx_vec', 'ky_max', 'ky_vec', 'kz_max', 'kz_vec', 'nonuniform', 'xn_vec', 'xn_vec_sgx', 'yn_vec', 'yn_vec_sgy', 'zn_vec', 'zn_vec_sgz']
999999999.9999999
keys: ['#refs#', '#subsystem#', 'kgrid', 'sensor_data']
['#refs#', '#subsystem#', 'kgrid', 'sensor_data']
(50000,)
Processed data and metadata saved to: /home/smatsubara/documents/airlift/data/simulation/processed/case1/solid_liquid_reflector1_processed.npz
Processing: /home/smatsubara/documents/airlift/data/simulation/r

## 機械学習用データセット生成（シミュレーション）
　次に、変換した`.npz`のファイルに対応する目標変数となる値を`/config.json`を使って計算し、データセットとなる`x_train.npy`,`t_train.npy`を作成していきます。これらのiDの紐づけが狂うとすべての計算の意味がなくなってしまうので、最大限注意してください。  
また、実機への展開をスムーズにするために、最大値を用いてスケーリングしていることとに注して下さい。その他順序付けなど筆者は細心の注意を払って実装していますが、もし誤りがあればご指摘いただけると幸いです。


In [8]:
from src import calculate_gvf_and_signal,npz2png,process_case_and_return_dataset
import numpy as np
import math
import json
import glob
import os
#units are all mm

x_list = []
t_list = []

# Define the base directory containing all cases
processed_base_dir = "/home/smatsubara/documents/airlift/data/simulation/processed"
output_path = "/home/smatsubara/documents/airlift/data/simulation/dataset"
rawsignal_base_dir = "/home/smatsubara/documents/airlift/data/simulation/rawsignal"

# Get all case directories (e.g., case5, case6, ...)
case_dirs = sorted([d for d in os.listdir(processed_base_dir) if os.path.isdir(os.path.join(processed_base_dir, d)) and d.startswith("case")])

x_train_list = []
t_train_list = []

for case_name in case_dirs:
    base_dir = os.path.join(processed_base_dir, case_name)
    csv_dir = os.path.join(rawsignal_base_dir, case_name)
    print(f"Processing {case_name} in {base_dir}")
    x_tmp, t_tmp = process_case_and_return_dataset(case_name, base_dir, csv_dir)
    print(f"x_tmp shape: {x_tmp.shape}, t_tmp shape: {t_tmp.shape}")
    x_train_list.append(x_tmp)
    t_train_list.append(t_tmp)
print("list done")
# Concatenate all cases into single arrays
x_train = np.concatenate(x_train_list, axis=0)
t_train = np.concatenate(t_train_list, axis=0)
x_train = x_train/np.max(x_train)
x_train = np.log1p(x_train)
# x_train = x_train/np.max(x_train)
print("Final x_train shape:", x_train.shape)
print("Final t_train shape:", t_train.shape)
print(np.max(x_train),np.min(x_train))
np.save(os.path.join(output_path, "x_train.npy"), x_train)
np.save(os.path.join(output_path, "t_train.npy"), t_train)
#print(np.max(x_train))
#print(t_train)
#print(x_train)
npz_file_path = "/home/smatsubara/documents/airlift/data/simulation/processed/case5/solid_liquid4_processed.npz"
output_folder_path = "/home/smatsubara/documents/airlift/data/results/"
npz2png(npz_file_path, output_folder_path, channel_index=0, start_time=0.0, end_time=None, full=True, pulse_index=0)
npz2png(npz_file_path, output_folder_path, channel_index=0, start_time=0.0, end_time=None, full=False, pulse_index=0)


Processing case1 in /home/smatsubara/documents/airlift/data/simulation/processed/case1
['/home/smatsubara/documents/airlift/data/simulation/processed/case1/solid_liquid_reflector10_processed.npz', '/home/smatsubara/documents/airlift/data/simulation/processed/case1/solid_liquid_reflector1_processed.npz', '/home/smatsubara/documents/airlift/data/simulation/processed/case1/solid_liquid_reflector2_processed.npz', '/home/smatsubara/documents/airlift/data/simulation/processed/case1/solid_liquid_reflector3_processed.npz']
22233.767230882375 0.030785031114614694
x_tmp shape: (4, 2500), t_tmp shape: (4,)
Processing case10 in /home/smatsubara/documents/airlift/data/simulation/processed/case10
['/home/smatsubara/documents/airlift/data/simulation/processed/case10/solid_liquid_reflector10_processed.npz', '/home/smatsubara/documents/airlift/data/simulation/processed/case10/solid_liquid_reflector1_processed.npz', '/home/smatsubara/documents/airlift/data/simulation/processed/case10/solid_liquid_reflec

In [7]:
from src import calculate_gvf_and_signal

config_path = "/home/smatsubara/documents/airlift/data/simulation/processed/case5/config.json"
npz_path = "/home/smatsubara/documents/airlift/data/simulation/processed/case4/solid_liquid4_processed.npz"
input_tmp, target_tmp = calculate_gvf_and_signal(config_path, npz_path)
print(input_tmp.shape,target_tmp)

(2500,) 0.024076984714003942


## 実機データ変換  
実機データの`.mat`のファイルも、先ほどと同様に統一形式である`.npz`に変換します。同じく、`experiments/processed`に保存されるようにしています。下のものは、サンプルを一つだけ変換するものです。

In [None]:
from src import mat2npz_exp,npz2png
import os
import glob
file_path = "/home/smatsubara/documents/airlift/data/experiments/rawsignal/solid_liquid/P20241016-1117.mat"
output_dir =  "/home/smatsubara/documents/airlift/data/results/exp_sample"
mat2npz_exp(
        file_path=file_path,
        output_dir=output_dir,
        start_time=0.1,  #初期の信号は不安定であることが多いため除外
        duration=5.0,
        amplitude_threshold=2,
        window_width=0.1e-3,
        signal_key="TDX1"
    )

## 複数を一括で変換するスクリプト

In [None]:
from src import mat2npz_exp,npz2png
import os

import glob

# Get all .mat files in the rawsignal directory
output_dir = "/home/smatsubara/documents/airlift/data/experiments/processed"
rawsignal_dir = "/home/smatsubara/documents/airlift/data/experiments/rawsignal/solid_liquid"
mat_files = glob.glob(os.path.join(rawsignal_dir, "*.mat"))


for file_path in mat_files:
    print(f"Processing {file_path}")
    mat2npz_exp(
        file_path=file_path,
        output_dir=output_dir,
        start_time=0.1,  #初期の信号は不安定であることが多いため除外
        duration=5.0,
        amplitude_threshold=2,
        window_width=0.1e-3,
        signal_key="TDX1"
    )


/home/smatsubara/documents/airlift/data/results/exp_sample/P20241016-1117_processed_0pulse45.png


## 機械学習用データセット生成(実機)  
次に、変換した`.npz`のファイルに対応する目標変数となる値を`/target_variables.csv`を使って参照し、データセット`x_test.npy` `t_test.npy`を作成していきます。


In [3]:
import polars as pl

df =pl.read_csv("/home/smatsubara/documents/airlift/data/sandbox/experiments/target_valiables.csv",encoding="shift_jis")


data_path = "/home/smatsubara/documents/airlift/data/experiments/processed/solid_liquid"
# "ガラス球直径"列の右隣に、"P2024{日付}{時分}_processed.npz"という名前列を追加
# コピーを作成し、result_dirに保存

# まず既存のdfをコピー
df_copy = df.clone()

# 新しいカラム名
new_col_name = "ファイル名"

# "P2024" + 日付 + 時分 + "_processed.npz"を作成
df_copy = df_copy.with_columns(
    (pl.lit("P") + df_copy["日付"].cast(pl.Utf8) + "-" + df_copy["時分"].cast(pl.Utf8) + "_processed.npz").alias(new_col_name)
)

# "ガラス球直径"列のすぐ右隣りに挿入
glass_col_idx = df_copy.columns.index("ガラス球直径")
cols = df_copy.columns.copy()
cols.insert(glass_col_idx + 1, cols.pop(-1))  # 新しいカラム(最後にある)を目的位置へ

df_final = df_copy.select(cols)

# 保存
result_dir = "/home/smatsubara/documents/airlift/data/sandbox/results" 
save_path = result_dir.rstrip("/") + "/target_valiables_with_filename.csv"
df_final.write_csv(save_path)

print(f"saved: {save_path}")
# "ファイル名" の右隣に data_path と結合したフルパス列 "FullPath" を追加

# 再読込（または前の df_final を使用）
df_work = df_final.clone()

# "FullPath" を作成
fullpath_col = "FullPath"
df_work = df_work.with_columns(
    (pl.lit(data_path.rstrip("/") + "/") + df_work["ファイル名"]).alias(fullpath_col)
)

# "ガラス球直径" のインデックス取得し、2つ右隣に "FullPath" を移動
glass_idx = df_work.columns.index("ガラス球直径")
# "ファイル名"は1つ右、その隣=2つ右になるよう "FullPath" を移動
cols = df_work.columns.copy()
# 一旦 "FullPath" をpop
cols.pop(cols.index(fullpath_col))
cols.insert(glass_idx + 2, fullpath_col)
df_final2 = df_work.select(cols)

# 保存
save_path2 = result_dir.rstrip("/") + "/target_valiables_with_fullpath.csv"
df_final2.write_csv(save_path2)
print(f"saved: {save_path2}")

df_final2.head()



saved: /home/smatsubara/documents/airlift/data/sandbox/results/target_valiables_with_filename.csv
saved: /home/smatsubara/documents/airlift/data/sandbox/results/target_valiables_with_fullpath.csv


日付,時分,固相見かけ流速,気相見かけ流速,液相見かけ流速,固相体積率,気相体積率,液相体積率,ガラス球直径,ファイル名,FullPath
i64,i64,f64,f64,f64,f64,f64,f64,str,str,str
20240726,1022,0.0,31.922755,6.059601,0.0,0.749158,0.250842,"""-""","""P20240726-1022_processed.npz""","""/home/smatsubara/documents/air…"
20240726,1055,0.0,32.685636,6.101101,0.0,0.745521,0.254479,"""-""","""P20240726-1055_processed.npz""","""/home/smatsubara/documents/air…"
20240726,1113,0.0,32.048131,4.543445,0.0,0.7546132,0.2453868,"""-""","""P20240726-1113_processed.npz""","""/home/smatsubara/documents/air…"
20240726,1122,0.0,31.968982,4.555286,0.0,0.742794,0.257206,"""-""","""P20240726-1122_processed.npz""","""/home/smatsubara/documents/air…"
20240726,1334,0.0,32.881976,3.119066,0.0,0.74734,0.25266,"""-""","""P20240726-1334_processed.npz""","""/home/smatsubara/documents/air…"


In [None]:
import numpy as np

# "FullPath" 列の各ファイルを np.load でロードし 'x_train_real' 配列だけでなく
# 見かけ流速・体積率6項目（固相見かけ流速, 気相見かけ流速, 液相見かけ流速, 固相体積率, 気相体積率, 液相体積率）
# も行ごとに配列化し、それぞれ np.stack でまとめる
import numpy as np

file_paths = df_final2["FullPath"].to_list()
processed_data_list = []
targets_list = []
for i, p in enumerate(file_paths):
    try:
        data_npz = np.load(p)
    except Exception as e:
        print(f"failed to load: {p} ({e})")
        processed_data_list.append(None)
        continue

    if 'processed_data' not in data_npz:
        print(f"'processed_data' is not in {p}")
        processed_data_list.append(None)
        continue

    arr = data_npz['processed_data']
    print(f"{p}: original shape: {arr.shape}")
    # 軸が3つ（例:(4,15000,2500)）なら、axisを(2,0,1)へ
    if arr.ndim == 3:
        arr_T = np.transpose(arr, (2, 0, 1))
        print(f"    transposed shape: {arr_T.shape}")
        processed_data_list.append(arr_T)
        # 固相見かけ流速, 気相見かけ流速, 液相見かけ流速, 固相体積率, 気相体積率, 液相体積率をtargetとしてappend
        # 元データはdf_final2内にある。i番目のデータのtargetsをまとめる
        # dtypeなどに注意してfloatで取る
        try:
            # .ilocはpolarsでは使えないため、polarsからpandasに変換して取得する（またはrowオブジェクトから直接取得）
            # まずrowで取得し、dictなので列名で値を取り出す
            row = df_final2.row(i)
            targets = np.array([
                float(row[df_final2.columns.index("固相見かけ流速")]),
                float(row[df_final2.columns.index("気相見かけ流速")]),
                float(row[df_final2.columns.index("液相見かけ流速")]),
                float(row[df_final2.columns.index("固相体積率")]),
                float(row[df_final2.columns.index("気相体積率")]),
                float(row[df_final2.columns.index("液相体積率")])
            ])
            targets_list.append(targets)
        except Exception as e:
            print(f"failed to load targets for {p} ({e})")
            processed_data_list.append(None)
            continue
        
    else:
        print(f"    processed_data is not 3D (got shape {arr.shape}), skipping")
        processed_data_list.append(None)

# Noneを取り除き、すべて有効ならstack
processed_data_valids = [a for a in processed_data_list if a is not None]
if len(processed_data_valids) == 0:
    print("No valid data could be loaded. combined_data will be None")
    combined_data = None
else:
    combined_data = np.stack(processed_data_valids, axis=0)
    print(f"shape: {combined_data.shape}")

# 次々に結合したい場合、file_pathsをdf_final2["FullPath"]など全件で行えば良い
# ---
# 必要に応じて結合軸(axis=0=バッチ、axis=1=チャンネル結合等)を調整
x_train_real = combined_data
t_train_real = np.stack(targets_list, axis=0)

print(x_train_real.shape)
print(t_train_real.shape)
# x_train_realのshapeは(バッチ数, チャンネル, 14000, 2500)なので、
# メモリエラー回避のため、(N, C, 14000, 2500)→(N, C, 1400, 2500)のダウンサンプリングはサンプル・チャンネルごと（バッチ＆チャンネルごと）にブロック処理で実施する
import numpy as np
from scipy.signal import resample

if x_train_real.shape[2] == 14000:
    N, C, T, S = x_train_real.shape
    x_downsampled = np.empty((N, C, 1400, S), dtype=x_train_real.dtype)
    for n in range(N):
        for c in range(C):
            # 1サンプル1チャンネルずつ(14000, 2500)→(1400, 2500)にメモリ節約しつつresample
            x_downsampled[n, c] = resample(x_train_real[n, c], 1400, axis=0)
        print(f"  Downsampled batch {n+1}/{N}", flush=True)
    x_train_real = x_downsampled
    print(f"Downsampled x_train_real to shape: {x_train_real.shape}")
else:
    print(f"x_train_real shape unexpected for downsampling: {x_train_real.shape}")




In [None]:
np.save("/home/smatsubara/documents/airlift/data/sandbox/results/x_train_real.npy", x_train_real)
np.save ("/home/smatsubara/documents/airlift/data/sandbox/results/t_train_real.npy", t_train_real)