In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import csv
import scipy.signal as signal
from scipy.signal import find_peaks
import shutil

def butter_lowpass_filter(data, cutoff, fs, order=5):
    """Low-pass `data` with a digital Butterworth filter, applied in a single forward pass.

    Args:
        data: 1-D sequence of samples to filter.
        cutoff: Cut-off frequency, in the same units as `fs`.
        fs: Sampling frequency of `data`.
        order: Order of the Butterworth design (default 5).

    Returns:
        The filtered samples, same length as `data`.

    Note:
        The filter is applied with `lfilter` (causal, one direction), so the
        output is delayed relative to the input. `butter_highpass_filter` in
        this file uses the zero-phase `filtfilt` instead.
    """
    numerator, denominator = signal.butter(
        N=order, Wn=cutoff, btype='low', analog=False, fs=fs
    )
    return signal.lfilter(numerator, denominator, data)

def butter_highpass_filter(data, cutoff, fs, order=5):
    """High-pass `data` with a digital Butterworth filter, applied forward and backward.

    Args:
        data: 1-D sequence of samples to filter.
        cutoff: Cut-off frequency, in the same units as `fs`.
        fs: Sampling frequency of `data`.
        order: Order of the Butterworth design (default 5).

    Returns:
        The filtered samples, same length as `data`.

    Note:
        `filtfilt` runs the filter in both directions, so the result has no
        phase shift relative to the input.
    """
    numerator, denominator = signal.butter(
        N=order, Wn=cutoff, btype='high', analog=False, fs=fs
    )
    return signal.filtfilt(numerator, denominator, data)

In [24]:
# Data cleaning: filter, crop, and plot the accelerometer recordings
def plot_data(filename):
    """Plot the band-passed accelerometer axes of one recording and shade the detected event window.

    The CSV at `filename` must contain the columns 'ACC_X', 'ACC_Y', 'ACC_Z'
    and 'Time'. Each axis is low-pass filtered (500, order 6) and then
    high-pass filtered (20, order 5). The first peak of the filtered Z axis
    whose height is at least 1000 is used as the pivot; the shaded window runs
    from 0.4*fs samples before the pivot to 1.4*fs samples after it.

    Args:
        filename: Path of the CSV recording to read; also used as the figure title.

    Returns:
        (start_index, end_index): integer sample indices of the shaded window,
        clamped to the valid index range of the recording.

    Raises:
        ValueError: If no peak of the filtered Z axis reaches the threshold.

    Note:
        The sampling rate is estimated as 1 / mean(diff(Time)); the axis label
        assumes Time is in seconds. `scipy.signal.butter` rejects a cut-off that
        is not below fs/2, so the 500 cut-off requires fs > 1000.
    """
    df = pd.read_csv(filename)
    time = df['Time'].to_numpy()

    # Filter parameters
    fs = 1/np.mean(np.diff(time))
    # plt.specgram(ACC_Z, NFFT=1024, Fs=fs, detrend=None)
    cutoff_L,cutoff_H = 500,20
    order_L,order_H = 6,5
    truncate_length = 0

    # Same low-pass -> high-pass chain for every axis.
    axis_names = ['ACC_X', 'ACC_Y', 'ACC_Z']
    filtered = {}
    for data_name in axis_names:
        low_passed = butter_lowpass_filter(df[data_name], cutoff_L, fs, order_L)
        filtered[data_name] = butter_highpass_filter(low_passed, cutoff_H, fs, order_H)[truncate_length:]
    time = time[truncate_length:]

    # Locate the window around the first Z-axis peak above the threshold
    threshold = 1000
    peaks, _ = find_peaks(filtered['ACC_Z'], height=threshold)
    if len(peaks) == 0:
        raise ValueError(
            f"{filename}: no peak with height >= {threshold} found in filtered ACC_Z"
        )
    pivot = int(peaks[0])
    # Clamp to the recording: a negative index would silently wrap around to the
    # end of `time`, and an index past the end would raise when shading the span.
    start_index = max(pivot - int(0.4*fs), 0)
    end_index = min(pivot + int(1.4*fs), len(time) - 1)
    # print("index:",start_index,end_index)

    fig, axes = plt.subplots(3, 1, figsize=(10, 10))

    for ax, data_name in zip(axes, axis_names):
        ax.plot(time, filtered[data_name], label=data_name+' Data')
        ax.axvspan(time[start_index], time[end_index], facecolor='red', alpha=0.2, label='Target Waveform')

        # ax.set_title(data_name)
        ax.set_xlabel("Time (s)")
        ax.set_ylabel(data_name+' (mg)')
        ax.legend()
    plt.suptitle(filename)

    plt.tight_layout()
    plt.show()

    return start_index,end_index
 
    
def crop_data(filename):
    """Crop a recording to the window around its first Z-axis peak and overwrite the file.

    The CSV at `filename` must contain the columns 'ACC_Z' and 'Time'. ACC_Z is
    low-pass filtered (500, order 6) and then high-pass filtered (20, order 5).
    The first peak with height at least 1000 is the pivot; rows from 0.5*fs
    samples before the pivot to 1.5*fs samples after it (inclusive) are kept.

    Args:
        filename: Path of the CSV recording. It is read and then overwritten
            in place with the cropped rows.

    Raises:
        ValueError: If no peak of the filtered ACC_Z reaches the threshold.
            The file is left untouched in that case.

    Warning:
        This is destructive and not idempotent: calling it again on an already
        cropped file crops it again. Keep a backup of the originals.
    """
    df = pd.read_csv(filename)

    # Sampling rate estimated from the mean step of the Time column.
    fs = 1/np.mean(np.diff(df['Time']))

    ACC_filtered_L = butter_lowpass_filter(df['ACC_Z'], cutoff=500, fs=fs, order=6)
    ACC_filtered_H = butter_highpass_filter(ACC_filtered_L, cutoff=20, fs=fs, order=5)

    threshold = 1000
    peaks, _ = find_peaks(ACC_filtered_H, height=threshold)
    if len(peaks) == 0:
        raise ValueError(
            f"{filename}: no peak with height >= {threshold} found in filtered ACC_Z"
        )
    pivot = int(peaks[0])
    # Clamp the start: with a negative start the slice would count from the end
    # of the frame, typically producing an empty result that would then
    # overwrite (and destroy) the recording.
    start_index = max(pivot - int(0.5*fs), 0)
    end_index = pivot + int(1.5*fs)  # slicing past the end is safe

    df_cropped = df.iloc[start_index:end_index + 1]
    df_cropped.to_csv(filename, index=False)
    
def clean_data(labelfile):
    """Back up the 'output' folder, then report the row count of every file listed in `labelfile`.

    Args:
        labelfile: Path of a CSV whose first column holds the path of each
            recording file. The header row is consumed by pandas.

    Side effects:
        Copies 'output' to 'output_backup' if that backup does not exist yet,
        and prints one line per listed file with its row count (the count
        includes the file's header line).
    """
    source_folder = 'output'  # replace with your source folder path
    backup_folder = 'output_backup'  # replace with your backup folder path

    # copytree raises FileExistsError when the destination exists, which made
    # the function fail on every run after the first. An existing backup is
    # kept as-is so that it is never overwritten with already-modified data.
    if os.path.exists(backup_folder):
        print(f"Backup folder '{backup_folder}' already exists; skipping backup")
    else:
        shutil.copytree(source_folder, backup_folder)

    df = pd.read_csv(labelfile)
    for filepath in df.iloc[:, 0]:
        # plot_data(filepath)
        # crop_data(filepath)

        # Check the row count of the (cropped) file to catch bad data
        with open(filepath, 'r', newline='') as csv_file:
            # Count lazily instead of materialising every row in a list.
            row_count = sum(1 for _ in csv.reader(csv_file))
            print(f"{filepath} has {row_count} rows")
    
        
# clean_data('./label_file.csv')

# plot_all_data('./output/20240104_094355.csv')

# crop_data('./test/20240104_091258.csv')
# plot_data('./output_backup/20240104_104247.csv')


output/20240104_091258.csv has 2664 rows
output/20240104_091511.csv has 2647 rows
output/20240104_094232.csv has 2646 rows
output/20240104_094302.csv has 2678 rows
output/20240104_094325.csv has 2668 rows
output/20240104_094355.csv has 2662 rows
output/20240104_094428.csv has 2660 rows
output/20240104_094500.csv has 2636 rows
output/20240104_094524.csv has 2671 rows
output/20240104_094552.csv has 2662 rows
output/20240104_095025.csv has 2650 rows
output/20240104_095103.csv has 2654 rows
output/20240104_095131.csv has 2662 rows
output/20240104_095247.csv has 2644 rows
output/20240104_103510.csv has 2634 rows
output/20240104_103726.csv has 2675 rows
output/20240104_103910.csv has 2650 rows
output/20240104_104247.csv has 2660 rows
output/20240104_104319.csv has 2652 rows
output/20240104_104408.csv has 2663 rows
output/20240104_104623.csv has 2679 rows
output/20240104_104652.csv has 2684 rows
output/20240104_104717.csv has 2672 rows
output/20240104_104832.csv has 2680 rows
output/20240104_

In [19]:
# Remove the files in ./output that are not listed in the label file
def remove_files_not_in_label(tag_file, folder_path):
    """Delete every regular file in `folder_path` whose name is not listed in `tag_file`.

    Args:
        tag_file: Path of a CSV file. Its first row is treated as a header and
            skipped; the first column of each remaining row is a file path.
        folder_path: Directory to clean up. Only its direct entries are
            examined; sub-directories are left alone.

    Side effects:
        Permanently removes files, and prints the label entries, the directory
        listing and the number of files removed.

    Note:
        A file is kept when its name equals the base name of a label entry.
        If the label file lists nothing, every regular file in `folder_path`
        is removed.
    """
    # Read the first column of the label file
    with open(tag_file, 'r', newline='') as tag_csv:
        tag_reader = csv.reader(tag_csv)
        # Skip the header row; the default avoids StopIteration on an empty file
        next(tag_reader, None)
        # Blank lines come back as empty rows and have no first column
        tag_elements = [row[0] for row in tag_reader if row]

    # List every entry of the folder
    folder_files = os.listdir(folder_path)
    print('tag_elements:',tag_elements)
    print('folder_files:',folder_files)

    # Compare exact base names. A substring test would keep e.g. 'p.csv'
    # merely because a label entry ends in 'keep.csv'. Backslashes are
    # normalised so Windows-style label paths also resolve to a base name.
    labelled_names = {
        os.path.basename(tag.strip().replace('\\', '/')) for tag in tag_elements
    }

    count = 0
    for filename in folder_files:
        file_path = os.path.join(folder_path, filename)

        # os.remove cannot delete directories, so skip anything but regular files
        if not os.path.isfile(file_path):
            continue
        if filename in labelled_names:
            continue

        os.remove(file_path)
        # print(f"File '{file_path}' removed.")
        count += 1

    print('count:',count)

# Run the cleanup against the project's label file and output folder.
# WARNING: this call deletes files from `folder_path` as soon as the cell is
# executed; make sure a backup exists before re-running it.
tag_file_path = './label_file.csv'
folder_path = './output/'
remove_files_not_in_label(tag_file_path, folder_path)

tag_elements: ['FilePath', 'output/20240104_091258.csv', 'output/20240104_091511.csv', 'output/20240104_094232.csv', 'output/20240104_094302.csv', 'output/20240104_094325.csv', 'output/20240104_094355.csv', 'output/20240104_094428.csv', 'output/20240104_094500.csv', 'output/20240104_094524.csv', 'output/20240104_094552.csv', 'output/20240104_095025.csv', 'output/20240104_095103.csv', 'output/20240104_095131.csv', 'output/20240104_095247.csv', 'output/20240104_103510.csv', 'output/20240104_103726.csv', 'output/20240104_103910.csv', 'output/20240104_104247.csv', 'output/20240104_104319.csv', 'output/20240104_104408.csv', 'output/20240104_104623.csv', 'output/20240104_104652.csv', 'output/20240104_104717.csv', 'output/20240104_104832.csv', 'output/20240104_104902.csv', 'output/20240104_104948.csv', 'output/20240104_105015.csv', 'output/20240104_105044.csv', 'output/20240104_105108.csv', 'output/20240104_105200.csv', 'output/20240104_105315.csv', 'output/20240104_151321.csv', 'output/20240