In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Paths
root_dir = r'D:\datauser\training_files'  # directory holding the source files
save_root_dir = r'D:\datauser\NEWNEW_converted_files'  # directory the converted files are written to

# Create the output directory if it does not exist yet
os.makedirs(save_root_dir, exist_ok=True)

# Numeric tolerance: time gaps and distances are compared against this threshold
FLOAT_PRECISION = 1e-10

# Step 1: convert every raw session file into a CSV file.
# Only sub-folders of root_dir whose name starts with 'user' are converted; the
# folder layout is mirrored under save_root_dir.
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    if not (os.path.isdir(folder_path) and folder_name.startswith('user')):
        continue

    # Mirror the user folder inside the output tree
    save_folder_path = os.path.join(save_root_dir, folder_name)
    os.makedirs(save_folder_path, exist_ok=True)

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            continue
        try:
            df = None
            last_error = None
            # 'latin1' and 'ISO-8859-1' name the same codec, so a third attempt
            # could only repeat the failure of the second one.
            for encoding in ['utf-8', 'latin1']:
                try:
                    # sep=None makes the python engine detect the delimiter
                    df = pd.read_csv(file_path, sep=None, engine="python", encoding=encoding)
                    break
                except Exception as read_error:
                    # Keep the reason so it can be reported instead of being lost
                    last_error = read_error

            if df is None:
                raise ValueError(
                    f"Unable to read file with any encoding: {file_path} (last error: {last_error})"
                ) from last_error

            # Save under the same base name with a .csv extension
            csv_file_name = os.path.splitext(file_name)[0] + '.csv'
            csv_file_path = os.path.join(save_folder_path, csv_file_name)
            df.to_csv(csv_file_path, index=False)
            print(f"Converted: {file_path} -> {csv_file_path}")
        except Exception as e:
            # Best effort: report the file and carry on with the next one
            print(f"Error processing {file_path}: {e}")

# Step 2: extract mouse-dynamics features from the converted CSV files.
# Every CSV under save_root_dir/user*/ is read, augmented with feature columns
# and written back to the same path (the file is overwritten in place).
for folder_name in os.listdir(save_root_dir):
    folder_path = os.path.join(save_root_dir, folder_name)
    if os.path.isdir(folder_path) and folder_name.startswith('user'):
        # Visit every CSV file inside the folder
        for file_name in os.listdir(folder_path):
            if file_name.endswith('.csv'):
                file_path = os.path.join(folder_path, file_name)
                try:
                    # Load the CSV file
                    df = pd.read_csv(file_path)

                    # Drop the 'client timestamp' column when it is present
                    if 'client timestamp' in df.columns:
                        df.drop(columns=['client timestamp'], inplace=True)

                    # Among rows sharing a 'record timestamp', keep only the first one
                    duplicate_indices = df[df.duplicated(subset=['record timestamp'], keep='first')].index
                    df = df.drop(index=duplicate_indices).reset_index(drop=True)

                    # Create the new dynamic feature columns, initially all NaN
                    for feature in ['distance', 'velocity', 'acceleration', 'curvature', 'angle_change',
                                  'x_velocity', 'y_velocity', 'x_acceleration', 'y_acceleration']:
                        df[feature] = np.nan

                    # Indices of all rows whose state is 'Move' or 'Drag'
                    target_indices = df.index[df['state'].isin(['Move', 'Drag'])].tolist()

                    # Walk consecutive pairs of those rows; features are stored on the
                    # first row of each pair, so the last Move/Drag row stays NaN.
                    # Rows with other states lying between the two are ignored.
                    for i in range(len(target_indices) - 1):
                        current_index = target_indices[i]
                        next_index = target_indices[i + 1]

                        current_row = df.loc[current_index]
                        next_row = df.loc[next_index]

                        # Time gap to the next Move/Drag row; skip the pair when it is ~0
                        time_diff = next_row['record timestamp'] - current_row['record timestamp']
                        if abs(time_diff) < FLOAT_PRECISION:
                            continue

                        # Displacement (signed components) and its Euclidean length
                        delta_x = next_row['x'] - current_row['x']
                        delta_y = next_row['y'] - current_row['y']
                        distance = np.sqrt(delta_x**2 + delta_y**2)
                        df.at[current_index, 'distance'] = distance

                        # Velocity (signed components)
                        x_velocity = delta_x / time_diff
                        y_velocity = delta_y / time_diff
                        velocity = distance / time_diff  # speed magnitude
                        df.at[current_index, 'velocity'] = velocity
                        df.at[current_index, 'x_velocity'] = x_velocity
                        df.at[current_index, 'y_velocity'] = y_velocity

                        # Acceleration needs a previous Move/Drag row, so skip the first pair
                        if i > 0:
                            prev_index = target_indices[i-1]
                            prev_row = df.loc[prev_index]
                            prev_time_diff = current_row['record timestamp'] - prev_row['record timestamp']

                            if abs(prev_time_diff) > FLOAT_PRECISION:
                                # Velocity of the previous segment
                                prev_x_velocity = (current_row['x'] - prev_row['x']) / prev_time_diff
                                prev_y_velocity = (current_row['y'] - prev_row['y']) / prev_time_diff

                                # Acceleration = velocity change divided by the duration of
                                # the current segment (time_diff)
                                x_acceleration = (x_velocity - prev_x_velocity) / time_diff
                                y_acceleration = (y_velocity - prev_y_velocity) / time_diff
                                acceleration = np.sqrt(x_acceleration**2 + y_acceleration**2)  # acceleration magnitude

                                df.at[current_index, 'acceleration'] = acceleration
                                df.at[current_index, 'x_acceleration'] = x_acceleration
                                df.at[current_index, 'y_acceleration'] = y_acceleration

                        # Angle feature.
                        # NOTE(review): this is arctan2 of the current displacement, i.e. the
                        # direction angle of the segment, not a difference between two
                        # angles - confirm the column name matches the intent.
                        angle_change = np.arctan2(delta_y, delta_x)
                        df.at[current_index, 'angle_change'] = angle_change

                        # Curvature: the angle above divided by the segment length,
                        # left NaN for (near-)zero distances
                        if distance > FLOAT_PRECISION:
                            curvature = angle_change / distance
                            df.at[current_index, 'curvature'] = curvature

                    # Handle rows whose button column is 'Scroll'
                    scroll_indices = df.index[df['button'] == 'Scroll'].tolist()
                    for i in scroll_indices:
                        if i == 0:
                            continue  # skip the first row: there is no previous row to measure a gap from
                        # Gap to the row directly above, whatever its button/state is
                        time_diff = abs(df.loc[i, 'record timestamp'] - df.loc[i - 1, 'record timestamp'])
                        if time_diff > FLOAT_PRECISION:
                            scroll_speed = 1 / time_diff  # scroll speed = reciprocal of the time gap
                        else:
                            scroll_speed = np.nan
                        df.at[i, 'velocity'] = scroll_speed

                        # Choose the filler for the remaining features from the state column.
                        # Both 'Down' and 'Up' give 0; any other state gives NaN.
                        if df.loc[i, 'state'] == 'Down':
                            marker = 0
                        elif df.loc[i, 'state'] == 'Up':
                            marker = 0
                        else:
                            marker = np.nan

                        # Write the filler into every feature column except 'velocity'
                        for feature in ['distance', 'acceleration', 'curvature', 'angle_change',
                                      'x_velocity', 'y_velocity', 'x_acceleration', 'y_acceleration']:
                            df.at[i, feature] = marker

                    # Save the (not yet normalised) data over the input file
                    df.to_csv(file_path, index=False)
                    print(f"Processed and saved: {file_path}")

                except Exception as e:
                    # Any failure (including a missing column) is reported and the file is skipped
                    print(f"Error processing {file_path}: {e}")

# Step 3: gather, for every feature, the global minimum and maximum of its
# absolute values across all converted CSV files.
features = [
    'distance', 'velocity', 'acceleration', 'curvature', 'angle_change',
    'x_velocity', 'y_velocity', 'x_acceleration', 'y_acceleration',
]

# One {'min', 'max'} record per feature, seeded so that any real value replaces it
feature_ranges = {}
for feature in features:
    feature_ranges[feature] = {'min': float('inf'), 'max': float('-inf')}

# First pass: scan every file and widen the ranges
for folder_name in os.listdir(save_root_dir):
    folder_path = os.path.join(save_root_dir, folder_name)
    if not os.path.isdir(folder_path) or not folder_name.startswith('user'):
        continue
    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            df = pd.read_csv(file_path)
            for feature in features:
                # Only non-NaN entries take part
                values = df[feature].dropna()
                if len(values) == 0:
                    continue
                abs_values = np.abs(values)
                file_low, file_high = abs_values.min(), abs_values.max()

                # Fold this file's extremes into the global record
                bounds = feature_ranges[feature]
                bounds['min'] = min(bounds['min'], file_low)
                bounds['max'] = max(bounds['max'], file_high)
        except Exception as e:
            print(f"Error in collecting ranges from {file_path}: {e}")

# Show the collected ranges for a quick sanity check
for feature in features:
    print(f"{feature} absolute value range: min={feature_ranges[feature]['min']}, max={feature_ranges[feature]['max']}")

# Second pass: min-max normalise the absolute value of every feature with the
# global ranges, then put the original sign back. Files are rewritten in place.
for folder_name in os.listdir(save_root_dir):
    folder_path = os.path.join(save_root_dir, folder_name)
    if not os.path.isdir(folder_path) or not folder_name.startswith('user'):
        continue
    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            df = pd.read_csv(file_path)

            for feature in features:
                # Rows holding an actual value for this feature
                mask = df[feature].notna()
                if not mask.any():
                    continue

                # Split each value into sign and magnitude
                original_values = df.loc[mask, feature]
                signs = np.sign(original_values)
                abs_values = np.abs(original_values)

                feature_min = feature_ranges[feature]['min']
                feature_max = feature_ranges[feature]['max']
                span = feature_max - feature_min

                # A non-positive span would mean dividing by zero (or an unset range)
                if span > 0:
                    # Scale the magnitude into [0, 1] and restore the sign
                    normalized_values = (abs_values - feature_min) / span
                    df.loc[mask, feature] = signs * normalized_values

            df.to_csv(file_path, index=False)
            print(f"Normalized and saved: {file_path}")
        except Exception as e:
            print(f"Error in normalizing {file_path}: {e}")

Converted: D:\datauser\training_files\user12\session_2144641057 -> D:\datauser\NEWNEW_converted_files\user12\session_2144641057.csv
Converted: D:\datauser\training_files\user12\session_5265929106 -> D:\datauser\NEWNEW_converted_files\user12\session_5265929106.csv
Converted: D:\datauser\training_files\user12\session_5815391283 -> D:\datauser\NEWNEW_converted_files\user12\session_5815391283.csv
Converted: D:\datauser\training_files\user12\session_7409188284 -> D:\datauser\NEWNEW_converted_files\user12\session_7409188284.csv
Converted: D:\datauser\training_files\user12\session_8872593360 -> D:\datauser\NEWNEW_converted_files\user12\session_8872593360.csv
Converted: D:\datauser\training_files\user12\session_9031593624 -> D:\datauser\NEWNEW_converted_files\user12\session_9031593624.csv
Converted: D:\datauser\training_files\user12\session_9838420452 -> D:\datauser\NEWNEW_converted_files\user12\session_9838420452.csv
Converted: D:\datauser\training_files\user15\session_0205904470 -> D:\dataus

In [2]:
import os
import pandas as pd
import numpy as np

# Paths
input_dir = r'D:\datauser\NEWNEW_converted_files'  # directory holding the already-saved CSV files
output_dir = r'D:\datauser\NEW_PROCESSED_files'  # directory the processed CSV files are written to

# Create the output directory if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# 处理函数
def process_csv(file_path, save_path):
    """Collapse Left/Right button Pressed+Released pairs into single click rows.

    The CSV at ``file_path`` is read and sorted by 'record timestamp'. For each
    of the buttons 'Left' and 'Right', in that order:

    * a 'Pressed' row immediately followed by a 'Released' row of the same
      button at the same (x, y) becomes one 'Pressed&Released' row whose
      'press_duration' is the timestamp difference; its 'velocity' and every
      dynamic feature column that exists are set to 0;
    * a 'Pressed' row followed by a 'Released' row at different coordinates is
      dropped together with that 'Released' row;
    * a 'Pressed' row without a directly following 'Released' row is dropped;
    * every 'Released' row of the button is dropped (it is either merged into
      its 'Pressed' row or has no usable partner).

    The result is written to ``save_path`` without the index and a progress
    line is printed.

    A 'press_duration' column is always present in the output (0 for rows that
    are not merged clicks), so every processed file has the same columns.

    Args:
        file_path: path of the input CSV. It must contain the columns
            'record timestamp', 'button', 'state', 'x' and 'y'.
        save_path: path the processed CSV is written to.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    dynamic_cols = ['distance', 'acceleration', 'curvature', 'angle_change',
                    'x_velocity', 'y_velocity', 'x_acceleration', 'y_acceleration']

    df = pd.read_csv(file_path)

    # Chronological order with a contiguous 0..n-1 index, so i + 1 is "the next row"
    df = df.sort_values(by='record timestamp').reset_index(drop=True)

    # Create the column up front: files without any click then still share the
    # schema of files with clicks.
    if 'press_duration' not in df.columns:
        df['press_duration'] = float('nan')

    for button in ['Left', 'Right']:
        is_button = df['button'] == button
        pressed_rows = df.index[is_button & (df['state'] == 'Pressed')].tolist()

        # Released rows never survive: they are merged into the Pressed row or discarded
        indices_to_remove = set(df.index[is_button & (df['state'] == 'Released')].tolist())

        last_index = len(df) - 1
        for i in pressed_rows:
            has_release = (
                i < last_index
                and df.loc[i + 1, 'state'] == 'Released'
                and df.loc[i + 1, 'button'] == button
            )
            if not has_release:
                # Press without a matching release: treated as junk
                indices_to_remove.add(i)
                continue

            same_position = (
                df.loc[i, 'x'] == df.loc[i + 1, 'x']
                and df.loc[i, 'y'] == df.loc[i + 1, 'y']
            )
            if not same_position:
                # The pointer moved between press and release: drop the pair
                indices_to_remove.add(i)
                continue

            # Merge the pair into the Pressed row
            df.loc[i, 'state'] = 'Pressed&Released'
            df.loc[i, 'press_duration'] = (
                df.loc[i + 1, 'record timestamp'] - df.loc[i, 'record timestamp']
            )
            df.loc[i, 'velocity'] = 0  # special marker value for clicks
            for col in dynamic_cols:
                if col in df.columns:
                    df.loc[i, col] = 0

        # Drop the marked rows and restore a contiguous index before the next button
        df = df.drop(index=sorted(indices_to_remove)).reset_index(drop=True)

    # Rows that are not merged clicks get a duration of 0. Assign the result
    # instead of calling fillna(inplace=True) on the column selection, which is
    # deprecated and has no effect under pandas Copy-on-Write.
    df['press_duration'] = df['press_duration'].fillna(0)

    df.to_csv(save_path, index=False)
    print(f"Processed and saved: {save_path}")


# Run process_csv on every CSV file of every sub-folder of input_dir,
# mirroring the folder layout under output_dir.
for folder_name in os.listdir(input_dir):
    folder_path = os.path.join(input_dir, folder_name)
    if not os.path.isdir(folder_path):
        continue

    save_folder_path = os.path.join(output_dir, folder_name)
    os.makedirs(save_folder_path, exist_ok=True)

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.csv'):
            continue
        input_file_path = os.path.join(folder_path, file_name)
        output_file_path = os.path.join(save_folder_path, file_name)
        process_csv(input_file_path, output_file_path)


Processed and saved: D:\datauser\NEW_PROCESSED_files\user12\session_2144641057.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user12\session_5265929106.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user12\session_5815391283.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user12\session_7409188284.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user12\session_8872593360.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user12\session_9031593624.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user12\session_9838420452.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user15\session_0205904470.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user15\session_1366248436.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user15\session_5657866014.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user15\session_6715291950.csv
Processed and saved: D:\datauser\NEW_PROCESSED_files\user15\session_8694009379.csv
Proc

In [3]:
import os
import pandas as pd
import numpy as np

# Paths
input_dir = r'D:\datauser\NEW_PROCESSED_files'  # directory holding the already-saved CSV files
output_dir = r'D:\datauser\NEW_cleaned_files'  # directory the processed CSV files are written to

# Create the output directory if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# 处理函数
def process_csv(file_path, save_path):
    """Give every 'Scroll' row the coordinates of the row directly above it.

    The CSV at ``file_path`` is read and sorted by 'record timestamp'. Each row
    whose 'button' is 'Scroll' (except a row at position 0, which has no
    predecessor) gets its 'x' and 'y' overwritten with those of the preceding
    row. Rows are handled top to bottom, so a run of consecutive scroll rows
    inherits the position of the last row before the run. The result is
    written to ``save_path`` without the index and a progress line is printed.

    Args:
        file_path: path of the input CSV.
        save_path: path the processed CSV is written to.
    """
    df = pd.read_csv(file_path)

    # Chronological order with a contiguous index, so row - 1 is the previous event
    df = df.sort_values(by='record timestamp').reset_index(drop=True)

    is_scroll = df['button'] == 'Scroll'
    for row in df.index[is_scroll].to_list():
        if row == 0:
            # The very first row has nothing above it to copy from
            continue
        for axis in ('x', 'y'):
            df.at[row, axis] = df.at[row - 1, axis]

    df.to_csv(save_path, index=False)
    print(f"Processed and saved: {save_path}")


# Run process_csv on every CSV file of every sub-folder of input_dir,
# mirroring the folder layout under output_dir.
for folder_name in os.listdir(input_dir):
    folder_path = os.path.join(input_dir, folder_name)
    if not os.path.isdir(folder_path):
        continue

    save_folder_path = os.path.join(output_dir, folder_name)
    os.makedirs(save_folder_path, exist_ok=True)

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.csv'):
            continue
        input_file_path = os.path.join(folder_path, file_name)
        output_file_path = os.path.join(save_folder_path, file_name)
        process_csv(input_file_path, output_file_path)


Processed and saved: D:\datauser\NEW_cleaned_files\user12\session_2144641057.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user12\session_5265929106.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user12\session_5815391283.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user12\session_7409188284.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user12\session_8872593360.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user12\session_9031593624.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user12\session_9838420452.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user15\session_0205904470.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user15\session_1366248436.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user15\session_5657866014.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user15\session_6715291950.csv
Processed and saved: D:\datauser\NEW_cleaned_files\user15\session_8694009379.csv
Processed and saved: D:\data

In [4]:
import os
import pandas as pd
import numpy as np

# Paths
input_dir = r'D:\datauser\NEW_cleaned_files'  # directory holding the already-saved CSV files
output_dir = r'D:\datauser\NEW_fully_cleaned_files'  # directory the processed CSV files are written to

# Create the output directory if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# 处理函数
def process_csv(file_path, save_path):
    """Replace the text labels in 'button' and 'state' with integer codes.

    The CSV at ``file_path`` is read and sorted by 'record timestamp'; the
    'button' and 'state' columns, when present, are mapped through fixed
    lookup tables. A label missing from its table becomes NaN (the behaviour
    of ``Series.map`` with a dict). The result is written to ``save_path``
    without the index and a progress line is printed.

    Args:
        file_path: path of the input CSV.
        save_path: path the processed CSV is written to.
    """
    # Column name -> {text label: integer code}
    label_codes = {
        'button': {'NoButton': 0, 'Left': 1, 'Right': 2, 'Scroll': 3},
        'state': {'Move': 0, 'Drag': 1, 'Down': 2, 'Up': 3, 'Pressed&Released': 4},
    }

    df = pd.read_csv(file_path)
    df = df.sort_values(by='record timestamp').reset_index(drop=True)

    for column, codes in label_codes.items():
        if column in df.columns:
            df[column] = df[column].map(codes)

    df.to_csv(save_path, index=False)
    print(f"Processed and saved: {save_path}")


# Run process_csv on every CSV file of every sub-folder of input_dir,
# mirroring the folder layout under output_dir.
for folder_name in os.listdir(input_dir):
    folder_path = os.path.join(input_dir, folder_name)
    if not os.path.isdir(folder_path):
        continue

    save_folder_path = os.path.join(output_dir, folder_name)
    os.makedirs(save_folder_path, exist_ok=True)

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.csv'):
            continue
        input_file_path = os.path.join(folder_path, file_name)
        output_file_path = os.path.join(save_folder_path, file_name)
        process_csv(input_file_path, output_file_path)


Processed and saved: D:\datauser\NEW_fully_cleaned_files\user12\session_2144641057.csv
Processed and saved: D:\datauser\NEW_fully_cleaned_files\user12\session_5265929106.csv
Processed and saved: D:\datauser\NEW_fully_cleaned_files\user12\session_5815391283.csv
Processed and saved: D:\datauser\NEW_fully_cleaned_files\user12\session_7409188284.csv
Processed and saved: D:\datauser\NEW_fully_cleaned_files\user12\session_8872593360.csv
Processed and saved: D:\datauser\NEW_fully_cleaned_files\user12\session_9031593624.csv
Processed and saved: D:\datauser\NEW_fully_cleaned_files\user12\session_9838420452.csv
Processed and saved: D:\datauser\NEW_fully_cleaned_files\user15\session_0205904470.csv
Processed and saved: D:\datauser\NEW_fully_cleaned_files\user15\session_1366248436.csv
Processed and saved: D:\datauser\NEW_fully_cleaned_files\user15\session_5657866014.csv
Processed and saved: D:\datauser\NEW_fully_cleaned_files\user15\session_6715291950.csv
Processed and saved: D:\datauser\NEW_fully_

In [5]:
import os
import pandas as pd
import numpy as np

# Paths
input_dir = r'D:\datauser\NEW_fully_cleaned_files'  # directory holding the already-saved CSV files
output_dir = r'D:\datauser\NEW_Data_files'  # directory the processed CSV files are written to

# Create the output directory if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# 处理函数
def process_csv(file_path, save_path):
    """Copy a CSV file with every missing value replaced by 0.

    Args:
        file_path: path of the input CSV.
        save_path: path the filled CSV is written to (without the index).
    """
    # Read the file and substitute 0 for each NaN cell in one chain
    filled = pd.read_csv(file_path).fillna(0)

    filled.to_csv(save_path, index=False)
    print(f"Processed and saved: {save_path}")


# Run process_csv on every CSV file of every sub-folder of input_dir,
# mirroring the folder layout under output_dir.
for folder_name in os.listdir(input_dir):
    folder_path = os.path.join(input_dir, folder_name)
    if not os.path.isdir(folder_path):
        continue

    save_folder_path = os.path.join(output_dir, folder_name)
    os.makedirs(save_folder_path, exist_ok=True)

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.csv'):
            continue
        input_file_path = os.path.join(folder_path, file_name)
        output_file_path = os.path.join(save_folder_path, file_name)
        process_csv(input_file_path, output_file_path)


Processed and saved: D:\datauser\NEW_Data_files\user12\session_2144641057.csv
Processed and saved: D:\datauser\NEW_Data_files\user12\session_5265929106.csv
Processed and saved: D:\datauser\NEW_Data_files\user12\session_5815391283.csv
Processed and saved: D:\datauser\NEW_Data_files\user12\session_7409188284.csv
Processed and saved: D:\datauser\NEW_Data_files\user12\session_8872593360.csv
Processed and saved: D:\datauser\NEW_Data_files\user12\session_9031593624.csv
Processed and saved: D:\datauser\NEW_Data_files\user12\session_9838420452.csv
Processed and saved: D:\datauser\NEW_Data_files\user15\session_0205904470.csv
Processed and saved: D:\datauser\NEW_Data_files\user15\session_1366248436.csv
Processed and saved: D:\datauser\NEW_Data_files\user15\session_5657866014.csv
Processed and saved: D:\datauser\NEW_Data_files\user15\session_6715291950.csv
Processed and saved: D:\datauser\NEW_Data_files\user15\session_8694009379.csv
Processed and saved: D:\datauser\NEW_Data_files\user15\session_8

In [6]:
import os
import pandas as pd

# Paths
root_dir = r'D:\datauser\NEW_Data_files'  # directory holding all the user folders
output_dir = r'D:\datauser\NEW_merged_files'  # directory the merged files are written to

# Create the output directory if it does not exist yet
os.makedirs(output_dir, exist_ok=True)

# 处理函数：竖向拼接 CSV 文件
def merge_csv_files(folder_path, save_path):
    """Stack all CSV files of one folder vertically into a single CSV file.

    Files are read in sorted file-name order so the row order of the merged
    file is reproducible (``os.listdir`` returns names in arbitrary order).
    A file that cannot be read is reported and skipped. Frames without rows
    contribute nothing; if no file has any row, the header of the last file
    read is written, and if the folder holds no readable CSV at all an empty
    frame is written.

    Args:
        folder_path: directory whose ``*.csv`` files are merged.
        save_path: path of the merged CSV (written without the index).
    """
    frames = []
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.csv'):
            continue
        file_path = os.path.join(folder_path, file_name)
        try:
            frames.append(pd.read_csv(file_path))
        except Exception as e:
            # Best effort: report the unreadable file and keep merging the others
            print(f"Error processing {file_path}: {e}")

    non_empty = [frame for frame in frames if not frame.empty]
    if non_empty:
        # One concat over the whole list instead of re-copying the growing
        # frame on every iteration
        merged_data = pd.concat(non_empty, ignore_index=True)
    elif frames:
        # Only header-only files were found: keep the last header seen
        merged_data = frames[-1]
    else:
        merged_data = pd.DataFrame()

    merged_data.to_csv(save_path, index=False)
    print(f"Merged and saved: {save_path}")

# Merge the CSV files of every sub-folder of root_dir whose name starts with
# 'user'; each result goes to output_dir/<folder>/<folder>_merged.csv.
for folder_name in os.listdir(root_dir):
    folder_path = os.path.join(root_dir, folder_name)
    if not os.path.isdir(folder_path) or not folder_name.startswith('user'):
        continue

    save_folder_path = os.path.join(output_dir, folder_name)
    os.makedirs(save_folder_path, exist_ok=True)

    # One merged file per user folder
    output_file_path = os.path.join(save_folder_path, f'{folder_name}_merged.csv')
    merge_csv_files(folder_path, output_file_path)


Merged and saved: D:\datauser\NEW_merged_files\user12\user12_merged.csv
Merged and saved: D:\datauser\NEW_merged_files\user15\user15_merged.csv
Merged and saved: D:\datauser\NEW_merged_files\user16\user16_merged.csv
Merged and saved: D:\datauser\NEW_merged_files\user20\user20_merged.csv
Merged and saved: D:\datauser\NEW_merged_files\user21\user21_merged.csv
Merged and saved: D:\datauser\NEW_merged_files\user23\user23_merged.csv
Merged and saved: D:\datauser\NEW_merged_files\user29\user29_merged.csv
Merged and saved: D:\datauser\NEW_merged_files\user35\user35_merged.csv
Merged and saved: D:\datauser\NEW_merged_files\user7\user7_merged.csv
Merged and saved: D:\datauser\NEW_merged_files\user9\user9_merged.csv
