In [11]:
import os
import numpy as np
import pandas as pd
import json
from typing import List
from collections import defaultdict

class DataProcessor:
    """Build fixed-length sliding-window samples from per-user CSV recordings.

    For every configured window length the processor cuts the positive user's
    recording into overlapping windows, draws a smaller number of windows from
    the negative and prediction users, and writes each group to a JSON file
    together with a plain-text statistics report.

    All window lengths share one output directory
    (``<output_dir>/processed_data_user<pos_user_id>``); the window length is
    part of every file name so that the runs do not overwrite each other.
    """

    def __init__(self, root_dir: str, output_dir: str, sequence_lengths: List[int], 
                 total_sequence_length: int, slide_ratio: float, pos_neg_ratio: int, 
                 pos_user_id: str, columns: List[str] = None):
        """
        Initialise the data processor.

        Args:
            root_dir: Root directory holding one ``user<id>`` folder per user.
            output_dir: Base directory for all generated files.
            sequence_lengths: Window lengths (data points per sample).
            total_sequence_length: Maximum number of rows used per user.
            slide_ratio: Window step expressed as a fraction of the window length.
            pos_neg_ratio: Number of positive samples per negative sample.
            pos_user_id: ID of the user supplying the positive samples.
            columns: Column names to extract; a default feature set is used
                when omitted.

        Raises:
            ValueError: If ``pos_neg_ratio`` is smaller than 1 (it is used as
                a divisor when sizing the negative / prediction sets).
        """
        if pos_neg_ratio < 1:
            raise ValueError(f"pos_neg_ratio 必须为正整数，当前值: {pos_neg_ratio}")

        self.root_dir = root_dir
        self.base_output_dir = output_dir
        self.sequence_lengths = sequence_lengths
        self.total_sequence_length = total_sequence_length
        self.slide_ratio = slide_ratio
        self.pos_neg_ratio = pos_neg_ratio
        self.pos_user_id = pos_user_id
        
        # Every window length writes into the same directory; file names carry
        # the window length, so the directory only has to be created once.
        shared_dir = os.path.join(output_dir, f"processed_data_user{pos_user_id}")
        if self.sequence_lengths:
            os.makedirs(shared_dir, exist_ok=True)
        self.output_dirs = {seq_len: shared_dir for seq_len in self.sequence_lengths}
        
        if columns is None:
            self.columns = ['x', 'y', 'velocity', 'acceleration', 'curvature',
                          'angle_change', 'x_velocity', 'y_velocity', 
                          'x_acceleration', 'y_acceleration', 'press_duration']
        else:
            self.columns = columns
        
        # Per-window-length counters: "user<id>" holds the number of available
        # windows, keys ending in "_selected" hold the number actually used.
        self.sample_stats = {seq_len: defaultdict(int) for seq_len in sequence_lengths}

    def load_user_data(self, user_id: str) -> pd.DataFrame:
        """
        Load the data file of one user.

        The lexicographically last ``.csv`` file inside
        ``<root_dir>/user<user_id>`` is read and truncated to
        ``total_sequence_length`` rows.

        Args:
            user_id: User ID.

        Returns:
            pd.DataFrame: The user's data.

        Raises:
            FileNotFoundError: If the user directory or a CSV file is missing.
            ValueError: If the file lacks one of the configured columns.
        """
        try:
            user_dir = os.path.join(self.root_dir, f"user{user_id}")
            
            if not os.path.isdir(user_dir):
                raise FileNotFoundError(f"找不到用户 {user_id} 的目录: {user_dir}")
            
            csv_files = [f for f in os.listdir(user_dir) if f.endswith('.csv')]
            
            if not csv_files:
                raise FileNotFoundError(f"用户 {user_id} 的目录中没有CSV文件: {user_dir}")
            
            file_path = os.path.join(user_dir, max(csv_files))
            
            df = pd.read_csv(file_path)
            
            missing_columns = [col for col in self.columns if col not in df.columns]
            if missing_columns:
                raise ValueError(f"数据文件缺少以下列: {missing_columns}")
                
            if len(df) > self.total_sequence_length:
                df = df.iloc[:self.total_sequence_length]
                
            return df
            
        except Exception as e:
            # Report which user failed, then let the caller decide what to do.
            print(f"加载用户 {user_id} 数据时出错: {str(e)}")
            raise

    def create_sequences(self, df: pd.DataFrame, user_id: str, sequence_length: int) -> np.ndarray:
        """
        Cut a user's data into overlapping windows of ``sequence_length`` rows.

        Args:
            df: The user's data; must contain the configured columns.
            user_id: User ID, used as key for the availability statistics.
            sequence_length: Number of rows per window.

        Returns:
            np.ndarray: Array of shape (n_windows, sequence_length, n_features);
            ``n_windows`` is 0 when the data is shorter than one window.
        """
        data = df[self.columns].values
        
        stride = max(1, int(sequence_length * self.slide_ratio))
        
        # Last index at which a complete window still fits.
        last_start = min(self.total_sequence_length, len(data)) - sequence_length
        
        # Gather every window with a single fancy-indexing operation:
        # row i of the index matrix is [start_i, start_i + 1, ...].
        starts = np.arange(0, last_start + 1, stride)
        sequences = data[starts[:, None] + np.arange(sequence_length)]
        
        stats = self.sample_stats.setdefault(sequence_length, defaultdict(int))
        stats[f"user{user_id}"] = len(sequences)
        
        return sequences

    def save_samples(self, samples: np.ndarray, filename: str, sequence_length: int):
        """
        Save samples to a JSON file.

        The file is named
        ``<prefix>_samples_user<pos_user_id>_<sequence_length>.json`` where
        ``<prefix>`` is the part of ``filename`` before the first underscore.
        Nothing is written when ``samples`` is empty.

        Args:
            samples: Array of shape (n_samples, sequence_length, n_features).
            filename: File name prefix.
            sequence_length: Window length.
        """
        if len(samples) == 0:
            print(f"警告: {filename} 没有样本可以保存")
            return
            
        filename_prefix = filename.split('_')[0]
        output_path = os.path.join(
            self.output_dirs[sequence_length], 
            f"{filename_prefix}_samples_user{self.pos_user_id}_{sequence_length}.json"
        )
        
        # Each sample becomes a list of {feature name: value} dicts, one per
        # time step; values are cast to plain floats so json can encode them.
        data = {
            'metadata': {
                'shape': list(samples.shape),
                'feature_names': self.columns,
                'sequence_length': sequence_length
            },
            'samples': [
                [
                    {feature: float(value) for feature, value in zip(self.columns, timestep)}
                    for timestep in sample
                ]
                for sample in samples
            ]
        }
        
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
            
        print(f"已保存 {len(samples)} 个样本到 {output_path}")
        print(f"数据形状: {samples.shape}")

    def load_samples(self, filepath: str) -> np.ndarray:
        """
        Load samples from a JSON file written by ``save_samples``.

        Args:
            filepath: Path of the JSON file.

        Returns:
            np.ndarray: Array of shape (n_samples, sequence_length, n_features).

        Raises:
            ValueError: If the decoded array does not have the shape recorded
                in the file's metadata.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        shape = data['metadata']['shape']
        feature_names = data['metadata']['feature_names']
        
        # Rebuild each sample in the feature order given by the metadata.
        samples_list = [
            np.array(
                [[timestep[feature] for feature in feature_names] for timestep in sample],
                dtype=float,
            ).reshape(len(sample), len(feature_names))
            for sample in data['samples']
        ]
        
        samples = np.array(samples_list)
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if samples.shape != tuple(shape):
            raise ValueError(f"加载的数据形状 {samples.shape} 与预期形状 {shape} 不符")
        
        print(f"已加载数据，形状: {samples.shape}")
        return samples

    def save_sample_stats(self, sequence_length: int):
        """
        Write the sample statistics of one window length to a text file.

        The file is named ``sample_statistics_<sequence_length>.txt`` because
        all window lengths share one output directory; a fixed name would be
        overwritten by every subsequent window length.

        Args:
            sequence_length: Window length whose statistics are written.
        """
        stats_path = os.path.join(
            self.output_dirs[sequence_length],
            f'sample_statistics_{sequence_length}.txt'
        )
        entries = sorted(self.sample_stats[sequence_length].items())
        available = [(key, count) for key, count in entries if not key.endswith('_selected')]
        selected = [(key, count) for key, count in entries if key.endswith('_selected')]

        with open(stats_path, 'w', encoding='utf-8') as f:
            f.write("样本统计信息:\n")
            f.write("-" * 50 + "\n")
            f.write(f"窗口长度: {sequence_length}\n")
            f.write(f"总序列长度: {self.total_sequence_length}\n")
            f.write(f"滑动步长: {max(1, int(sequence_length * self.slide_ratio))}\n\n")
            
            f.write("每个用户可用的样本总数:\n")
            f.writelines(f"{key}: {count}\n" for key, count in available)
            
            f.write("\n实际选择的样本数:\n")
            f.writelines(
                f"{key.replace('_selected', '')}: {count}\n" for key, count in selected
            )

    def _load_cached(self, user_id: str, frame_cache: dict) -> pd.DataFrame:
        """Return the user's data, reading the CSV only on the first request."""
        if user_id not in frame_cache:
            frame_cache[user_id] = self.load_user_data(user_id)
        return frame_cache[user_id]

    def _select_samples(self, sequences: np.ndarray, n_samples: int) -> np.ndarray:
        """Pick ``n_samples`` windows from a non-empty ``sequences`` array.

        Evenly spaced windows are taken when enough are available; otherwise
        windows are drawn at random with replacement.
        """
        if len(sequences) >= n_samples:
            indices = np.linspace(0, len(sequences) - 1, n_samples, dtype=int)
        else:
            indices = np.random.choice(len(sequences), n_samples, replace=True)
        return sequences[indices]

    def _collect_samples(self, user_ids: List[str], n_total: int, sequence_length: int,
                         label: str, frame_cache: dict) -> np.ndarray:
        """Gather roughly ``n_total`` windows spread evenly over ``user_ids``.

        Users whose data is too short for a single window are skipped with a
        warning. The per-user selection counts are recorded in the statistics
        under ``<label>_user<id>_selected``. An empty 3-D array is returned
        when no user contributes any window.
        """
        chunks = []
        if not user_ids:
            print(f"警告: 未提供 {label} 用户ID，无法生成对应样本")
        else:
            samples_per_user = max(1, n_total // len(user_ids))
            for user_id in user_ids:
                df = self._load_cached(user_id, frame_cache)
                sequences = self.create_sequences(df, user_id, sequence_length)
                if len(sequences) == 0:
                    # Nothing to sample from; drawing indices would fail.
                    print(f"警告: 用户 {user_id} 在窗口长度 {sequence_length} 下没有可用序列，已跳过")
                    continue
                selected = self._select_samples(sequences, samples_per_user)
                stats_key = f"{label}_user{user_id}_selected"
                self.sample_stats[sequence_length][stats_key] = len(selected)
                chunks.append(selected)

        if not chunks:
            return np.empty((0, sequence_length, len(self.columns)))
        return np.concatenate(chunks)

    def process_data(self, neg_user_ids: List[str], 
                    pred_user_ids: List[str]) -> dict:
        """
        Generate, save and return the samples for every window length.

        Args:
            neg_user_ids: IDs of the users supplying negative samples.
            pred_user_ids: IDs of the users supplying prediction samples.

        Returns:
            dict: Maps each window length to a dict with the arrays
            ``'positive'``, ``'negative'`` and ``'predict'``. Window lengths
            that yield no positive sample are omitted.
        """
        results = {}
        # Each user's CSV is the same for every window length; read it once.
        frame_cache = {}
        
        for sequence_length in self.sequence_lengths:
            print(f"\n处理窗口长度 {sequence_length}:")
            
            df_pos = self._load_cached(self.pos_user_id, frame_cache)
            positive_samples = self.create_sequences(df_pos, self.pos_user_id, sequence_length)
            
            if len(positive_samples) == 0:
                print(f"警告: 窗口长度 {sequence_length} 未能生成正样本")
                continue

            stats = self.sample_stats[sequence_length]
            # Drop selection counts left over from an earlier call.
            for stale_key in [key for key in stats if key.endswith('_selected')]:
                del stats[stale_key]

            n_positive = len(positive_samples)
            stats[f"positive_user{self.pos_user_id}_selected"] = n_positive
            n_negative = max(1, n_positive // self.pos_neg_ratio)
            n_predict = max(1, n_positive // self.pos_neg_ratio)
            
            negative_samples = self._collect_samples(
                neg_user_ids, n_negative, sequence_length, 'negative', frame_cache
            )
            predict_samples = self._collect_samples(
                pred_user_ids, n_predict, sequence_length, 'predict', frame_cache
            )
            
            self.save_samples(positive_samples, 'positive', sequence_length)
            self.save_samples(negative_samples, 'negative', sequence_length)
            self.save_samples(predict_samples, 'predict', sequence_length)
            
            self.save_sample_stats(sequence_length)
            
            results[sequence_length] = {
                'positive': positive_samples,
                'negative': negative_samples,
                'predict': predict_samples
            }
            
            print(f"窗口长度 {sequence_length} 处理完成:")
            print(f"正样本形状: {positive_samples.shape}")
            print(f"负样本形状: {negative_samples.shape}")
            print(f"预测样本形状: {predict_samples.shape}")
        
        return results

# Usage example
if __name__ == "__main__":
    # Input recordings and destination for the generated sample files.
    root_dir = "D:/datauser/merged_files"
    output_dir = "D:/论文数据/mouse/data"

    # Window lengths 50, 100, ..., 500.
    sequence_lengths = list(range(50, 501, 50))
    total_sequence_length = 98000
    slide_ratio = 0.5
    pos_neg_ratio = 10
    pos_user_id = "23"

    try:
        settings = dict(
            root_dir=root_dir,
            output_dir=output_dir,
            sequence_lengths=sequence_lengths,
            total_sequence_length=total_sequence_length,
            slide_ratio=slide_ratio,
            pos_neg_ratio=pos_neg_ratio,
            pos_user_id=pos_user_id,
        )
        processor = DataProcessor(**settings)

        # Users supplying the negative and the prediction samples.
        neg_user_ids = ["29", "9", "12", "16", "15", "20", "35"]
        pred_user_ids = ["7", "21"]

        results = processor.process_data(neg_user_ids, pred_user_ids)
        print("\n所有窗口长度处理完成")

    except Exception as e:
        # Top-level boundary of the example: report the failure and stop.
        print(f"错误: {str(e)}")


处理窗口长度 50:
已保存 3919 个样本到 D:/论文数据/mouse/data\processed_data_user23\positive_samples_user23_50.json
数据形状: (3919, 50, 11)
已保存 385 个样本到 D:/论文数据/mouse/data\processed_data_user23\negative_samples_user23_50.json
数据形状: (385, 50, 11)
已保存 390 个样本到 D:/论文数据/mouse/data\processed_data_user23\predict_samples_user23_50.json
数据形状: (390, 50, 11)
窗口长度 50 处理完成:
正样本形状: (3919, 50, 11)
负样本形状: (385, 50, 11)
预测样本形状: (390, 50, 11)

处理窗口长度 100:
已保存 1959 个样本到 D:/论文数据/mouse/data\processed_data_user23\positive_samples_user23_100.json
数据形状: (1959, 100, 11)
已保存 189 个样本到 D:/论文数据/mouse/data\processed_data_user23\negative_samples_user23_100.json
数据形状: (189, 100, 11)
已保存 194 个样本到 D:/论文数据/mouse/data\processed_data_user23\predict_samples_user23_100.json
数据形状: (194, 100, 11)
窗口长度 100 处理完成:
正样本形状: (1959, 100, 11)
负样本形状: (189, 100, 11)
预测样本形状: (194, 100, 11)

处理窗口长度 150:
已保存 1305 个样本到 D:/论文数据/mouse/data\processed_data_user23\positive_samples_user23_150.json
数据形状: (1305, 150, 11)
已保存 126 个样本到 D:/论文数据/mouse/data\processed_data