In [3]:
import numpy as np  # 用于数学计算和数组操作
import pandas as pd  # 用于数据处理和分析
import matplotlib.pyplot as plt  # 用于数据可视化
import polars as pl  # 高性能数据处理库，类似于pandas
import datetime  # 用于处理日期和时间
from tqdm import tqdm  # 用于显示进度条
import plotly.express as px  # 用于交互式数据可视化
from plotly.subplots import make_subplots  # 用于创建子图
import plotly.graph_objects as go  # 用于创建更复杂的图表
from metric import score  # 导入自定义的评分函数
from sklearn.model_selection import train_test_split  # 用于划分训练集和测试集

tolerances = {
    'onset': [12, 36, 60, 90, 120, 150, 180, 240, 300, 360],  # 睡眠开始事件的时间容忍度（分钟）
    'wakeup': [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]  # 睡眠结束事件的时间容忍度（分钟）
}

In [4]:
# 定义时间格式常量
TIME_FORMAT = "%Y-%m-%dT%H:%M:%S%z"  # 这个格式可以处理 -0400, -0500, +0530 等各种时区偏移

# 创建时间转换表达式
timestamp_expr = pl.col('timestamp').str.to_datetime(format=TIME_FORMAT, time_zone='UTC')

dt_transforms = [
    timestamp_expr.alias('timestamp'),  # 转换为UTC时区的datetime
    (timestamp_expr.dt.year() - 2000).cast(pl.UInt8).alias('year'),  # 提取年份（减去2000以节省空间）
    timestamp_expr.dt.month().cast(pl.UInt8).alias('month'),  # 提取月份
    timestamp_expr.dt.day().cast(pl.UInt8).alias('day'),  # 提取日期
    timestamp_expr.dt.hour().cast(pl.UInt8).alias('hour')  # 提取小时
]

data_transforms = [
    pl.col('anglez').cast(pl.Int16), # 将anglez转换为16位整数以节省空间
    (pl.col('enmo')*1000).cast(pl.UInt16), # 将enmo乘以1000并转换为16位无符号整数
]

# 读取训练数据（使用lazy loading提高效率）
train_series = pl.scan_parquet('/home/zhuangzhuohan/sleep_data/train_series.parquet').with_columns(
    dt_transforms + data_transforms
    )

# 读取训练事件数据
train_events = pl.read_csv('/home/zhuangzhuohan/sleep_data/train_events.csv').with_columns(
    dt_transforms
    ).drop_nulls()  # 删除空值

# 读取测试数据（使用lazy loading提高效率）
test_series = pl.scan_parquet('/home/zhuangzhuohan/sleep_data/test_series.parquet').with_columns(
    dt_transforms + data_transforms
    )

# 移除事件数量不匹配的夜晚（确保每个onset对应一个wakeup）
mismatches = train_events.drop_nulls().group_by(['series_id', 'night']).agg([
    ((pl.col('event') == 'onset').sum() == (pl.col('event') == 'wakeup').sum()).alias('balanced')
    ]).sort(by=['series_id', 'night']).filter(~pl.col('balanced'))

for mm in mismatches.to_numpy(): 
    train_events = train_events.filter(~((pl.col('series_id') == mm[0]) & (pl.col('night') == mm[1])))

# 获取唯一的series_id列表
series_ids = train_events['series_id'].unique(maintain_order=True).to_list()

# 更新train_series，只保留有事件数据的series_id
train_series = train_series.filter(pl.col('series_id').is_in(series_ids))

In [5]:
features, feature_cols = [pl.col('hour')], ['hour']  # 初始化特征列表，先加入小时特征

# 为不同时间窗口创建特征
for mins in [5, 30, 60*2, 60*8] :  # 5分钟、30分钟、2小时、8小时
    
    for var in ['enmo', 'anglez'] :  # 对enmo和anglez两个变量创建特征
        
        # 创建基础统计特征
        features += [
            # 计算滚动平均值（绝对值）
            pl.col(var).rolling_mean(12 * mins, center=True, min_samples=1).abs().cast(pl.UInt16).alias(f'{var}_{mins}m_mean'),
            # 计算滚动最大值（绝对值）
            pl.col(var).rolling_max(12 * mins, center=True, min_samples=1).abs().cast(pl.UInt16).alias(f'{var}_{mins}m_max'),
            # 计算滚动标准差（绝对值）
            pl.col(var).rolling_std(12 * mins, center=True, min_samples=1).abs().cast(pl.UInt16).alias(f'{var}_{mins}m_std')
        ]
        
        # 更新特征列名列表
        feature_cols += [ 
            f'{var}_{mins}m_mean', f'{var}_{mins}m_max', f'{var}_{mins}m_std'
        ]
        
        # 创建一阶差分特征（衡量变化率）
        features += [
            # 计算一阶差分的滚动平均值（绝对值）
            (pl.col(var).diff().abs().rolling_mean(12 * mins, center=True, min_samples=1)*10).abs().cast(pl.UInt32).alias(f'{var}_1v_{mins}m_mean'),
            # 计算一阶差分的滚动最大值（绝对值）
            (pl.col(var).diff().abs().rolling_max(12 * mins, center=True, min_samples=1)*10).abs().cast(pl.UInt32).alias(f'{var}_1v_{mins}m_max'),
            # 计算一阶差分的滚动标准差（绝对值）
            (pl.col(var).diff().abs().rolling_std(12 * mins, center=True, min_samples=1)*10).abs().cast(pl.UInt32).alias(f'{var}_1v_{mins}m_std')
        ]
        
        # 更新特征列名列表
        feature_cols += [ 
            f'{var}_1v_{mins}m_mean', f'{var}_1v_{mins}m_max', f'{var}_1v_{mins}m_std'
        ]

id_cols = ['series_id', 'step', 'timestamp']  # 标识列

# 应用特征变换到训练数据
train_series = train_series.with_columns(
    features
).select(id_cols + feature_cols)  # 只保留需要的列

# 应用特征变换到测试数据
test_series = test_series.with_columns(
    features
).select(id_cols + feature_cols)

In [6]:
def make_train_dataset(train_data, train_events, drop_nulls=False):
    """
    创建训练数据集的改进版本，修复了一些问题
    
    参数:
    train_data: 训练时间序列数据
    train_events: 训练事件数据
    drop_nulls: 是否删除没有事件记录的日期数据
    
    返回:
    X: 特征矩阵
    y: 标签向量
    """
    
    series_ids = train_data['series_id'].unique(maintain_order=True).to_list()
    X, y = pl.DataFrame(), pl.DataFrame()  # 初始化特征和标签数据框
    
    for idx in tqdm(series_ids):  # 遍历每个series_id
        
        # 标准化样本特征
        sample = train_data.filter(pl.col('series_id') == idx).with_columns(
            [(pl.col(col) / pl.col(col).std()).cast(pl.Float32) for col in feature_cols if col != 'hour']
        )
        
        events = train_events.filter(pl.col('series_id') == idx)  # 获取当前series_id的事件数据
        
        if drop_nulls:
            # 移除没有事件记录的日期的数据点
            sample = sample.filter(
                pl.col('timestamp').dt.date().is_in(events['timestamp'].dt.date())
            )
        
        X = X.vstack(sample[id_cols + feature_cols])  # 添加特征数据
        
        # 修复：使用is_not_null()检查空值
        onsets = events.filter((pl.col('event') == 'onset') & (pl.col('step').is_not_null()))['step'].to_list()
        wakeups = events.filter((pl.col('event') == 'wakeup') & (pl.col('step').is_not_null()))['step'].to_list()
        
        # 修复：使用pl.sum_horizontal替代sum，并添加错误处理
        if onsets and wakeups and len(onsets) == len(wakeups):
            conditions = [(onset <= pl.col('step')) & (pl.col('step') <= wakeup) for onset, wakeup in zip(onsets, wakeups)]
            y = y.vstack(sample.with_columns(
                pl.sum_horizontal(conditions).cast(pl.Boolean).alias('asleep')
            ).select('asleep'))
        else:
            # 如果没有有效的睡眠区间，创建全为False的列
            y = y.vstack(sample.with_columns(
                pl.lit(False).alias('asleep')
            ).select('asleep'))
    
    y = y.to_numpy().ravel()  # 将标签转换为一维数组
    
    return X, y

In [7]:
def get_events(series, classifier) :
    '''
    将分类器的预测结果转换为睡眠事件（onset和wakeup），并生成提交格式的数据框
    
    参数:
    series: 时间序列数据
    classifier: 训练好的分类器模型
    
    返回:
    events: 包含预测事件的DataFrame，格式符合提交要求
    '''
    
    series_ids = series['series_id'].unique(maintain_order=True).to_list()
    events = pl.DataFrame(schema={'series_id':str, 'step':int, 'event':str, 'score':float})  # 初始化事件数据框

    for idx in tqdm(series_ids) :  # 遍历每个series_id，显示进度条

        # 准备数据并标准化特征
        scale_cols = [col for col in feature_cols if (col != 'hour') & (series[col].std() !=0)]
        X = series.filter(pl.col('series_id') == idx).select(id_cols + feature_cols).with_columns(
            [(pl.col(col) / series[col].std()).cast(pl.Float32) for col in scale_cols]
        )

        # 使用分类器进行预测，获取类别和概率
        preds, probs = classifier.predict(X[feature_cols]), classifier.predict_proba(X[feature_cols])[:, 1]

        # 将预测结果添加到数据框
        X = X.with_columns(
            pl.lit(preds).cast(pl.Int8).alias('prediction'), 
            pl.lit(probs).alias('probability')
                        )
        
        # 检测睡眠开始和结束事件（通过预测值的变化）
        pred_onsets = X.filter(X['prediction'].diff() > 0)['step'].to_list()  # 从0变为1的点为onset
        pred_wakeups = X.filter(X['prediction'].diff() < 0)['step'].to_list()  # 从1变为0的点为wakeup
        
        if len(pred_onsets) > 0 : 
            
            # 确保所有预测的睡眠周期都有开始和结束
            if min(pred_wakeups) < min(pred_onsets) : 
                pred_wakeups = pred_wakeups[1:]  # 移除第一个wakeup（如果它在第一个onset之前）

            if max(pred_onsets) > max(pred_wakeups) :
                pred_onsets = pred_onsets[:-1]  # 移除最后一个onset（如果它在最后一个wakeup之后）

            # 只保留持续时间超过30分钟的睡眠周期
            sleep_periods = [(onset, wakeup) for onset, wakeup in zip(pred_onsets, pred_wakeups) if wakeup - onset >= 12 * 30]

            for onset, wakeup in sleep_periods :
                # 计算睡眠周期内的平均概率作为分数
                score = X.filter((pl.col('step') >= onset) & (pl.col('step') <= wakeup))['probability'].mean()

                # 将睡眠事件添加到数据框
                events = events.vstack(pl.DataFrame().with_columns(
                    pl.Series([idx, idx]).alias('series_id'), 
                    pl.Series([onset, wakeup]).alias('step'),
                    pl.Series(['onset', 'wakeup']).alias('event'),
                    pl.Series([score, score]).alias('score')
                ))

    # 添加行ID列
    events = events.to_pandas().reset_index().rename(columns={'index':'row_id'})

    return events

In [8]:
# 1. 导入必要的库
from sklearn.model_selection import train_test_split

# 2. 定义列名映射（用于评分函数）
column_names = {
    'series_id_column_name': 'series_id',
    'time_column_name': 'step',
    'event_column_name': 'event',
    'score_column_name': 'score',
}

# 3. 划分训练集和验证集（70%训练，30%验证）
train_ids, val_ids = train_test_split(series_ids, train_size=0.7, random_state=42)

# 4. 收集训练数据，每5分钟取一个数据点（减少数据量）
train_data = train_series.filter(pl.col('series_id').is_in(train_ids)).collect()
# 转换为pandas DataFrame后使用切片方法
train_data = train_data.to_pandas().iloc[::(12 * 5)]  # 每5分钟（12*5步）取一个数据点
train_data = pl.from_pandas(train_data)  # 转回polars DataFrame

# 创建训练事件数据（只包含训练集的事件）
train_solution_series_id = train_events.filter(pl.col('series_id').is_in(train_ids))
train_solution = train_events.filter(pl.col('series_id').is_in(train_ids)).select(['series_id', 'event', 'step']).to_pandas()

# 统计真实的onset和wakeup事件数量
train_solution_onset_count = len(train_solution[train_solution['event'] == 'onset'])
train_solution_wakeup_count = len(train_solution[train_solution['event'] == 'wakeup'])
print(f"train_solution的onset事件数量: {train_solution_onset_count}")
print(f"train_solution的wakeup事件数量: {train_solution_wakeup_count}")

# 5. 创建验证数据
val_data = train_series.filter(pl.col('series_id').is_in(val_ids)).collect()

# 6. 创建验证标签（用于评估模型性能）
val_solution = train_events.filter(pl.col('series_id').is_in(val_ids)).select(['series_id', 'event', 'step']).to_pandas()

# 统计真实的onset和wakeup事件数量
val_solution_onset_count = len(val_solution[val_solution['event'] == 'onset'])
val_solution_wakeup_count = len(val_solution[val_solution['event'] == 'wakeup'])
print(f"val_solution的onset事件数量: {val_solution_onset_count}")
print(f"val_solution的wakeup事件数量: {val_solution_wakeup_count}")

train_solution的onset事件数量: 3416
train_solution的wakeup事件数量: 3416
val_solution的onset事件数量: 1374
val_solution的wakeup事件数量: 1374


In [9]:
# 统计验证数据并保存到CSV文件
import pandas as pd
import polars as pl

print("开始统计验证数据...")

# 1. 统计每个series_id的详细信息（保存到val_num.csv）
results = []

# 预处理：将val_solution按series_id分组，提取onset和wakeup
series_events = {}
for series_id in val_ids:
    # 获取当前series_id的事件
    events = val_solution[val_solution['series_id'] == series_id]
    
    # 提取onset和wakeup的step值并排序
    onset_steps = sorted(events[events['event'] == 'onset']['step'].tolist())
    wakeup_steps = sorted(events[events['event'] == 'wakeup']['step'].tolist())
    
    # 确保onset和wakeup成对
    sleep_periods = []
    min_len = min(len(onset_steps), len(wakeup_steps))
    for i in range(min_len):
        if onset_steps[i] < wakeup_steps[i]:
            sleep_periods.append((onset_steps[i], wakeup_steps[i]))
    
    series_events[series_id] = {
        'onset_count': len(onset_steps),
        'wakeup_count': len(wakeup_steps),
        'sleep_periods': sleep_periods
    }

# 处理每个series_id
for series_id in val_ids:
    # 获取当前series_id的val_data数据点
    series_data = val_data.filter(pl.col('series_id') == series_id)
    total_points = len(series_data)
    
    if total_points > 0:
        # 获取预计算的事件信息
        event_info = series_events.get(series_id, {
            'onset_count': 0,
            'wakeup_count': 0,
            'sleep_periods': []
        })
        
        onset_count = event_info['onset_count']
        wakeup_count = event_info['wakeup_count']
        sleep_periods = event_info['sleep_periods']
        
        # 优化：批量处理数据点
        if sleep_periods:
            # 转换为pandas以提高处理速度
            series_data_pd = series_data.to_pandas()
            
            # 定义判断函数
            def is_asleep(step):
                for onset, wakeup in sleep_periods:
                    if onset <= step <= wakeup:
                        return True
                return False
            
            # 批量应用
            asleep_mask = series_data_pd['step'].apply(is_asleep)
            asleep_count = asleep_mask.sum()
            awake_count = total_points - asleep_count
        else:
            # 无睡眠周期，全部为清醒
            asleep_count = 0
            awake_count = total_points
        
        # 计算比例
        awake_ratio = (awake_count / total_points) * 100 if total_points > 0 else 0
        asleep_ratio = (asleep_count / total_points) * 100 if total_points > 0 else 0
        
        # 添加到结果列表
        results.append({
            'series_id': series_id,
            'total_points': total_points,
            'awake_count': awake_count,
            'asleep_count': asleep_count,
            'awake_ratio': awake_ratio,
            'asleep_ratio': asleep_ratio,
            'onset_count': onset_count,
            'wakeup_count': wakeup_count,
            'sleep_periods_count': len(sleep_periods)
        })

# 转换为DataFrame并保存
results_df = pd.DataFrame(results)
output_path1 = 'val_num.csv'
results_df.to_csv(output_path1, index=False)

print(f"系列统计完成，结果已保存到 {output_path1}")

# 2. 统计每个onset和wakeup事件的详细信息（保存到val_data.csv）
print("\n开始统计事件详细信息...")

# 获取val_ids对应的完整事件数据（包含night和timestamp）
# 注意：需要从原始的train_events中获取，因为val_solution可能只包含部分列
val_events_full = train_events.filter(pl.col('series_id').is_in(val_ids)).to_pandas()

# 选择需要的列
val_events_selected = val_events_full[['series_id', 'night', 'event', 'step', 'timestamp']]

# 保存到CSV
output_path2 = 'val_data.csv'
val_events_selected.to_csv(output_path2, index=False)

print(f"事件详细信息统计完成，结果已保存到 {output_path2}")

# 3. 显示总体统计
print("总体统计：")

if not results_df.empty:
    total_awake = results_df['awake_count'].sum()
    total_asleep = results_df['asleep_count'].sum()
    total_points = results_df['total_points'].sum()
    
    total_awake_ratio = (total_awake / total_points) * 100 if total_points > 0 else 0
    total_asleep_ratio = (total_asleep / total_points) * 100 if total_points > 0 else 0
    
    total_onset = results_df['onset_count'].sum()
    total_wakeup = results_df['wakeup_count'].sum()
    total_sleep_periods = results_df['sleep_periods_count'].sum()
    
    print(f"验证集数据点总数: {total_points}")
    print(f"验证集清醒: {total_awake} ({total_awake_ratio:.2f}%)")
    print(f"验证集睡眠: {total_asleep} ({total_asleep_ratio:.2f}%)")
    print(f"验证集真实onset事件总数: {total_onset}")
    print(f"验证集真实wakeup事件总数: {total_wakeup}")
    print(f"验证集真实睡眠周期总数: {total_sleep_periods}")
    
    # 事件详细信息统计
    event_count = len(val_events_selected)
    onset_event_count = len(val_events_selected[val_events_selected['event'] == 'onset'])
    wakeup_event_count = len(val_events_selected[val_events_selected['event'] == 'wakeup'])
    
    print(f"\n事件详细信息统计：")
    print(f"验证集事件总数: {event_count}")
    print(f"验证集onset事件数: {onset_event_count}")
    print(f"验证集wakeup事件数: {wakeup_event_count}")
else:
    print("没有数据可供统计")

开始统计验证数据...
系列统计完成，结果已保存到 val_num.csv

开始统计事件详细信息...
事件详细信息统计完成，结果已保存到 val_data.csv
总体统计：
验证集数据点总数: 37902600
验证集清醒: 29428818 (77.64%)
验证集睡眠: 8473782 (22.36%)
验证集真实onset事件总数: 1374
验证集真实wakeup事件总数: 1374
验证集真实睡眠周期总数: 1374

事件详细信息统计：
验证集事件总数: 2748
验证集onset事件数: 1374
验证集wakeup事件数: 1374


In [10]:
# 创建训练数据集
X_train, y_train = make_train_dataset(train_data, train_solution_series_id)

100%|██████████| 188/188 [00:00<00:00, 202.94it/s]


In [11]:
from sklearn.ensemble import RandomForestClassifier

# 初始化随机森林分类器
rf_classifier = RandomForestClassifier(random_state=42)

# 训练分类器（设置超参数）
rf_classifier = RandomForestClassifier(n_estimators=500,  # 500棵树
                                    min_samples_leaf=25,  # 每个叶节点最少25个样本
                                    random_state=42,  # 随机种子，保证结果可重现
                                    n_jobs=-1)  # 使用所有CPU核心

# 拟合模型
rf_classifier.fit(X_train[feature_cols], y_train)

0,1,2
,n_estimators,500
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,25
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
# 绘制特征重要性图
px.bar(x=feature_cols, 
       y=rf_classifier.feature_importances_,  # 随机森林模型的特征重要性得分
       title='Random forest feature importances'
      )

In [13]:
# 在验证集上检查模型性能
rf_submission = get_events(val_data, rf_classifier)  # 生成验证集的预测事件

# 统计onset和wakeup事件数量
onset_count = len(rf_submission[rf_submission['event'] == 'onset'])
wakeup_count = len(rf_submission[rf_submission['event'] == 'wakeup'])
print(f"预测的onset事件数量: {onset_count}")
print(f"预测的wakeup事件数量: {wakeup_count}")

# 计算模型得分
print(f"Random forest score: {score(val_solution, rf_submission, tolerances, **column_names)}")

  0%|          | 0/81 [00:00<?, ?it/s]

100%|██████████| 81/81 [04:06<00:00,  3.04s/it]


预测的onset事件数量: 1899
预测的wakeup事件数量: 1899
Random forest score: 0.46074562895165017


In [None]:
def analyze_predictions(rf_submission, val_data, val_ids, train_events, output_num_file='rf_num.csv', output_data_file='rf_data.csv'):
    """
    统计预测结果并保存到CSV文件
    
    参数:
    rf_submission: 预测结果数据框
    val_data: 验证时间序列数据
    val_ids: 验证集的series_id列表
    train_events: 原始训练事件数据（用于获取night和timestamp信息）
    output_num_file: 系列统计结果保存的文件名，默认'rf_num.csv'
    output_data_file: 事件详细信息保存的文件名，默认'rf_data.csv'
    
    返回:
    None，结果保存在CSV文件中
    """
    import pandas as pd
    import polars as pl
    from tqdm import tqdm
    
    print("开始分析预测结果...")
    
    # 1. 统计每个series_id的详细信息
    results = []
    
    # 预处理：将rf_submission按series_id分组，提取onset和wakeup
    # series_events = {}
    for series_id in tqdm(val_ids, desc="处理每个series_id的预测事件"):
        # 获取当前series_id的事件
        events = rf_submission[rf_submission['series_id'] == series_id]
        
        # 提取onset和wakeup的step值并排序
        onset_steps = sorted(events[events['event'] == 'onset']['step'].tolist())
        wakeup_steps = sorted(events[events['event'] == 'wakeup']['step'].tolist())
        
        # 确保onset和wakeup成对
        sleep_periods = []
        min_len = min(len(onset_steps), len(wakeup_steps))
        for i in range(min_len):
            if onset_steps[i] < wakeup_steps[i]:
                sleep_periods.append((onset_steps[i], wakeup_steps[i]))
        
        series_events[series_id] = {
            'onset_count': len(onset_steps),
            'wakeup_count': len(wakeup_steps),
            'sleep_periods': sleep_periods
        }
    
    # 处理每个series_id
    print("\n统计每个series_id的清醒/睡眠分布...")
    for series_id in tqdm(val_ids, desc="统计每个series_id"):
        # 获取当前series_id的val_data数据点
        series_data = val_data.filter(pl.col('series_id') == series_id)
        total_points = len(series_data)
        
        if total_points > 0:
            # 获取预计算的事件信息
            event_info = series_events.get(series_id, {
                'onset_count': 0,
                'wakeup_count': 0,
                'sleep_periods': []
            })
            
            onset_count = event_info['onset_count']
            wakeup_count = event_info['wakeup_count']
            sleep_periods = event_info['sleep_periods']
            
            # 优化：批量处理数据点
            if sleep_periods:
                # 转换为pandas以提高处理速度
                series_data_pd = series_data.to_pandas()
                
                # 定义判断函数
                def is_asleep(step):
                    for onset, wakeup in sleep_periods:
                        if onset <= step <= wakeup:
                            return True
                    return False
                
                # 批量应用
                asleep_mask = series_data_pd['step'].apply(is_asleep)
                asleep_count = asleep_mask.sum()
                awake_count = total_points - asleep_count
            else:
                # 无睡眠周期，全部为清醒
                asleep_count = 0
                awake_count = total_points
            
            # 计算比例
            awake_ratio = (awake_count / total_points) * 100 if total_points > 0 else 0
            asleep_ratio = (asleep_count / total_points) * 100 if total_points > 0 else 0
            
            # 添加到结果列表
            results.append({
                'series_id': series_id,
                'total_points': total_points,
                'awake_count': awake_count,
                'asleep_count': asleep_count,
                'awake_ratio': awake_ratio,
                'asleep_ratio': asleep_ratio,
                'onset_count': onset_count,
                'wakeup_count': wakeup_count,
                'sleep_periods_count': len(sleep_periods)
            })
    
    # 转换为DataFrame并保存
    results_df = pd.DataFrame(results)
    results_df.to_csv(output_num_file, index=False)
    
    print(f"\n系列统计完成，结果已保存到 {output_num_file}")
    
    # 2. 统计每个onset和wakeup事件的详细信息
    print("\n开始统计事件详细信息...")
    
    # 从train_events中获取每个step对应的night和timestamp
    print("正在添加night和timestamp信息...")
    
    # 构建step到night和timestamp的映射字典
    step_info_map = {}
    train_events_pd = train_events.to_pandas()
    for _, row in tqdm(train_events_pd.iterrows(), desc="构建step信息映射", total=len(train_events_pd)):
        key = (row['series_id'], row['step'])
        step_info_map[key] = {'night': row['night'], 'timestamp': row['timestamp']}
    
    # 为rf_submission添加night和timestamp列
    def get_step_info(row):
        key = (row['series_id'], row['step'])
        info = step_info_map.get(key, {'night': None, 'timestamp': None})
        return info['night'], info['timestamp']
    
    # 批量应用
    rf_events_with_info = rf_submission.copy()
    rf_events_with_info[['night', 'timestamp']] = rf_events_with_info.apply(get_step_info, axis=1, result_type='expand')
    
    # 选择需要的列
    rf_events_final = rf_events_with_info[['series_id', 'night', 'event', 'step', 'timestamp']]
    
    # 保存到CSV
    rf_events_final.to_csv(output_data_file, index=False)
    
    print(f"事件详细信息统计完成，结果已保存到 {output_data_file}")
    
    # 3. 显示总体统计
    print("预测结果总体统计：")
    
    if not results_df.empty:
        total_awake = results_df['awake_count'].sum()
        total_asleep = results_df['asleep_count'].sum()
        total_points = results_df['total_points'].sum()
        
        total_awake_ratio = (total_awake / total_points) * 100 if total_points > 0 else 0
        total_asleep_ratio = (total_asleep / total_points) * 100 if total_points > 0 else 0
        
        total_onset = results_df['onset_count'].sum()
        total_wakeup = results_df['wakeup_count'].sum()
        total_sleep_periods = results_df['sleep_periods_count'].sum()
        
        print(f"验证集数据点总数: {total_points}")
        print(f"预测清醒: {total_awake} ({total_awake_ratio:.2f}%)")
        print(f"预测睡眠: {total_asleep} ({total_asleep_ratio:.2f}%)")
        print(f"预测onset事件总数: {total_onset}")
        print(f"预测wakeup事件总数: {total_wakeup}")
        print(f"预测睡眠周期总数: {total_sleep_periods}")
        
        # 事件详细信息统计
        event_count = len(rf_events_final)
        onset_event_count = len(rf_events_final[rf_events_final['event'] == 'onset'])
        wakeup_event_count = len(rf_events_final[rf_events_final['event'] == 'wakeup'])
        
        print(f"\n预测事件详细信息统计：")
        print(f"预测事件总数: {event_count}")
        print(f"预测onset事件数: {onset_event_count}")
        print(f"预测wakeup事件数: {wakeup_event_count}")
    else:
        print("没有数据可供统计")

In [15]:
# 使用默认文件名
analyze_predictions(rf_submission, val_data, val_ids, train_events)

# 自定义文件名
analyze_predictions(rf_submission, val_data, val_ids, train_events, 
                   output_num_file='custom_rf_num.csv', 
                   output_data_file='custom_rf_data.csv')

开始分析预测结果...
预处理预测事件数据...


处理每个series_id的预测事件: 100%|██████████| 81/81 [00:00<00:00, 2756.14it/s]



统计每个series_id的清醒/睡眠分布...


统计每个series_id: 100%|██████████| 81/81 [00:14<00:00,  5.50it/s]



系列统计完成，结果已保存到 rf_num.csv

开始统计事件详细信息...
正在添加night和timestamp信息...


构建step信息映射: 100%|██████████| 9580/9580 [00:00<00:00, 95830.94it/s]


事件详细信息统计完成，结果已保存到 rf_data.csv
预测结果总体统计：
验证集数据点总数: 37902600
预测清醒: 28446828 (75.05%)
预测睡眠: 9455772 (24.95%)
预测onset事件总数: 1899
预测wakeup事件总数: 1899
预测睡眠周期总数: 1899

预测事件详细信息统计：
预测事件总数: 3798
预测onset事件数: 1899
预测wakeup事件数: 1899
开始分析预测结果...
预处理预测事件数据...


处理每个series_id的预测事件: 100%|██████████| 81/81 [00:00<00:00, 3617.40it/s]



统计每个series_id的清醒/睡眠分布...


统计每个series_id: 100%|██████████| 81/81 [00:14<00:00,  5.50it/s]



系列统计完成，结果已保存到 custom_rf_num.csv

开始统计事件详细信息...
正在添加night和timestamp信息...


构建step信息映射: 100%|██████████| 9580/9580 [00:00<00:00, 96948.41it/s]

事件详细信息统计完成，结果已保存到 custom_rf_data.csv
预测结果总体统计：
验证集数据点总数: 37902600
预测清醒: 28446828 (75.05%)
预测睡眠: 9455772 (24.95%)
预测onset事件总数: 1899
预测wakeup事件总数: 1899
预测睡眠周期总数: 1899

预测事件详细信息统计：
预测事件总数: 3798
预测onset事件数: 1899
预测wakeup事件数: 1899





In [16]:
# 保存分类器到文件
import pickle
with open('rf_classifier_5m_8h.pkl', 'wb') as f:
    pickle.dump(rf_classifier, f)

# 从文件加载分类器（验证保存是否成功）
with open('rf_classifier_5m_8h.pkl', 'rb') as f:
    rf_classifier = pickle.load(f)

In [17]:
# 释放内存（删除大型变量）
del train_data 

In [18]:
# 为测试集生成事件预测并保存提交文件
submission = get_events(test_series.collect(), rf_classifier)  # 处理测试数据并生成预测
submission.to_csv('submission.csv', index=False)  # 保存为CSV文件

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  9.74it/s]
