In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# 1. Load the data.
# The CSV location can be overridden with the BTC_USDT_1H_CSV environment
# variable so the notebook is not tied to one machine's home directory; when
# the variable is unset, the original absolute path is used unchanged.
file_path = os.environ.get(
    "BTC_USDT_1H_CSV",
    "/Users/rbw/Desktop/binanceus/BTC_USDT_1h.csv",
)
df = pd.read_csv(file_path)

In [3]:
# Build a datetime index from the `timestamp` column, which is interpreted as
# milliseconds since the epoch (assumption carried over from the original
# comment -- confirm against the CSV).
df = df.assign(datetime=pd.to_datetime(df['timestamp'], unit='ms')).set_index('datetime')

# Report the per-column count of missing values before any filling.
print("缺失值统计:", df.isna().sum(), sep="\n")

# Forward-fill any gaps: each missing cell takes the last preceding value in
# its column.
df = df.ffill()

# Report the counts again to confirm the result of the fill.
print("\n填充后缺失值统计:", df.isna().sum(), sep="\n")

缺失值统计:
timestamp    0
open         0
high         0
low          0
close        0
volume       0
dtype: int64

填充后缺失值统计:
timestamp    0
open         0
high         0
low          0
close        0
volume       0
dtype: int64


In [4]:
# Per-row price range: high minus low.
df['price_range'] = df['high'] - df['low']

# Close-to-close change relative to the previous row, expressed in percent.
# The first row has no predecessor, so its value is NaN.
df['price_change_pct'] = df['close'].pct_change() * 100

# Simple moving average of `close` over a 4-row window (4 hours if the rows
# are contiguous hourly bars -- TODO confirm there are no gaps in the index).
# The first 3 rows do not have a full window and are NaN.
df['sma_4'] = df['close'].rolling(window=4).mean()

# Row-over-row volume change as a fraction (note: NOT multiplied by 100,
# unlike price_change_pct).
# pct_change divides by the previous volume, so a zero-volume row makes the
# next value +/-inf. dropna() does not treat inf as missing, so those values
# would slip through the later NaN clean-up; convert them to NaN here so they
# are removed together with the warm-up rows.
df['volume_change'] = (
    df['volume'].pct_change().replace([np.inf, -np.inf], np.nan)
)

# Calendar features taken from the datetime index.
df['hour'] = df.index.hour
df['day_of_week'] = df.index.dayofweek  # Monday=0, Sunday=6

In [5]:
df.dropna(inplace=True)

In [6]:
df_clean = df.dropna().copy()  # 存入新对象df_clean

In [7]:
print("\n（datetime作为索引）：")
print(df_clean[['open', 'close', 'price_range', 'price_change_pct', 'sma_4', 'volume_change', 'hour']].head())
print("对应的时间索引：")
print(df_clean.index[:5])

# 检查形状变化
print(f"\n原始数据行数: {len(df)} → 处理后行数: {len(df_clean)} (删除了{len(df) - len(df_clean)}行NaN)")


（datetime作为索引）：
                         open     close  price_range  price_change_pct  \
datetime                                                                 
2024-01-01 03:00:00  42589.35  42320.95       328.93         -0.558453   
2024-01-01 04:00:00  42338.81  42354.20       165.41          0.078566   
2024-01-01 05:00:00  42354.27  42244.32       180.33         -0.259431   
2024-01-01 06:00:00  42254.90  42412.37       232.58          0.397805   
2024-01-01 07:00:00  42383.39  42490.00       116.61          0.183036   

                          sma_4  volume_change  hour  
datetime                                              
2024-01-01 03:00:00  42484.6525       2.057307     3  
2024-01-01 04:00:00  42450.9700      -0.647988     4  
2024-01-01 05:00:00  42369.5225      -0.059128     5  
2024-01-01 06:00:00  42332.9600       0.681908     6  
2024-01-01 07:00:00  42375.2225      -0.095536     7  
对应的时间索引：
DatetimeIndex(['2024-01-01 03:00:00', '2024-01-01 04:00:00',
         