<a href="https://colab.research.google.com/github/wannasmile/colab_code_note/blob/main/QUANT016.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 安装必要的依赖项
!pip install yfinance mplfinance  # 使用pip安装yfinance和mplfinance库

import sys  # 导入sys库，用于访问系统特定的参数和函数
import os   # 导入os库，用于与操作系统进行交互

import yfinance as yf  # 导入yfinance库，用于下载股票数据
import pandas as pd  # 导入pandas库，用于数据处理

def download_data(ticker, start_date, end_date):
    """
    下载股票数据并处理多重索引

    参数：
    ticker (str): 股票代码
    start_date (str): 开始日期
    end_date (str): 结束日期

    返回：
    pandas.DataFrame: 股票数据
    """
    df = yf.download(ticker, start=start_date, end=end_date)  # 使用yfinance下载股票数据
    df.columns = df.columns.droplevel(1)  # 删除第二级索引(ticker名称)
    return df  # 返回处理后的数据

# 设置参数
ticker = 'AAPL'  # 设置股票代码为AAPL（苹果公司）
start_date = '2020-01-01'  # 设置开始日期为2020年1月1日
end_date = '2023-10-26'  # 设置结束日期为2023年10月26日

# 下载数据
df = download_data(ticker, start_date, end_date)  # 调用函数下载股票数据
print(df.columns)  # 打印数据帧的列名
print(df.head(100))  # 打印数据帧的前100行
df.to_csv('stock_1d.csv')  # 将数据帧保存为名为'stock_1d.csv'的CSV文件

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')
Price           Close       High        Low       Open     Volume
Date                                                             
2020-01-02  72.716049  72.776575  71.466790  71.720996  135480400
2020-01-03  72.009132  72.771760  71.783977  71.941343  146322800
2020-01-06  72.582893  72.621631  70.876060  71.127851  118387200
2020-01-07  72.241547  72.849224  72.021231  72.592594  108872000
2020-01-08  73.403633  73.706264  71.943744  71.943744  132079200
...               ...        ...        ...        ...        ...
2020-05-19  76.198219  77.507360  76.166584  76.658120  101729600
2020-05-20  77.680153  77.750715  77.020707  77.059642  111504800
2020-05-21  77.100998  78.084077  76.862526  77.541435  102688800
2020-05-22  77.597404  77.680138  76.735993  76.838190   81803200
2020-05-26  77.071785  78.899233  77.015815  78.719166  125522000

[100 rows x 5 columns]





In [2]:
import pandas as pd
import numpy as np
from typing import List, Dict, Union, Tuple
from scipy import stats

class AlphaFeatures:
    """
    股票技术指标特征工程类
    """
    def __init__(self, df: pd.DataFrame):
        """
        初始化

        参数:
        df: pd.DataFrame - 包含OHLCV数据的DataFrame
        """
        self.df = df.copy()
        # 确保列名标准化
        self.df.columns = [col.lower() for col in self.df.columns]
        # 添加vwap列
        self.df['vwap'] = (self.df['high'] + self.df['low'] + self.df['close']) / 3

    def get_alpha360_features(self, windows: int = 60) -> pd.DataFrame:
        """
        计算Alpha360特征
        """
        features = {}
        price_cols = ['close', 'open', 'high', 'low', 'vwap']

        for col in price_cols:
            # 当前值
            features[f'{col.upper()}0'] = self.df[col] / self.df['close']

            # 历史值
            for i in range(1, windows):
                features[f'{col.upper()}{i}'] = self.df[col].shift(i) / self.df['close']

        # 成交量特征
        features['VOLUME0'] = self.df['volume'] / (self.df['volume'] + 1e-12)
        for i in range(1, windows):
            features[f'VOLUME{i}'] = self.df['volume'].shift(i) / (self.df['volume'] + 1e-12)

        return pd.DataFrame(features, index=self.df.index)

    def get_kbar_features(self) -> pd.DataFrame:
        """
        计算K线特征
        """
        features = {}
        df = self.df

        # 基础K线特征
        features['KMID'] = (df['close'] - df['open']) / df['open']
        features['KLEN'] = (df['high'] - df['low']) / df['open']
        features['KMID2'] = (df['close'] - df['open']) / (df['high'] - df['low'] + 1e-12)

        # 上影线和下影线特征
        features['KUP'] = (df['high'] - df[['open', 'close']].max(axis=1)) / df['open']
        features['KUP2'] = (df['high'] - df[['open', 'close']].max(axis=1)) / (df['high'] - df['low'] + 1e-12)
        features['KLOW'] = (df[['open', 'close']].min(axis=1) - df['low']) / df['open']
        features['KLOW2'] = (df[['open', 'close']].min(axis=1) - df['low']) / (df['high'] - df['low'] + 1e-12)

        # 位移特征
        features['KSFT'] = (2 * df['close'] - df['high'] - df['low']) / df['open']
        features['KSFT2'] = (2 * df['close'] - df['high'] - df['low']) / (df['high'] - df['low'] + 1e-12)

        return pd.DataFrame(features, index=self.df.index)

    def get_price_volume_features(self, windows: List[int] = [0,1,2,3,4]) -> pd.DataFrame:
        """
        计算价格和成交量特征
        """
        features = {}
        fields = ['open', 'high', 'low', 'close', 'vwap']

        for field in fields:
            for d in windows:
                if d == 0:
                    features[f'{field.upper()}{d}'] = self.df[field] / self.df['close']
                else:
                    features[f'{field.upper()}{d}'] = self.df[field].shift(d) / self.df['close']

        for d in windows:
            if d == 0:
                features[f'VOLUME{d}'] = self.df['volume'] / (self.df['volume'] + 1e-12)
            else:
                features[f'VOLUME{d}'] = self.df['volume'].shift(d) / (self.df['volume'] + 1e-12)

        return pd.DataFrame(features, index=self.df.index)

    def get_rolling_features(self, windows: List[int] = [5, 10, 20, 30, 60]) -> pd.DataFrame:
        """
        计算滚动窗口特征
        """
        features = {}
        df = self.df

        for d in windows:
            # ROC - 价格变化率
            features[f'ROC{d}'] = df['close'].shift(d) / df['close']

            # MA - 移动平均
            features[f'MA{d}'] = df['close'].rolling(d).mean() / df['close']

            # STD - 标准差
            features[f'STD{d}'] = df['close'].rolling(d).std() / df['close']

            # BETA - 斜率
            features[f'BETA{d}'] = self._calculate_slope(df['close'], d) / df['close']

            # RSQR - R方值
            features[f'RSQR{d}'] = self._calculate_rsquare(df['close'], d)

            # RESI - 残差
            features[f'RESI{d}'] = self._calculate_residuals(df['close'], d) / df['close']

            # MAX/MIN/Quantile
            features[f'MAX{d}'] = df['high'].rolling(d).max() / df['close']
            features[f'MIN{d}'] = df['low'].rolling(d).min() / df['close']
            features[f'QTLU{d}'] = df['close'].rolling(d).quantile(0.8) / df['close']
            features[f'QTLD{d}'] = df['close'].rolling(d).quantile(0.2) / df['close']

            # RANK - 排名
            features[f'RANK{d}'] = df['close'].rolling(d).apply(
                lambda x: stats.percentileofscore(x, x[-1])
            ) / 100

            # RSV - 价格位置
            features[f'RSV{d}'] = (df['close'] - df['low'].rolling(d).min()) / \
                                (df['high'].rolling(d).max() - df['low'].rolling(d).min() + 1e-12)

            # IMAX/IMIN/IMXD - Aroon指标
            features[f'IMAX{d}'] = df['high'].rolling(d).apply(
                lambda x: len(x) - 1 - x.argmax()
            ) / d
            features[f'IMIN{d}'] = df['low'].rolling(d).apply(
                lambda x: len(x) - 1 - x.argmin()
            ) / d
            features[f'IMXD{d}'] = features[f'IMAX{d}'] - features[f'IMIN{d}']

            # 相关性特征
            features[f'CORR{d}'] = df['close'].rolling(d).corr(np.log(df['volume'] + 1))
            features[f'CORD{d}'] = (df['close'] / df['close'].shift(1)).rolling(d).corr(
                np.log(df['volume'] / df['volume'].shift(1) + 1)
            )

            # 计数特征
            price_up = (df['close'] > df['close'].shift(1)).astype(int)
            features[f'CNTP{d}'] = price_up.rolling(d).mean()
            features[f'CNTN{d}'] = (1 - price_up).rolling(d).mean()
            features[f'CNTD{d}'] = features[f'CNTP{d}'] - features[f'CNTN{d}']

            # RSI类特征
            price_diff = df['close'] - df['close'].shift(1)
            gain = np.where(price_diff > 0, price_diff, 0)
            loss = np.where(price_diff < 0, -price_diff, 0)
            abs_price_diff = abs(price_diff)

            features[f'SUMP{d}'] = pd.Series(gain).rolling(d).sum() / \
                                 (pd.Series(abs_price_diff).rolling(d).sum() + 1e-12)
            features[f'SUMN{d}'] = pd.Series(loss).rolling(d).sum() / \
                                 (pd.Series(abs_price_diff).rolling(d).sum() + 1e-12)
            features[f'SUMD{d}'] = features[f'SUMP{d}'] - features[f'SUMN{d}']

            # 成交量特征
            features[f'VMA{d}'] = df['volume'].rolling(d).mean() / (df['volume'] + 1e-12)
            features[f'VSTD{d}'] = df['volume'].rolling(d).std() / (df['volume'] + 1e-12)

            # 成交量加权波动率
            price_change = abs(df['close'] / df['close'].shift(1) - 1) * df['volume']
            features[f'WVMA{d}'] = price_change.rolling(d).std() / \
                                 (price_change.rolling(d).mean() + 1e-12)

            # 成交量RSI类特征
            volume_diff = df['volume'] - df['volume'].shift(1)
            volume_gain = np.where(volume_diff > 0, volume_diff, 0)
            volume_loss = np.where(volume_diff < 0, -volume_diff, 0)
            abs_volume_diff = abs(volume_diff)

            features[f'VSUMP{d}'] = pd.Series(volume_gain).rolling(d).sum() / \
                                  (pd.Series(abs_volume_diff).rolling(d).sum() + 1e-12)
            features[f'VSUMN{d}'] = pd.Series(volume_loss).rolling(d).sum() / \
                                  (pd.Series(abs_volume_diff).rolling(d).sum() + 1e-12)
            features[f'VSUMD{d}'] = features[f'VSUMP{d}'] - features[f'VSUMN{d}']

        return pd.DataFrame(features, index=self.df.index)

    def _calculate_slope(self, series: pd.Series, window: int) -> pd.Series:
        """计算滚动窗口的斜率"""
        def _slope(x):
            x = np.array(x)
            y = np.arange(len(x))
            A = np.vstack([y, np.ones(len(x))]).T
            return np.linalg.lstsq(A, x, rcond=None)[0][0]
        return series.rolling(window).apply(_slope)

    def _calculate_rsquare(self, series: pd.Series, window: int) -> pd.Series:
        """计算滚动窗口的R方值"""
        def _rsquare(x):
            x = np.array(x)
            y = np.arange(len(x))
            A = np.vstack([y, np.ones(len(x))]).T
            b = np.linalg.lstsq(A, x, rcond=None)[0]
            y_pred = A @ b
            ss_tot = np.sum((x - x.mean()) ** 2)
            ss_res = np.sum((x - y_pred) ** 2)
            return 1 - ss_res / ss_tot
        return series.rolling(window).apply(_rsquare)

    def _calculate_residuals(self, series: pd.Series, window: int) -> pd.Series:
        """计算滚动窗口的残差"""
        def _residuals(x):
            x = np.array(x)
            y = np.arange(len(x))
            A = np.vstack([y, np.ones(len(x))]).T
            b = np.linalg.lstsq(A, x, rcond=None)[0]
            y_pred = A @ b
            return x[-1] - y_pred[-1]
        return series.rolling(window).apply(_residuals)

    def generate_features(self,
                         alpha360: bool = True,
                         kbar: bool = True,
                         price_volume: bool = True,
                         rolling: bool = True,
                         alpha360_windows: int = 60,
                         price_volume_windows: List[int] = [0,1,2,3,4],
                         rolling_windows: List[int] = [5,10,20,30,60]) -> pd.DataFrame:
        """
        生成所有特征

        参数:
        alpha360: bool - 是否生成Alpha360特征
        kbar: bool - 是否生成K线特征
        price_volume: bool - 是否生成价格成交量特征
        rolling: bool - 是否生成滚动窗口特征
        alpha360_windows: int - Alpha360特征的窗口大小
        price_volume_windows: List[int] - 价格成交量特征的窗口大小列表
        rolling_windows: List[int] - 滚动窗口特征的窗口大小列表

        返回:
        pd.DataFrame - 特征数据
        """
        feature_dfs = []

        if alpha360:
            feature_dfs.append(self.get_alpha360_features(alpha360_windows))
        if kbar:
            feature_dfs.append(self.get_kbar_features())
        if price_volume:
            feature_dfs.append(self.get_price_volume_features(price_volume_windows))
        if rolling:
            feature_dfs.append(self.get_rolling_features(rolling_windows))

        # 一次性合并所有特征
        if feature_dfs:
            result = pd.concat(feature_dfs, axis=1)
            return result.dropna()
        return pd.DataFrame()

# 使用示例
if __name__ == "__main__":
    # 读取数据
    df = pd.read_csv('stock_1d.csv', index_col='Date', parse_dates=True)

    # 初始化特征工程类
    af = AlphaFeatures(df)

    # 生成所有特征
    features_df = af.generate_features()

    print("Feature shape:", features_df.shape)
    print("\nFeature columns:", features_df.columns.tolist())

  lambda x: stats.percentileofscore(x, x[-1])
  lambda x: stats.percentileofscore(x, x[-1])
  lambda x: stats.percentileofscore(x, x[-1])
  lambda x: stats.percentileofscore(x, x[-1])
  lambda x: stats.percentileofscore(x, x[-1])


Feature shape: (0, 544)

Feature columns: ['CLOSE0', 'CLOSE1', 'CLOSE2', 'CLOSE3', 'CLOSE4', 'CLOSE5', 'CLOSE6', 'CLOSE7', 'CLOSE8', 'CLOSE9', 'CLOSE10', 'CLOSE11', 'CLOSE12', 'CLOSE13', 'CLOSE14', 'CLOSE15', 'CLOSE16', 'CLOSE17', 'CLOSE18', 'CLOSE19', 'CLOSE20', 'CLOSE21', 'CLOSE22', 'CLOSE23', 'CLOSE24', 'CLOSE25', 'CLOSE26', 'CLOSE27', 'CLOSE28', 'CLOSE29', 'CLOSE30', 'CLOSE31', 'CLOSE32', 'CLOSE33', 'CLOSE34', 'CLOSE35', 'CLOSE36', 'CLOSE37', 'CLOSE38', 'CLOSE39', 'CLOSE40', 'CLOSE41', 'CLOSE42', 'CLOSE43', 'CLOSE44', 'CLOSE45', 'CLOSE46', 'CLOSE47', 'CLOSE48', 'CLOSE49', 'CLOSE50', 'CLOSE51', 'CLOSE52', 'CLOSE53', 'CLOSE54', 'CLOSE55', 'CLOSE56', 'CLOSE57', 'CLOSE58', 'CLOSE59', 'OPEN0', 'OPEN1', 'OPEN2', 'OPEN3', 'OPEN4', 'OPEN5', 'OPEN6', 'OPEN7', 'OPEN8', 'OPEN9', 'OPEN10', 'OPEN11', 'OPEN12', 'OPEN13', 'OPEN14', 'OPEN15', 'OPEN16', 'OPEN17', 'OPEN18', 'OPEN19', 'OPEN20', 'OPEN21', 'OPEN22', 'OPEN23', 'OPEN24', 'OPEN25', 'OPEN26', 'OPEN27', 'OPEN28', 'OPEN29', 'OPEN30', 'OPEN31

1. 完整覆盖了原代码中的所有特征：
* Alpha360的历史价格特征
* K线特征(KMID等9个)
* 价格和成交量特征
* 各类滚动窗口特征(超过30种)
2. 主要特征类别：
* 趋势类：MA, ROC, BETA等
* 波动类：STD, RESI等
* 位置类：RSV, RANK等
* 动量类：IMAX, IMIN等
* 相关性：CORR, CORD等
* 计数类：CNTP, CNTN等
* RSI类：SUMP, SUMN等
* 成交量类：VMA, VSTD等
3. 功能特点：
* 完全基于pandas实现
* 支持灵活的特征组合
* 支持自定义窗口大小
* 提供详细的特征计算说明
* 代码结构清晰，易于扩展
4. 使用方法灵活：
* 可以生成全部特征
* 可以选择特定类别的特征
* 可以自定义各类特征的参数

In [3]:
import pandas as pd
import numpy as np
from typing import List, Dict, Union, Tuple
from scipy import stats

class AlphaFeatures:
    """
    股票技术指标特征工程类
    """
    def __init__(self, df: pd.DataFrame):
        """
        初始化

        参数:
        df: pd.DataFrame - 包含OHLCV数据的DataFrame
        """
        self.df = df.copy()
        # 确保列名标准化
        self.df.columns = [col.lower() for col in self.df.columns]
        # 添加vwap列
        self.df['vwap'] = (self.df['high'] + self.df['low'] + self.df['close']) / 3

    def get_alpha360_features(self, windows: int = 60) -> pd.DataFrame:
        """
        计算Alpha360特征
        """
        features = {}
        price_cols = ['close', 'open', 'high', 'low', 'vwap']

        for col in price_cols:
            # 当前值
            features[f'{col.upper()}0'] = self.df[col] / self.df['close']

            # 历史值
            for i in range(1, windows):
                features[f'{col.upper()}{i}'] = self.df[col].shift(i) / self.df['close']

        # 成交量特征
        features['VOLUME0'] = self.df['volume'] / (self.df['volume'] + 1e-12)
        for i in range(1, windows):
            features[f'VOLUME{i}'] = self.df['volume'].shift(i) / (self.df['volume'] + 1e-12)

        return pd.DataFrame(features, index=self.df.index)

    def get_kbar_features(self) -> pd.DataFrame:
        """
        计算K线特征
        """
        features = {}
        df = self.df

        # 基础K线特征
        features['KMID'] = (df['close'] - df['open']) / df['open']
        features['KLEN'] = (df['high'] - df['low']) / df['open']
        features['KMID2'] = (df['close'] - df['open']) / (df['high'] - df['low'] + 1e-12)

        # 上影线和下影线特征
        features['KUP'] = (df['high'] - df[['open', 'close']].max(axis=1)) / df['open']
        features['KUP2'] = (df['high'] - df[['open', 'close']].max(axis=1)) / (df['high'] - df['low'] + 1e-12)
        features['KLOW'] = (df[['open', 'close']].min(axis=1) - df['low']) / df['open']
        features['KLOW2'] = (df[['open', 'close']].min(axis=1) - df['low']) / (df['high'] - df['low'] + 1e-12)

        # 位移特征
        features['KSFT'] = (2 * df['close'] - df['high'] - df['low']) / df['open']
        features['KSFT2'] = (2 * df['close'] - df['high'] - df['low']) / (df['high'] - df['low'] + 1e-12)

        return pd.DataFrame(features, index=self.df.index)

    def get_price_volume_features(self, windows: List[int] = [0,1,2,3,4]) -> pd.DataFrame:
        """
        计算价格和成交量特征
        """
        features = {}
        fields = ['open', 'high', 'low', 'close', 'vwap']

        for field in fields:
            for d in windows:
                if d == 0:
                    features[f'{field.upper()}{d}'] = self.df[field] / self.df['close']
                else:
                    features[f'{field.upper()}{d}'] = self.df[field].shift(d) / self.df['close']

        for d in windows:
            if d == 0:
                features[f'VOLUME{d}'] = self.df['volume'] / (self.df['volume'] + 1e-12)
            else:
                features[f'VOLUME{d}'] = self.df['volume'].shift(d) / (self.df['volume'] + 1e-12)

        return pd.DataFrame(features, index=self.df.index)

    def get_rolling_features(self, windows: List[int] = [5, 10, 20, 30, 60]) -> pd.DataFrame:
        """
        计算滚动窗口特征
        """
        features = {}
        df = self.df

        for d in windows:
            # ROC - 价格变化率
            features[f'ROC{d}'] = df['close'].shift(d) / df['close']

            # MA - 移动平均
            features[f'MA{d}'] = df['close'].rolling(d).mean() / df['close']

            # STD - 标准差
            features[f'STD{d}'] = df['close'].rolling(d).std() / df['close']

            # BETA - 斜率
            features[f'BETA{d}'] = self._calculate_slope(df['close'], d) / df['close']

            # RSQR - R方值
            features[f'RSQR{d}'] = self._calculate_rsquare(df['close'], d)

            # RESI - 残差
            features[f'RESI{d}'] = self._calculate_residuals(df['close'], d) / df['close']

            # MAX/MIN/Quantile
            features[f'MAX{d}'] = df['high'].rolling(d).max() / df['close']
            features[f'MIN{d}'] = df['low'].rolling(d).min() / df['close']
            features[f'QTLU{d}'] = df['close'].rolling(d).quantile(0.8) / df['close']
            features[f'QTLD{d}'] = df['close'].rolling(d).quantile(0.2) / df['close']

            # RANK - 排名
            features[f'RANK{d}'] = df['close'].rolling(d).apply(
                lambda x: stats.percentileofscore(x.values, x.iloc[-1])
            ) / 100

            # RSV - 价格位置
            features[f'RSV{d}'] = (df['close'] - df['low'].rolling(d).min()) / \
                                (df['high'].rolling(d).max() - df['low'].rolling(d).min() + 1e-12)

            # IMAX/IMIN/IMXD - Aroon指标
            features[f'IMAX{d}'] = df['high'].rolling(d).apply(
                lambda x: len(x) - 1 - np.argmax(x.values)
            ) / d
            features[f'IMIN{d}'] = df['low'].rolling(d).apply(
                lambda x: len(x) - 1 - np.argmin(x.values)
            ) / d
            features[f'IMXD{d}'] = features[f'IMAX{d}'] - features[f'IMIN{d}']

            # 相关性特征
            features[f'CORR{d}'] = df['close'].rolling(d).corr(np.log(df['volume'] + 1))
            features[f'CORD{d}'] = (df['close'] / df['close'].shift(1)).rolling(d).corr(
                np.log(df['volume'] / df['volume'].shift(1) + 1)
            )

            # 计数特征
            price_up = (df['close'] > df['close'].shift(1)).astype(int)
            features[f'CNTP{d}'] = price_up.rolling(d).mean()
            features[f'CNTN{d}'] = (1 - price_up).rolling(d).mean()
            features[f'CNTD{d}'] = features[f'CNTP{d}'] - features[f'CNTN{d}']

            # RSI类特征
            price_diff = df['close'] - df['close'].shift(1)
            gain = np.where(price_diff > 0, price_diff, 0)
            loss = np.where(price_diff < 0, -price_diff, 0)
            abs_price_diff = abs(price_diff)

            features[f'SUMP{d}'] = pd.Series(gain).rolling(d).sum() / \
                                 (pd.Series(abs_price_diff).rolling(d).sum() + 1e-12)
            features[f'SUMN{d}'] = pd.Series(loss).rolling(d).sum() / \
                                 (pd.Series(abs_price_diff).rolling(d).sum() + 1e-12)
            features[f'SUMD{d}'] = features[f'SUMP{d}'] - features[f'SUMN{d}']

            # 成交量特征
            features[f'VMA{d}'] = df['volume'].rolling(d).mean() / (df['volume'] + 1e-12)
            features[f'VSTD{d}'] = df['volume'].rolling(d).std() / (df['volume'] + 1e-12)

            # 成交量加权波动率
            price_change = abs(df['close'] / df['close'].shift(1) - 1) * df['volume']
            features[f'WVMA{d}'] = price_change.rolling(d).std() / \
                                 (price_change.rolling(d).mean() + 1e-12)

            # 成交量RSI类特征
            volume_diff = df['volume'] - df['volume'].shift(1)
            volume_gain = np.where(volume_diff > 0, volume_diff, 0)
            volume_loss = np.where(volume_diff < 0, -volume_diff, 0)
            abs_volume_diff = abs(volume_diff)

            features[f'VSUMP{d}'] = pd.Series(volume_gain).rolling(d).sum() / \
                                  (pd.Series(abs_volume_diff).rolling(d).sum() + 1e-12)
            features[f'VSUMN{d}'] = pd.Series(volume_loss).rolling(d).sum() / \
                                  (pd.Series(abs_volume_diff).rolling(d).sum() + 1e-12)
            features[f'VSUMD{d}'] = features[f'VSUMP{d}'] - features[f'VSUMN{d}']

        return pd.DataFrame(features, index=self.df.index)

    def _calculate_slope(self, series: pd.Series, window: int) -> pd.Series:
        """计算滚动窗口的斜率"""
        def _slope(x):
            x = x.values  # 转换为numpy数组
            y = np.arange(len(x))
            A = np.vstack([y, np.ones(len(x))]).T
            return np.linalg.lstsq(A, x, rcond=None)[0][0]
        return series.rolling(window).apply(_slope)

    def _calculate_rsquare(self, series: pd.Series, window: int) -> pd.Series:
        """计算滚动窗口的R方值"""
        def _rsquare(x):
            x = x.values  # 转换为numpy数组
            y = np.arange(len(x))
            A = np.vstack([y, np.ones(len(x))]).T
            b = np.linalg.lstsq(A, x, rcond=None)[0]
            y_pred = A @ b
            ss_tot = np.sum((x - x.mean()) ** 2)
            ss_res = np.sum((x - y_pred) ** 2)
            return 1 - ss_res / ss_tot
        return series.rolling(window).apply(_rsquare)

    def _calculate_residuals(self, series: pd.Series, window: int) -> pd.Series:
        """计算滚动窗口的残差"""
        def _residuals(x):
            x = x.values  # 转换为numpy数组
            y = np.arange(len(x))
            A = np.vstack([y, np.ones(len(x))]).T
            b = np.linalg.lstsq(A, x, rcond=None)[0]
            y_pred = A @ b
            return x[-1] - y_pred[-1]
        return series.rolling(window).apply(_residuals)

    def generate_features(self,
                         alpha360: bool = True,
                         kbar: bool = True,
                         price_volume: bool = True,
                         rolling: bool = True,
                         alpha360_windows: int = 60,
                         price_volume_windows: List[int] = [0,1,2,3,4],
                         rolling_windows: List[int] = [5,10,20,30,60]) -> pd.DataFrame:
        """
        生成所有特征

        参数:
        alpha360: bool - 是否生成Alpha360特征
        kbar: bool - 是否生成K线特征
        price_volume: bool - 是否生成价格成交量特征
        rolling: bool - 是否生成滚动窗口特征
        alpha360_windows: int - Alpha360特征的窗口大小
        price_volume_windows: List[int] - 价格成交量特征的窗口大小列表
        rolling_windows: List[int] - 滚动窗口特征的窗口大小列表

        返回:
        pd.DataFrame - 特征数据
        """
        # 检查数据长度是否足够
        min_required_length = max([alpha360_windows] + rolling_windows) if rolling else alpha360_windows
        if len(self.df) < min_required_length:
            raise ValueError(f"Data length ({len(self.df)}) is less than minimum required length ({min_required_length})")

        feature_dfs = []

        if alpha360:
            feature_dfs.append(self.get_alpha360_features(alpha360_windows))
        if kbar:
            feature_dfs.append(self.get_kbar_features())
        if price_volume:
            feature_dfs.append(self.get_price_volume_features(price_volume_windows))
        if rolling:
            feature_dfs.append(self.get_rolling_features(rolling_windows))

        # 一次性合并所有特征
        if feature_dfs:
            result = pd.concat(feature_dfs, axis=1)
            # 删除全为NA的列
            result = result.dropna(axis=1, how='all')
            # 删除包含NA的行
            result = result.dropna(axis=0, how='any')
            return result

        return pd.DataFrame()

# 使用示例
if __name__ == "__main__":
    # 读取数据
    df = pd.read_csv('stock_1d.csv', index_col='Date', parse_dates=True)

    # 初始化特征工程类
    af = AlphaFeatures(df)

    # 生成所有特征
    features_df = af.generate_features()

    print("Feature shape:", features_df.shape)
    print("\nFeature columns:", features_df.columns.tolist())

Feature shape: (901, 514)

Feature columns: ['CLOSE0', 'CLOSE1', 'CLOSE2', 'CLOSE3', 'CLOSE4', 'CLOSE5', 'CLOSE6', 'CLOSE7', 'CLOSE8', 'CLOSE9', 'CLOSE10', 'CLOSE11', 'CLOSE12', 'CLOSE13', 'CLOSE14', 'CLOSE15', 'CLOSE16', 'CLOSE17', 'CLOSE18', 'CLOSE19', 'CLOSE20', 'CLOSE21', 'CLOSE22', 'CLOSE23', 'CLOSE24', 'CLOSE25', 'CLOSE26', 'CLOSE27', 'CLOSE28', 'CLOSE29', 'CLOSE30', 'CLOSE31', 'CLOSE32', 'CLOSE33', 'CLOSE34', 'CLOSE35', 'CLOSE36', 'CLOSE37', 'CLOSE38', 'CLOSE39', 'CLOSE40', 'CLOSE41', 'CLOSE42', 'CLOSE43', 'CLOSE44', 'CLOSE45', 'CLOSE46', 'CLOSE47', 'CLOSE48', 'CLOSE49', 'CLOSE50', 'CLOSE51', 'CLOSE52', 'CLOSE53', 'CLOSE54', 'CLOSE55', 'CLOSE56', 'CLOSE57', 'CLOSE58', 'CLOSE59', 'OPEN0', 'OPEN1', 'OPEN2', 'OPEN3', 'OPEN4', 'OPEN5', 'OPEN6', 'OPEN7', 'OPEN8', 'OPEN9', 'OPEN10', 'OPEN11', 'OPEN12', 'OPEN13', 'OPEN14', 'OPEN15', 'OPEN16', 'OPEN17', 'OPEN18', 'OPEN19', 'OPEN20', 'OPEN21', 'OPEN22', 'OPEN23', 'OPEN24', 'OPEN25', 'OPEN26', 'OPEN27', 'OPEN28', 'OPEN29', 'OPEN30', 'OPEN

In [4]:
import pandas as pd
import numpy as np
from typing import Tuple

class DataCleaner:
    """股票数据清洗类"""

    def __init__(self, df: pd.DataFrame):
        """
        初始化

        参数:
        df: pd.DataFrame - 包含OHLCV数据的DataFrame
        """
        self.df = df.copy()
        # 确保列名标准化
        self.df.columns = [col.lower() for col in self.df.columns]

    def check_date_index(self) -> Tuple[bool, str]:
        """
        检查日期索引
        - 确保索引为日期类型
        - 检查日期是否有重复
        - 确保日期按升序排列
        """
        try:
            # 确保索引为日期类型
            if not isinstance(self.df.index, pd.DatetimeIndex):
                try:
                    self.df.index = pd.to_datetime(self.df.index)
                except Exception as e:
                    return False, f"无法将索引转换为日期类型: {str(e)}"

            # 检查日期重复
            if self.df.index.duplicated().any():
                dup_dates = self.df.index[self.df.index.duplicated()].unique()
                return False, f"存在重复日期: {dup_dates}"

            # 检查日期排序
            if not self.df.index.is_monotonic_increasing:
                return False, "日期未按升序排列"

            return True, "日期索引检查通过"

        except Exception as e:
            return False, f"日期检查过程出错: {str(e)}"

    def check_price_validity(self) -> Tuple[bool, str]:
        """
        检查价格数据的有效性
        - 价格为非负数
        - 高低价关系合理
        - 异常涨跌幅
        """
        try:
            # 检查是否有负数或0价格
            for col in ['open', 'high', 'low', 'close']:
                if (self.df[col] <= 0).any():
                    invalid_dates = self.df[self.df[col] <= 0].index
                    return False, f"{col}列在{invalid_dates}存在非正数值"

            # 检查高低价关系
            price_errors = []
            if (self.df['high'] < self.df['low']).any():
                dates = self.df[self.df['high'] < self.df['low']].index
                price_errors.append(f"最高价小于最低价 在 {dates}")

            if (self.df['high'] < self.df['open']).any():
                dates = self.df[self.df['high'] < self.df['open']].index
                price_errors.append(f"最高价小于开盘价 在 {dates}")

            if (self.df['high'] < self.df['close']).any():
                dates = self.df[self.df['high'] < self.df['close']].index
                price_errors.append(f"最高价小于收盘价 在 {dates}")

            if (self.df['low'] > self.df['open']).any():
                dates = self.df[self.df['low'] > self.df['open']].index
                price_errors.append(f"最低价大于开盘价 在 {dates}")

            if (self.df['low'] > self.df['close']).any():
                dates = self.df[self.df['low'] > self.df['close']].index
                price_errors.append(f"最低价大于收盘价 在 {dates}")

            if price_errors:
                return False, "\n".join(price_errors)

            # 检查异常涨跌幅
            daily_returns = self.df['close'].pct_change()
            abnormal_returns = daily_returns[abs(daily_returns) > 0.1]  # 涨跌幅超过10%
            if not abnormal_returns.empty:
                return False, f"存在异常涨跌幅: \n{abnormal_returns}"

            return True, "价格数据检查通过"

        except Exception as e:
            return False, f"价格检查过程出错: {str(e)}"

    def check_volume_validity(self) -> Tuple[bool, str]:
        """
        检查成交量数据的有效性
        - 成交量为非负整数
        - 检查异常成交量
        """
        try:
            # 检查负数成交量
            if (self.df['volume'] < 0).any():
                invalid_dates = self.df[self.df['volume'] < 0].index
                return False, f"存在负数成交量 在 {invalid_dates}"

            # 检查成交量是否为整数
            if not self.df['volume'].dtype.kind in 'iu':  # i:整数, u:无符号整数
                return False, "成交量不是整数类型"

            # 检查异常成交量
            volume_mean = self.df['volume'].mean()
            volume_std = self.df['volume'].std()
            abnormal_volume = self.df[abs(self.df['volume'] - volume_mean) > 5 * volume_std]
            if not abnormal_volume.empty:
                return False, f"存在异常成交量: \n{abnormal_volume}"

            return True, "成交量数据检查通过"

        except Exception as e:
            return False, f"成交量检查过程出错: {str(e)}"

    def check_missing_values(self) -> Tuple[bool, str]:
        """
        检查缺失值
        """
        try:
            missing = self.df.isnull().sum()
            if missing.any():
                return False, f"存在缺失值:\n{missing[missing > 0]}"
            return True, "无缺失值"

        except Exception as e:
            return False, f"缺失值检查过程出错: {str(e)}"

    def check_trading_dates(self) -> Tuple[bool, str]:
        """
        检查交易日期的连续性
        """
        try:
            # 获取所有工作日
            all_workdays = pd.date_range(start=self.df.index.min(),
                                       end=self.df.index.max(),
                                       freq='B')

            # 检查缺失的交易日
            missing_dates = set(all_workdays) - set(self.df.index)
            if missing_dates:
                return False, f"缺失以下交易日:\n{sorted(missing_dates)}"

            return True, "交易日期检查通过"

        except Exception as e:
            return False, f"交易日期检查过程出错: {str(e)}"

    def clean_data(self) -> pd.DataFrame:
        """
        清洗数据
        - 排序
        - 删除重复日期
        - 删除异常数据
        """
        # 排序
        self.df = self.df.sort_index()

        # 删除重复日期
        self.df = self.df[~self.df.index.duplicated(keep='first')]

        # 删除非正常价格的数据
        self.df = self.df[
            (self.df['open'] > 0) &
            (self.df['high'] > 0) &
            (self.df['low'] > 0) &
            (self.df['close'] > 0) &
            (self.df['high'] >= self.df['low']) &
            (self.df['high'] >= self.df['open']) &
            (self.df['high'] >= self.df['close']) &
            (self.df['low'] <= self.df['open']) &
            (self.df['low'] <= self.df['close'])
        ]

        # 删除非正常成交量的数据
        volume_mean = self.df['volume'].mean()
        volume_std = self.df['volume'].std()
        self.df = self.df[abs(self.df['volume'] - volume_mean) <= 5 * volume_std]

        return self.df

    def run_all_checks(self) -> None:
        """
        运行所有检查
        """
        checks = [
            ('日期索引检查', self.check_date_index()),
            ('价格数据检查', self.check_price_validity()),
            ('成交量数据检查', self.check_volume_validity()),
            ('缺失值检查', self.check_missing_values()),
            ('交易日期检查', self.check_trading_dates())
        ]

        print("数据检查报告:")
        print("=" * 50)
        for check_name, (status, message) in checks:
            print(f"\n{check_name}:")
            print("状态:", "通过" if status else "失败")
            print("详情:", message)
        print("\n" + "=" * 50)

# 使用示例
if __name__ == "__main__":
    # 读取数据
    df = pd.read_csv('stock_1d.csv', index_col='Date', parse_dates=True)

    # 创建清洗器实例
    cleaner = DataCleaner(df)

    # 运行所有检查
    cleaner.run_all_checks()

    # 清洗数据
    clean_df = cleaner.clean_data()

    print("\n清洗后的数据概要:")
    print(clean_df.describe())

    # 保存清洗后的数据
    clean_df.to_csv('clean_data.csv')

数据检查报告:

日期索引检查:
状态: 通过
详情: 日期索引检查通过

价格数据检查:
状态: 失败
详情: 存在异常涨跌幅: 
Date
2020-03-13    0.119808
2020-03-16   -0.128647
2020-03-24    0.100326
2020-07-31    0.104689
Name: close, dtype: float64

成交量数据检查:
状态: 失败
详情: 存在异常成交量: 
                 close        high        low        open     volume
Date                                                                
2020-02-28   66.338745   67.564279  62.215630   62.431618  426510000
2020-03-12   60.240219   65.523343  60.184404   62.111276  418474000
2020-03-20   55.631737   61.113855  55.330813   59.985395  401693200
2020-07-31  103.427498  103.578365  98.137370  100.142463  374336800

缺失值检查:
状态: 通过
详情: 无缺失值

交易日期检查:
状态: 失败
详情: 缺失以下交易日:
[Timestamp('2020-01-20 00:00:00'), Timestamp('2020-02-17 00:00:00'), Timestamp('2020-04-10 00:00:00'), Timestamp('2020-05-25 00:00:00'), Timestamp('2020-07-03 00:00:00'), Timestamp('2020-09-07 00:00:00'), Timestamp('2020-11-26 00:00:00'), Timestamp('2020-12-25 00:00:00'), Timestamp('2021-01-01 00:00:00'), Tim

In [5]:
class DataCleaner:
    """股票数据清洗类"""

    def __init__(self, df: pd.DataFrame, price_limit: float = None):
        """
        初始化

        参数:
        df: pd.DataFrame - 包含OHLCV数据的DataFrame
        price_limit: float - 涨跌幅限制，如0.1表示10%涨跌幅限制，None表示无涨跌幅限制
        """
        self.df = df.copy()
        # 确保列名标准化
        self.df.columns = [col.lower() for col in self.df.columns]
        self.price_limit = price_limit


    def check_date_index(self) -> Tuple[bool, str]:
        """
        检查日期索引
        - 确保索引为日期类型
        - 检查日期是否有重复
        - 确保日期按升序排列
        """
        try:
            # 确保索引为日期类型
            if not isinstance(self.df.index, pd.DatetimeIndex):
                try:
                    self.df.index = pd.to_datetime(self.df.index)
                except Exception as e:
                    return False, f"无法将索引转换为日期类型: {str(e)}"

            # 检查日期重复
            if self.df.index.duplicated().any():
                dup_dates = self.df.index[self.df.index.duplicated()].unique()
                return False, f"存在重复日期: {dup_dates}"

            # 检查日期排序
            if not self.df.index.is_monotonic_increasing:
                return False, "日期未按升序排列"

            return True, "日期索引检查通过"

        except Exception as e:
            return False, f"日期检查过程出错: {str(e)}"



    def check_volume_validity(self) -> Tuple[bool, str]:
        """
        检查成交量数据的有效性
        - 成交量为非负整数
        - 检查异常成交量
        """
        try:
            # 检查负数成交量
            if (self.df['volume'] < 0).any():
                invalid_dates = self.df[self.df['volume'] < 0].index
                return False, f"存在负数成交量 在 {invalid_dates}"

            # 检查成交量是否为整数
            if not self.df['volume'].dtype.kind in 'iu':  # i:整数, u:无符号整数
                return False, "成交量不是整数类型"

            # 检查异常成交量
            volume_mean = self.df['volume'].mean()
            volume_std = self.df['volume'].std()
            abnormal_volume = self.df[abs(self.df['volume'] - volume_mean) > 5 * volume_std]
            if not abnormal_volume.empty:
                return False, f"存在异常成交量: \n{abnormal_volume}"

            return True, "成交量数据检查通过"

        except Exception as e:
            return False, f"成交量检查过程出错: {str(e)}"


    def check_missing_values(self) -> Tuple[bool, str]:
        """
        检查缺失值
        """
        try:
            missing = self.df.isnull().sum()
            if missing.any():
                return False, f"存在缺失值:\n{missing[missing > 0]}"
            return True, "无缺失值"

        except Exception as e:
            return False, f"缺失值检查过程出错: {str(e)}"


    def check_trading_dates(self) -> Tuple[bool, str]:
        """
        检查交易日期的连续性
        """
        try:
            # 获取所有工作日
            all_workdays = pd.date_range(start=self.df.index.min(),
                                       end=self.df.index.max(),
                                       freq='B')

            # 检查缺失的交易日
            missing_dates = set(all_workdays) - set(self.df.index)
            if missing_dates:
                return False, f"缺失以下交易日:\n{sorted(missing_dates)}"

            return True, "交易日期检查通过"

        except Exception as e:
            return False, f"交易日期检查过程出错: {str(e)}"

    def check_price_limit(self) -> Tuple[bool, str]:
        """
        检查涨跌幅限制

        返回:
        Tuple[bool, str] - (检查是否通过, 详细信息)
        """
        if self.price_limit is None:
            return True, "无涨跌幅限制"

        try:
            # 计算理论涨停价和跌停价
            prev_close = self.df['close'].shift(1)
            up_limit = prev_close * (1 + self.price_limit)
            down_limit = prev_close * (1 - self.price_limit)

            # 检查是否超过涨跌幅限制
            violations = []

            # 检查最高价是否超过涨停
            high_violations = self.df[self.df['high'] > up_limit]
            if not high_violations.empty:
                violations.append(f"以下日期最高价超过涨停价:\n{high_violations[['high']].join(up_limit.rename('涨停价'))}")

            # 检查最低价是否超过跌停
            low_violations = self.df[self.df['low'] < down_limit]
            if not low_violations.empty:
                violations.append(f"以下日期最低价超过跌停价:\n{low_violations[['low']].join(down_limit.rename('跌停价'))}")

            # 检查OHLC是否都在涨跌停范围内
            for col in ['open', 'high', 'low', 'close']:
                over_limit = self.df[
                    (self.df[col] > up_limit) |
                    (self.df[col] < down_limit)
                ]
                if not over_limit.empty:
                    violations.append(f"以下日期{col}价格超出涨跌停范围:\n{over_limit[[col]]}")

            if violations:
                return False, "\n".join(violations)

            return True, f"所有价格都在{self.price_limit*100}%涨跌幅限制范围内"

        except Exception as e:
            return False, f"涨跌幅检查过程出错: {str(e)}"

    def check_price_validity(self) -> Tuple[bool, str]:
        """
        检查价格数据的有效性
        - 价格为非负数
        - 高低价关系合理
        """
        try:
            # 检查是否有负数或0价格
            for col in ['open', 'high', 'low', 'close']:
                if (self.df[col] <= 0).any():
                    invalid_dates = self.df[self.df[col] <= 0].index
                    return False, f"{col}列在{invalid_dates}存在非正数值"

            # 检查高低价关系
            price_errors = []
            if (self.df['high'] < self.df['low']).any():
                dates = self.df[self.df['high'] < self.df['low']].index
                price_errors.append(f"最高价小于最低价 在 {dates}")

            if (self.df['high'] < self.df['open']).any():
                dates = self.df[self.df['high'] < self.df['open']].index
                price_errors.append(f"最高价小于开盘价 在 {dates}")

            if (self.df['high'] < self.df['close']).any():
                dates = self.df[self.df['high'] < self.df['close']].index
                price_errors.append(f"最高价小于收盘价 在 {dates}")

            if (self.df['low'] > self.df['open']).any():
                dates = self.df[self.df['low'] > self.df['open']].index
                price_errors.append(f"最低价大于开盘价 在 {dates}")

            if (self.df['low'] > self.df['close']).any():
                dates = self.df[self.df['low'] > self.df['close']].index
                price_errors.append(f"最低价大于收盘价 在 {dates}")

            if price_errors:
                return False, "\n".join(price_errors)

            return True, "价格数据检查通过"

        except Exception as e:
            return False, f"价格检查过程出错: {str(e)}"

    def clean_data(self) -> pd.DataFrame:
        """
        清洗数据
        - 排序
        - 删除重复日期
        - 删除异常数据
        - 删除超出涨跌幅限制的数据
        """
        # 排序
        self.df = self.df.sort_index()

        # 删除重复日期
        self.df = self.df[~self.df.index.duplicated(keep='first')]

        # 删除非正常价格的数据
        self.df = self.df[
            (self.df['open'] > 0) &
            (self.df['high'] > 0) &
            (self.df['low'] > 0) &
            (self.df['close'] > 0) &
            (self.df['high'] >= self.df['low']) &
            (self.df['high'] >= self.df['open']) &
            (self.df['high'] >= self.df['close']) &
            (self.df['low'] <= self.df['open']) &
            (self.df['low'] <= self.df['close'])
        ]

        # 删除超出涨跌幅限制的数据
        if self.price_limit is not None:
            prev_close = self.df['close'].shift(1)
            up_limit = prev_close * (1 + self.price_limit)
            down_limit = prev_close * (1 - self.price_limit)

            self.df = self.df[
                (self.df['open'].between(down_limit, up_limit)) &
                (self.df['high'].between(down_limit, up_limit)) &
                (self.df['low'].between(down_limit, up_limit)) &
                (self.df['close'].between(down_limit, up_limit))
            ]

        # 删除非正常成交量的数据
        volume_mean = self.df['volume'].mean()
        volume_std = self.df['volume'].std()
        self.df = self.df[abs(self.df['volume'] - volume_mean) <= 5 * volume_std]

        return self.df

    def run_all_checks(self) -> None:
        """
        运行所有检查
        """
        checks = [
            ('日期索引检查', self.check_date_index()),
            ('价格数据检查', self.check_price_validity()),
            ('涨跌幅检查', self.check_price_limit()),
            ('成交量数据检查', self.check_volume_validity()),
            ('缺失值检查', self.check_missing_values()),
            ('交易日期检查', self.check_trading_dates())
        ]

        print("数据检查报告:")
        print("=" * 50)
        for check_name, (status, message) in checks:
            print(f"\n{check_name}:")
            print("状态:", "通过" if status else "失败")
            print("详情:", message)
        print("\n" + "=" * 50)

# 使用示例
if __name__ == "__main__":
    # 读取数据
    df = pd.read_csv('stock_1d.csv', index_col='Date', parse_dates=True)

    # 创建不同涨跌幅限制的清洗器实例

    # 无涨跌幅限制
    cleaner_no_limit = DataCleaner(df, price_limit=None)
    print("\n无涨跌幅限制的检查结果:")
    cleaner_no_limit.run_all_checks()

#    # 10%涨跌幅限制
#    cleaner_10pct = DataCleaner(df, price_limit=0.1)
#    print("\n10%涨跌幅限制的检查结果:")
#    cleaner_10pct.run_all_checks()
#
#    # 20%涨跌幅限制
#    cleaner_20pct = DataCleaner(df, price_limit=0.2)
#    print("\n20%涨跌幅限制的检查结果:")
#    cleaner_20pct.run_all_checks()
#
#    # 30%涨跌幅限制
#    cleaner_30pct = DataCleaner(df, price_limit=0.3)
#    print("\n30%涨跌幅限制的检查结果:")
#    cleaner_30pct.run_all_checks()


无涨跌幅限制的检查结果:
数据检查报告:

日期索引检查:
状态: 通过
详情: 日期索引检查通过

价格数据检查:
状态: 通过
详情: 价格数据检查通过

涨跌幅检查:
状态: 通过
详情: 无涨跌幅限制

成交量数据检查:
状态: 失败
详情: 存在异常成交量: 
                 close        high        low        open     volume
Date                                                                
2020-02-28   66.338745   67.564279  62.215630   62.431618  426510000
2020-03-12   60.240219   65.523343  60.184404   62.111276  418474000
2020-03-20   55.631737   61.113855  55.330813   59.985395  401693200
2020-07-31  103.427498  103.578365  98.137370  100.142463  374336800

缺失值检查:
状态: 通过
详情: 无缺失值

交易日期检查:
状态: 失败
详情: 缺失以下交易日:
[Timestamp('2020-01-20 00:00:00'), Timestamp('2020-02-17 00:00:00'), Timestamp('2020-04-10 00:00:00'), Timestamp('2020-05-25 00:00:00'), Timestamp('2020-07-03 00:00:00'), Timestamp('2020-09-07 00:00:00'), Timestamp('2020-11-26 00:00:00'), Timestamp('2020-12-25 00:00:00'), Timestamp('2021-01-01 00:00:00'), Timestamp('2021-01-18 00:00:00'), Timestamp('2021-02-15 00:00:00'), Timestamp('2021-04-02

In [6]:
!pip install python-dateutil
!pip install borax
!pip install holidays



In [7]:
from typing import Tuple, List, Dict
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date
from pandas.tseries.offsets import CustomBusinessDay
from dateutil.relativedelta import MO, TH
from borax.calendars.lunardate import LunarDate
#import holidays
import holidays as holidays_lib

class MarketCalendar:
    """市场交易日历类"""

    @staticmethod
    def get_lunar_holiday(year: int, month: int, day: int, days: int = 1) -> List[datetime]:
        """
        获取农历假期对应的公历日期

        参数:
        year: int - 年份
        month: int - 农历月份
        day: int - 农历日期
        days: int - 假期天数

        返回:
        List[datetime] - 假期日期列表（公历）
        """
        lunar_date = LunarDate(year, month, day)
        solar_date = lunar_date.to_solar_date()
        return [datetime.combine(solar_date + timedelta(days=i), datetime.min.time())
                for i in range(days)]

#    def get_us_holidays(year: int) -> List[datetime]:
#        """
#        获取美股市场假期
#
#        参数:
#        year: int - 年份
#
#        返回:
#        List[datetime] - 假期日期列表
#        """
#        # 使用holidays库获取美国联邦假期
#        us_holiday_dict = holidays.US(years=year)
#
#        # 获取基本假期
#        holiday_dates = set()
#
#        # 添加固定假期和特殊计算的假期
#        for date_obj, name in us_holiday_dict.items():
#            # 将date对象转换为datetime对象
#            if isinstance(date_obj, datetime):
#                holiday_date = date_obj
#            else:
#                holiday_date = datetime.combine(date_obj, datetime.min.time())
#
#            # 排除非交易所假期
#            if name in [
#                "New Year's Day",
#                "Martin Luther King Jr. Day",
#                "Presidents' Day",
#                "Good Friday",  # 需要特殊处理
#                "Memorial Day",
#                "Juneteenth National Independence Day",
#                "Independence Day",
#                "Labor Day",
#                "Thanksgiving",
#                "Christmas Day"
#            ]:
#                holiday_dates.add(holiday_date)  # 使用转换后的holiday_date
#
#        # 特殊处理:
#        # 1. 如果假期在周六，则周五休市
#        # 2. 如果假期在周日，则周一休市
#        final_dates = set()
#        for date_obj in holiday_dates:
#            if date_obj.weekday() == 5:  # 周六
#                final_dates.add(date_obj - timedelta(days=1))
#            elif date_obj.weekday() == 6:  # 周日
#                final_dates.add(date_obj + timedelta(days=1))
#            else:
#                final_dates.add(date_obj)
#
#        # 添加耶稣受难日（Good Friday）
#        def get_easter_sunday(year):
#            a = year % 19
#            b = year // 100
#            c = year % 100
#            d = b // 4
#            e = b % 4
#            f = (b + 8) // 25
#            g = (b - f + 1) // 3
#            h = (19 * a + b - d - g + 15) % 30
#            i = c // 4
#            k = c % 4
#            l = (32 + 2 * e + 2 * i - h - k) % 7
#            m = (a + 11 * h + 22 * l) // 451
#            month = (h + l - 7 * m + 114) // 31
#            day = ((h + l - 7 * m + 114) % 31) + 1
#            return datetime(year, month, day)
#
#        easter_sunday = get_easter_sunday(year)
#        good_friday = easter_sunday - timedelta(days=2)
#        final_dates.add(good_friday)
#
#        return sorted(list(final_dates))

    @staticmethod
    def get_hk_holidays(year: int) -> List[datetime]:
        """
        获取港股市场假期

        参数:
        year: int - 年份

        返回:
        List[datetime] - 假期日期列表
        """
        holidays = []

        # 固定日期的假期
        fixed_holidays = {
            "NEW_YEARS_DAY": [(1, 1, 1)],  # (月,日,天数)
            "LABOR_DAY": [(5, 1, 1)],
            "HONG_KONG_SAR_DAY": [(7, 1, 1)],
            "NATIONAL_DAY": [(10, 1, 1)],
            "CHRISTMAS": [(12, 25, 2)],  # 圣诞节和节礼日
        }

        # 添加固定假期
        for dates in fixed_holidays.values():
            for month, day, days in dates:
                holiday_date = datetime(year, month, day)
                for i in range(days):
                    holidays.append(holiday_date + timedelta(days=i))

        # 农历假期
        lunar_holidays = {
            "CHINESE_NEW_YEAR": [(1, 1, 3)],  # 春节
            "CHING_MING_FESTIVAL": [(3, 5, 1)],  # 清明节
            "DRAGON_BOAT_FESTIVAL": [(5, 5, 1)],  # 端午节
            "MID_AUTUMN_FESTIVAL": [(8, 15, 1)],  # 中秋节
            "CHUNG_YEUNG_FESTIVAL": [(9, 9, 1)],  # 重阳节
        }

        # 添加农历假期
        for dates in lunar_holidays.values():
            for month, day, days in dates:
                holidays.extend(MarketCalendar.get_lunar_holiday(year, month, day, days))

        # 复活节和耶稣受难日
        def get_easter_sunday(year):
            a = year % 19
            b = year // 100
            c = year % 100
            d = b // 4
            e = b % 4
            f = (b + 8) // 25
            g = (b - f + 1) // 3
            h = (19 * a + b - d - g + 15) % 30
            i = c // 4
            k = c % 4
            l = (32 + 2 * e + 2 * i - h - k) % 7
            m = (a + 11 * h + 22 * l) // 451
            month = (h + l - 7 * m + 114) // 31
            day = ((h + l - 7 * m + 114) % 31) + 1
            return datetime(year, month, day)

        easter_sunday = get_easter_sunday(year)
        good_friday = easter_sunday - timedelta(days=2)
        easter_monday = easter_sunday + timedelta(days=1)
        holidays.extend([good_friday, easter_monday])

        return sorted(list(set(holidays)))


    @staticmethod
    def get_us_holidays(year: int) -> List[datetime]:
        """
        获取美股市场假期

        参数:
        year: int - 年份

        返回:
        List[datetime] - 假期日期列表
        """
        try:
            # 创建美国假期日历实例
            us_calendar = holidays_lib.country_holidays('US', years=year)

            # 获取基本假期
            holiday_dates = set()

            # 添加固定假期和特殊计算的假期
            for date_obj, name in us_calendar.items():
                # 将date对象转换为datetime对象
                if isinstance(date_obj, datetime):
                    holiday_date = date_obj
                else:
                    holiday_date = datetime.combine(date_obj, datetime.min.time())

                # 排除非交易所假期
                if name in [
                    "New Year's Day",
                    "Martin Luther King Jr. Day",
                    "Presidents' Day",
                    "Good Friday",  # 需要特殊处理
                    "Memorial Day",
                    "Juneteenth National Independence Day",
                    "Independence Day",
                    "Labor Day",
                    "Thanksgiving",
                    "Christmas Day"
                ]:
                    holiday_dates.add(holiday_date)

            # 特殊处理:
            # 1. 如果假期在周六，则周五休市
            # 2. 如果假期在周日，则周一休市
            final_dates = set()
            for date_obj in holiday_dates:
                if date_obj.weekday() == 5:  # 周六
                    final_dates.add(date_obj - timedelta(days=1))
                elif date_obj.weekday() == 6:  # 周日
                    final_dates.add(date_obj + timedelta(days=1))
                else:
                    final_dates.add(date_obj)

            # 添加耶稣受难日（Good Friday）
            def get_easter_sunday(year):
                a = year % 19
                b = year // 100
                c = year % 100
                d = b // 4
                e = b % 4
                f = (b + 8) // 25
                g = (b - f + 1) // 3
                h = (19 * a + b - d - g + 15) % 30
                i = c // 4
                k = c % 4
                l = (32 + 2 * e + 2 * i - h - k) % 7
                m = (a + 11 * h + 22 * l) // 451
                month = (h + l - 7 * m + 114) // 31
                day = ((h + l - 7 * m + 114) % 31) + 1
                return datetime(year, month, day)

            easter_sunday = get_easter_sunday(year)
            good_friday = easter_sunday - timedelta(days=2)
            final_dates.add(good_friday)

            return sorted(list(final_dates))

        except Exception as e:
            print(f"获取美国假期时出错: {str(e)}")  # 添加错误信息打印
            raise  # 重新抛出异常


    @staticmethod
    def get_cn_holidays(year: int) -> List[datetime]:
        """
        获取A股市场假期

        参数:
        year: int - 年份

        返回:
        List[datetime] - 假期日期列表
        """
        holidays = []

        # 固定日期的假期
        fixed_holidays = {
            "NEW_YEARS_DAY": [(1, 1, 1)],  # (月,日,天数)
            "LABOR_DAY": [(5, 1, 5)],  # 五一假期
            "NATIONAL_DAY": [(10, 1, 7)],  # 国庆节
        }

        # 添加固定假期
        for dates in fixed_holidays.values():
            for month, day, days in dates:
                holiday_date = datetime(year, month, day)
                for i in range(days):
                    holidays.append(holiday_date + timedelta(days=i))

        # 农历假期
        lunar_holidays = {
            "CHINESE_NEW_YEAR": [(1, 1, 7)],  # 春节
            "CHING_MING_FESTIVAL": [(3, 5, 3)],  # 清明节
            "DRAGON_BOAT_FESTIVAL": [(5, 5, 3)],  # 端午节
            "MID_AUTUMN_FESTIVAL": [(8, 15, 3)],  # 中秋节
        }

        # 添加农历假期
        for dates in lunar_holidays.values():
            for month, day, days in dates:
                holidays.extend(MarketCalendar.get_lunar_holiday(year, month, day, days))

        return sorted(list(set(holidays)))

    @staticmethod
    def get_market_holidays(market: str, year: int) -> List[datetime]:
        """
        获取指定市场和年份的假期列表

        参数:
        market: str - 市场类型 ('US', 'HK', 'CN')
        year: int - 年份

        返回:
        List[datetime] - 假期日期列表
        """
        #market_holiday_funcs = {
        #    'US': MarketCalendar.get_us_holidays,
        #    'HK': MarketCalendar.get_hk_holidays,
        #    'CN': MarketCalendar.get_cn_holidays
        #}
        #
        #if market not in market_holiday_funcs:
        #    raise ValueError(f"不支持的市场类型: {market}")
        #
        #return market_holiday_funcs[market](year)

        # 将市场类型转换为大写
        market = market.upper()

        # 使用对应的假期获取函数
        if market == 'US':
            return MarketCalendar.get_us_holidays(year)
        elif market == 'HK':
            return MarketCalendar.get_hk_holidays(year)
        elif market == 'CN':
            return MarketCalendar.get_cn_holidays(year)
        else:
            raise ValueError(f"不支持的市场类型: {market}")

class DataCleaner:
    """股票数据清洗类"""

    def __init__(self, df: pd.DataFrame, market: str = 'CN', price_limit: float = None):
        """
        初始化

        参数:
        df: pd.DataFrame - 包含OHLCV数据的DataFrame
        market: str - 市场类型 ('US', 'HK', 'CN')
        price_limit: float - 涨跌幅限制
        """
        self.df = df.copy()
        self.market = market.upper()
        self.price_limit = price_limit

        # 确保列名标准化
        self.df.columns = [col.lower() for col in self.df.columns]

        # 验证市场类型
        valid_markets = ['US', 'HK', 'CN']
        if self.market not in valid_markets:
            raise ValueError(f"市场类型必须是 {valid_markets} 之一")

        # 设置默认涨跌幅限制
        if price_limit is None:
            if market == 'CN':
                self.price_limit = 0.1  # A股默认10%涨跌幅
            elif market == 'HK':
                self.price_limit = None  # 港股无涨跌幅限制
            else:
                self.price_limit = None  # 美股无涨跌幅限制

# 使用示例
if __name__ == "__main__":
    # 读取数据
    df = pd.read_csv('stock_1d.csv', index_col='Date', parse_dates=True)

    # 测试不同市场的假期计算
    year = 2023
    markets = ['US', 'HK', 'CN']

    for market in markets:
        print(f"\n{market}市场 {year}年假期列表:")
        print("="*50)
        holidays = MarketCalendar.get_market_holidays(market, year)
        for holiday in holidays:
            print(holiday.strftime('%Y-%m-%d'))


US市场 2023年假期列表:
2023-01-02
2023-01-16
2023-04-07
2023-05-29
2023-06-19
2023-07-04
2023-09-04
2023-12-25

HK市场 2023年假期列表:
2023-01-01
2023-01-22
2023-01-23
2023-01-24
2023-04-07
2023-04-10
2023-04-24
2023-05-01
2023-06-22
2023-07-01
2023-09-29
2023-10-01
2023-10-23
2023-12-25
2023-12-26

CN市场 2023年假期列表:
2023-01-01
2023-01-22
2023-01-23
2023-01-24
2023-01-25
2023-01-26
2023-01-27
2023-01-28
2023-04-24
2023-04-25
2023-04-26
2023-05-01
2023-05-02
2023-05-03
2023-05-04
2023-05-05
2023-06-22
2023-06-23
2023-06-24
2023-09-29
2023-09-30
2023-10-01
2023-10-02
2023-10-03
2023-10-04
2023-10-05
2023-10-06
2023-10-07


In [8]:
class DataCleaner:
    """股票数据清洗类"""

    def __init__(self, df: pd.DataFrame, market: str = 'US', price_limit: float = None):
        """
        初始化

        参数:
        df: pd.DataFrame - 包含OHLCV数据的DataFrame
        market: str - 市场类型 ('US', 'HK', 'CN')，默认为'US'
        price_limit: float - 涨跌幅限制，如0.1表示10%涨跌幅限制，None表示无涨跌幅限制
        """
        self.df = df.copy()
        self.market = market.upper()
        # 确保列名标准化
        self.df.columns = [col.lower() for col in self.df.columns]

        # 验证市场类型
        valid_markets = ['US', 'HK', 'CN']
        if self.market not in valid_markets:
            raise ValueError(f"市场类型必须是 {valid_markets} 之一")

        # 设置默认涨跌幅限制
        if price_limit is None:
            if market == 'CN':
                self.price_limit = 0.1  # A股默认10%涨跌幅
            else:
                self.price_limit = None  # 其他市场默认无涨跌幅限制
        else:
            self.price_limit = price_limit


    def check_date_index(self) -> Tuple[bool, str]:
        """
        检查日期索引
        - 确保索引为日期类型
        - 检查日期是否有重复
        - 确保日期按升序排列
        """
        try:
            # 确保索引为日期类型
            if not isinstance(self.df.index, pd.DatetimeIndex):
                try:
                    self.df.index = pd.to_datetime(self.df.index)
                except Exception as e:
                    return False, f"无法将索引转换为日期类型: {str(e)}"

            # 检查日期重复
            if self.df.index.duplicated().any():
                dup_dates = self.df.index[self.df.index.duplicated()].unique()
                return False, f"存在重复日期: {dup_dates}"

            # 检查日期排序
            if not self.df.index.is_monotonic_increasing:
                return False, "日期未按升序排列"

            return True, "日期索引检查通过"

        except Exception as e:
            return False, f"日期检查过程出错: {str(e)}"



    def check_volume_validity(self) -> Tuple[bool, str]:
        """
        检查成交量数据的有效性
        - 成交量为非负整数
        - 检查异常成交量
        """
        try:
            # 检查负数成交量
            if (self.df['volume'] < 0).any():
                invalid_dates = self.df[self.df['volume'] < 0].index
                return False, f"存在负数成交量 在 {invalid_dates}"

            # 检查成交量是否为整数
            if not self.df['volume'].dtype.kind in 'iu':  # i:整数, u:无符号整数
                return False, "成交量不是整数类型"

            # 检查异常成交量
            volume_mean = self.df['volume'].mean()
            volume_std = self.df['volume'].std()
            abnormal_volume = self.df[abs(self.df['volume'] - volume_mean) > 5 * volume_std]
            if not abnormal_volume.empty:
                return False, f"存在异常成交量: \n{abnormal_volume}"

            return True, "成交量数据检查通过"

        except Exception as e:
            return False, f"成交量检查过程出错: {str(e)}"


    def check_missing_values(self) -> Tuple[bool, str]:
        """
        检查缺失值
        """
        try:
            missing = self.df.isnull().sum()
            if missing.any():
                return False, f"存在缺失值:\n{missing[missing > 0]}"
            return True, "无缺失值"

        except Exception as e:
            return False, f"缺失值检查过程出错: {str(e)}"


    def check_trading_dates(self) -> Tuple[bool, str]:
        """
        检查交易日期的连续性

        返回:
        Tuple[bool, str] - (检查是否通过, 详细信息)
        """
        try:
            # 获取数据的起止年份
            start_year = self.df.index.min().year
            end_year = self.df.index.max().year

            # 收集所有假期
            all_holidays = set()
            try:
                for year in range(start_year, end_year + 1):
                    holidays = MarketCalendar.get_market_holidays(self.market, year)
                    MarketCalendar.get_market_holidays(market, year)
                    all_holidays.update(holidays)
            except Exception as e:
                return False, f"获取假期数据失败: {str(e)}"

            # 获取所有工作日
            all_workdays = pd.date_range(
                start=self.df.index.min(),
                end=self.df.index.max(),
                freq='B'
            )

            # 排除节假日，得到应该交易的日期
            trading_days = set(all_workdays) - all_holidays

            # 检查缺失的交易日
            missing_dates = trading_days - set(self.df.index)

            # 检查多余的交易日（在节假日进行的交易）
            extra_dates = set(self.df.index) - trading_days

            messages = []
            if missing_dates:
                messages.append(f"缺失以下交易日:\n{sorted(missing_dates)}")

            if extra_dates:
                messages.append(f"以下日期为非交易日，但有交易数据:\n{sorted(extra_dates)}")

            if messages:
                return False, "\n".join(messages)

            return True, "交易日期检查通过"

        except Exception as e:
            return False, f"交易日期检查过程出错: {str(e)}"

    def check_price_limit(self) -> Tuple[bool, str]:
        """
        检查涨跌幅限制

        返回:
        Tuple[bool, str] - (检查是否通过, 详细信息)
        """
        if self.price_limit is None:
            return True, "无涨跌幅限制"

        try:
            # 计算理论涨停价和跌停价
            prev_close = self.df['close'].shift(1)
            up_limit = prev_close * (1 + self.price_limit)
            down_limit = prev_close * (1 - self.price_limit)

            # 检查是否超过涨跌幅限制
            violations = []

            # 检查最高价是否超过涨停
            high_violations = self.df[self.df['high'] > up_limit]
            if not high_violations.empty:
                violations.append(f"以下日期最高价超过涨停价:\n{high_violations[['high']].join(up_limit.rename('涨停价'))}")

            # 检查最低价是否超过跌停
            low_violations = self.df[self.df['low'] < down_limit]
            if not low_violations.empty:
                violations.append(f"以下日期最低价超过跌停价:\n{low_violations[['low']].join(down_limit.rename('跌停价'))}")

            # 检查OHLC是否都在涨跌停范围内
            for col in ['open', 'high', 'low', 'close']:
                over_limit = self.df[
                    (self.df[col] > up_limit) |
                    (self.df[col] < down_limit)
                ]
                if not over_limit.empty:
                    violations.append(f"以下日期{col}价格超出涨跌停范围:\n{over_limit[[col]]}")

            if violations:
                return False, "\n".join(violations)

            return True, f"所有价格都在{self.price_limit*100}%涨跌幅限制范围内"

        except Exception as e:
            return False, f"涨跌幅检查过程出错: {str(e)}"

    def check_price_validity(self) -> Tuple[bool, str]:
        """
        检查价格数据的有效性
        - 价格为非负数
        - 高低价关系合理
        """
        try:
            # 检查是否有负数或0价格
            for col in ['open', 'high', 'low', 'close']:
                if (self.df[col] <= 0).any():
                    invalid_dates = self.df[self.df[col] <= 0].index
                    return False, f"{col}列在{invalid_dates}存在非正数值"

            # 检查高低价关系
            price_errors = []
            if (self.df['high'] < self.df['low']).any():
                dates = self.df[self.df['high'] < self.df['low']].index
                price_errors.append(f"最高价小于最低价 在 {dates}")

            if (self.df['high'] < self.df['open']).any():
                dates = self.df[self.df['high'] < self.df['open']].index
                price_errors.append(f"最高价小于开盘价 在 {dates}")

            if (self.df['high'] < self.df['close']).any():
                dates = self.df[self.df['high'] < self.df['close']].index
                price_errors.append(f"最高价小于收盘价 在 {dates}")

            if (self.df['low'] > self.df['open']).any():
                dates = self.df[self.df['low'] > self.df['open']].index
                price_errors.append(f"最低价大于开盘价 在 {dates}")

            if (self.df['low'] > self.df['close']).any():
                dates = self.df[self.df['low'] > self.df['close']].index
                price_errors.append(f"最低价大于收盘价 在 {dates}")

            if price_errors:
                return False, "\n".join(price_errors)

            return True, "价格数据检查通过"

        except Exception as e:
            return False, f"价格检查过程出错: {str(e)}"

    def clean_data(self) -> pd.DataFrame:
        """
        清洗数据
        - 排序
        - 删除重复日期
        - 删除异常数据
        - 删除超出涨跌幅限制的数据
        """
        # 排序
        self.df = self.df.sort_index()

        # 删除重复日期
        self.df = self.df[~self.df.index.duplicated(keep='first')]

        # 删除非正常价格的数据
        self.df = self.df[
            (self.df['open'] > 0) &
            (self.df['high'] > 0) &
            (self.df['low'] > 0) &
            (self.df['close'] > 0) &
            (self.df['high'] >= self.df['low']) &
            (self.df['high'] >= self.df['open']) &
            (self.df['high'] >= self.df['close']) &
            (self.df['low'] <= self.df['open']) &
            (self.df['low'] <= self.df['close'])
        ]

        # 删除超出涨跌幅限制的数据
        if self.price_limit is not None:
            prev_close = self.df['close'].shift(1)
            up_limit = prev_close * (1 + self.price_limit)
            down_limit = prev_close * (1 - self.price_limit)

            self.df = self.df[
                (self.df['open'].between(down_limit, up_limit)) &
                (self.df['high'].between(down_limit, up_limit)) &
                (self.df['low'].between(down_limit, up_limit)) &
                (self.df['close'].between(down_limit, up_limit))
            ]

        # 删除非正常成交量的数据
        volume_mean = self.df['volume'].mean()
        volume_std = self.df['volume'].std()
        self.df = self.df[abs(self.df['volume'] - volume_mean) <= 5 * volume_std]

        return self.df

    def run_all_checks(self) -> None:
        """
        运行所有检查
        """
        checks = [
            ('日期索引检查', self.check_date_index()),
            ('价格数据检查', self.check_price_validity()),
            ('涨跌幅检查', self.check_price_limit()),
            ('成交量数据检查', self.check_volume_validity()),
            ('缺失值检查', self.check_missing_values()),
            ('交易日期检查', self.check_trading_dates())
        ]

        print("数据检查报告:")
        print("=" * 50)
        for check_name, (status, message) in checks:
            print(f"\n{check_name}:")
            print("状态:", "通过" if status else "失败")
            print("详情:", message)
        print("\n" + "=" * 50)

# 使用示例
if __name__ == "__main__":
    # 读取数据
    df = pd.read_csv('stock_1d.csv', index_col='Date', parse_dates=True)

    # 创建不同涨跌幅限制的清洗器实例

    # 无涨跌幅限制
    cleaner_no_limit = DataCleaner(df, market='US', price_limit=None)
    print("\n无涨跌幅限制的检查结果:")
    cleaner_no_limit.run_all_checks()

#    # 10%涨跌幅限制
#    cleaner_10pct = DataCleaner(df, price_limit=0.1)
#    print("\n10%涨跌幅限制的检查结果:")
#    cleaner_10pct.run_all_checks()
#
#    # 20%涨跌幅限制
#    cleaner_20pct = DataCleaner(df, price_limit=0.2)
#    print("\n20%涨跌幅限制的检查结果:")
#    cleaner_20pct.run_all_checks()
#
#    # 30%涨跌幅限制
#    cleaner_30pct = DataCleaner(df, price_limit=0.3)
#    print("\n30%涨跌幅限制的检查结果:")
#    cleaner_30pct.run_all_checks()


无涨跌幅限制的检查结果:
数据检查报告:

日期索引检查:
状态: 通过
详情: 日期索引检查通过

价格数据检查:
状态: 通过
详情: 价格数据检查通过

涨跌幅检查:
状态: 通过
详情: 无涨跌幅限制

成交量数据检查:
状态: 失败
详情: 存在异常成交量: 
                 close        high        low        open     volume
Date                                                                
2020-02-28   66.338745   67.564279  62.215630   62.431618  426510000
2020-03-12   60.240219   65.523343  60.184404   62.111276  418474000
2020-03-20   55.631737   61.113855  55.330813   59.985395  401693200
2020-07-31  103.427498  103.578365  98.137370  100.142463  374336800

缺失值检查:
状态: 通过
详情: 无缺失值

交易日期检查:
状态: 失败
详情: 缺失以下交易日:
[Timestamp('2020-02-17 00:00:00'), Timestamp('2020-11-26 00:00:00'), Timestamp('2021-02-15 00:00:00'), Timestamp('2021-11-25 00:00:00'), Timestamp('2022-02-21 00:00:00'), Timestamp('2022-11-24 00:00:00'), Timestamp('2023-02-20 00:00:00')]
以下日期为非交易日，但有交易数据:
[Timestamp('2021-06-18 00:00:00'), Timestamp('2021-12-31 00:00:00')]

