<a href="https://colab.research.google.com/github/wannasmile/colab_code_note/blob/main/QUANT021.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!cp -rf /content/drive/MyDrive/hstech_stock_data /content/hstech_stock_data

Mounted at /content/drive


In [2]:
import pandas as pd

# 股票代码列表
hstech_stock_codes = [
    '1810.HK', '9988.HK', '0700.HK', '3690.HK', '9618.HK', '1024.HK',
    '0981.HK', '2015.HK', '9868.HK', '9999.HK', '2382.HK', '0992.HK',
    '0285.HK', '9626.HK', '1347.HK', '0020.HK', '2018.HK', '0268.HK',
    '1833.HK', '9698.HK', '9888.HK', '9961.HK', '0772.HK', '9866.HK',
    '0522.HK', '6618.HK', '3888.HK', '6690.HK', '6060.HK', '0909.HK'
]

# 指定保存数据的目录
data_directory = "hstech_stock_data"

stock_codes = hstech_stock_codes

for stock_code in stock_codes:
    try:
        features_filename = f"{data_directory}/{stock_code}_features.csv"
        label_filename = f"{data_directory}/{stock_code}_label.csv"
        features_label_filename = f"{data_directory}/{stock_code}_features_label.csv"

        # 读取两个 CSV 文件
        features_df = pd.read_csv(features_filename, index_col='Date', parse_dates=True)
        label_df = pd.read_csv(label_filename, index_col='Date', parse_dates=True)

        # 合并两个 DataFrame
        features_label_df = pd.merge(features_df, label_df[['label']], left_index=True, right_index=True, how='inner')

        # 将label列中的NaN值替换为'ING'
        features_label_df['label'] = features_label_df['label'].fillna('ING')

        # 打印标签分布
        print(f"\n{stock_code} 标签分布:")
        print(features_label_df['label'].value_counts(dropna=False))

        # 保存合并后的 DataFrame 到一个新的 CSV 文件
        features_label_df.to_csv(features_label_filename)

        print(f"处理完成: {stock_code}")

    except Exception as e:
        print(f"处理 {stock_code} 时出错: {str(e)}")
        continue

print("\n所有股票处理完成")


1810.HK 标签分布:
label
ING       461
BOTTOM    132
TOP       132
Name: count, dtype: int64
处理完成: 1810.HK

9988.HK 标签分布:
label
ING       442
BOTTOM    142
TOP       141
Name: count, dtype: int64
处理完成: 9988.HK

0700.HK 标签分布:
label
ING       472
BOTTOM    127
TOP       126
Name: count, dtype: int64
处理完成: 0700.HK

3690.HK 标签分布:
label
ING       457
BOTTOM    134
TOP       134
Name: count, dtype: int64
处理完成: 3690.HK

9618.HK 标签分布:
label
ING       442
BOTTOM    142
TOP       141
Name: count, dtype: int64
处理完成: 9618.HK

1024.HK 标签分布:
label
ING       474
BOTTOM    126
TOP       125
Name: count, dtype: int64
处理完成: 1024.HK

0981.HK 标签分布:
label
ING       483
BOTTOM    121
TOP       121
Name: count, dtype: int64
处理完成: 0981.HK

2015.HK 标签分布:
label
ING       445
BOTTOM    140
TOP       140
Name: count, dtype: int64
处理完成: 2015.HK

9868.HK 标签分布:
label
ING       439
BOTTOM    143
TOP       143
Name: count, dtype: int64
处理完成: 9868.HK

9999.HK 标签分布:
label
ING       436
BOTTOM    145
TOP       144
Name: coun

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight


class LabelEncoderSingleton:
    """
    单例模式的标签编码器，确保所有实例使用相同的编码映射
    """
    _instance = None
    _initialized = False
    _labels = None

    @classmethod
    def initialize(cls, labels):
        """
        初始化标签列表，必须在创建实例之前调用
        """
        if cls._instance is not None:
            raise RuntimeError("标签编码器已经被初始化，不能重新设置标签")
        cls._labels = labels

    def __new__(cls):
        if cls._labels is None:
            raise RuntimeError("请先使用 LabelEncoderSingleton.initialize(...) 初始化标签列表")

        if cls._instance is None:
            cls._instance = super(LabelEncoderSingleton, cls).__new__(cls)
        return cls._instance

    def __init__(self):
        if not self._initialized:
            self.encoder = LabelEncoder()
            # 使用预设的标签列表
            self.encoder.fit(self._labels)
            self._initialized = True

    def transform(self, y):
        return self.encoder.transform(y)

    def inverse_transform(self, y):
        return self.encoder.inverse_transform(y)

    @property
    def classes_(self):
        return self.encoder.classes_

class SlidingWindowModel:
    def __init__(self,
                 data: pd.DataFrame,
                 feature_columns: list,
                 label_column: str = 'label',
                 observation_window: int = 7,
                 performance_window: int = 2,
                 oot_samples: int = 30):
        """
        初始化滑动窗口模型
        """
        self.data = data.copy()
        self.feature_columns = feature_columns
        self.label_column = label_column
        self.observation_window = observation_window
        self.performance_window = performance_window
        self.oot_samples = oot_samples

        # 处理NaN值
        print("\n处理前的标签分布:")
        print(self.data[label_column].value_counts(dropna=False))

        # 移除标签为NaN的样本
        self.data = self.data.dropna(subset=[label_column])
        print("\n处理后的标签分布:")
        print(self.data[label_column].value_counts())

        # 使用单例标签编码器
        self.le = LabelEncoderSingleton()
        self.data['label_encoded'] = self.le.transform(self.data[label_column])

        # 保存类别标签
        self.classes_ = self.le.classes_

        # 初始化模型
        self.model = None

        # 打印初始化信息
        print(f"\n数据集大小: {len(self.data)}")
        print(f"特征数量: {len(self.feature_columns)}")
        print("\n标签编码映射:")
        for i, label in enumerate(self.le.classes_):
            print(f"{label}: {i}")

    def generate_single_sample(self, window_data: pd.DataFrame) -> np.ndarray:
        """
        从观察窗口生成特征

        现在会包含t时刻的特征，以及如果observation_window>1，
        还会包含t-1, t-2等时刻的特征和统计量
        """
        features = []

        # 获取最新时间点(t时刻)的特征
        latest_data = window_data.iloc[-1]
        for col in self.feature_columns:
            # 添加t时刻的特征值
            feature_value = latest_data[col]
            if pd.isna(feature_value):
                feature_value = window_data[col].dropna().iloc[-1] if not window_data[col].dropna().empty else 0
            features.append(feature_value)

            # 只有当观察窗口大于1时才添加历史数据和统计特征
            if self.observation_window > 2:  # 改为>2，因为=2时只需要t和t-1的原始值
                historical_data = window_data[col].iloc[:-1]  # 不包括最新的数据点

                # 添加统计特征
                features.extend([
                    historical_data.mean(),
                    historical_data.std() if len(historical_data.dropna()) > 1 else 0,
                    historical_data.max() if not historical_data.dropna().empty else 0,
                    historical_data.min() if not historical_data.dropna().empty else 0,
                    (feature_value - historical_data.iloc[0]) if not historical_data.isna().any() else 0
                ])
            elif self.observation_window == 2:  # 当窗口为2时，只添加t-1时刻的原始值
                t_minus_1_value = window_data[col].iloc[0]
                if pd.isna(t_minus_1_value):
                    t_minus_1_value = 0
                features.append(t_minus_1_value)

        return np.array(features)

    def generate_samples(self) -> tuple:
        """
        生成训练样本和OOT样本
        现在使用t时刻的特征来预测t时刻的标签
        """
        if not isinstance(self.data.index, pd.DatetimeIndex):
            self.data.index = pd.to_datetime(self.data.index)

        # 由于现在预测同期标签，不需要额外的performance_window
        total_samples_needed = self.observation_window

        # 计算OOT数据的起始位置
        oot_start_idx = len(self.data) - self.oot_samples - total_samples_needed

        # 划分训练集和OOT集
        train_data = self.data.iloc[:oot_start_idx].copy()
        oot_data = self.data.iloc[oot_start_idx:].copy()

        print(f"\n训练数据起止日期: {train_data.index[0]} 到 {train_data.index[-1]}")
        print(f"OOT数据起止日期: {oot_data.index[0]} 到 {oot_data.index[-1]}")

        X_train_list, y_train_list, dates_train = [], [], []
        X_oot_list, y_oot_list, dates_oot = [], [], []

        # 生成训练样本
        for i in range(len(train_data) - total_samples_needed + 1):
            current_idx = i + total_samples_needed - 1  # t时刻的索引

            # 确保t时刻有有效的标签
            if pd.isna(train_data.iloc[current_idx]['label_encoded']):
                continue

            # 获取观察窗口的数据（包括t时刻）
            window_data = train_data.iloc[i:i+self.observation_window][self.feature_columns]
            features = self.generate_single_sample(window_data)

            # 使用t时刻的标签
            label = train_data.iloc[current_idx]['label_encoded']

            X_train_list.append(features)
            y_train_list.append(label)
            dates_train.append(train_data.index[current_idx])

        # 生成OOT样本
        for i in range(len(oot_data) - total_samples_needed + 1):
            current_idx = i + total_samples_needed - 1  # t时刻的索引

            # 确保t时刻有有效的标签
            if pd.isna(oot_data.iloc[current_idx]['label_encoded']):
                continue

            # 获取观察窗口的数据（包括t时刻）
            window_data = oot_data.iloc[i:i+self.observation_window][self.feature_columns]
            features = self.generate_single_sample(window_data)

            # 使用t时刻的标签
            label = oot_data.iloc[current_idx]['label_encoded']

            X_oot_list.append(features)
            y_oot_list.append(label)
            dates_oot.append(oot_data.index[current_idx])

        X_train = np.array(X_train_list)
        y_train = np.array(y_train_list)
        X_oot = np.array(X_oot_list)
        y_oot = np.array(y_oot_list)

        print(f"\n训练样本形状: X_train: {X_train.shape}, y_train: {y_train.shape}")
        print(f"OOT样本形状: X_oot: {X_oot.shape}, y_oot: {y_oot.shape}")

        print("\n训练集标签分布:")
        train_dist = pd.Series(self.le.inverse_transform(y_train)).value_counts()
        print(train_dist)
        print("\nOOT集标签分布:")
        oot_dist = pd.Series(self.le.inverse_transform(y_oot)).value_counts()
        print(oot_dist)

        return X_train, y_train, X_oot, y_oot, dates_train, dates_oot

    def plot_feature_importance(self, feature_importance: pd.DataFrame, top_n: int = 20):
        """
        绘制特征重要性图
        """
        plt.figure(figsize=(12, 6))
        sns.barplot(x='importance', y='feature',
                   data=feature_importance.head(top_n))
        plt.title(f'Top {top_n} Most Important Features')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        plt.tight_layout()
        plt.show()


    def evaluate_with_threshold(self, y_true, y_prob, thresholds=None):
        """
        使用不同阈值评估模型性能

        参数:
        y_true: 真实标签
        y_prob: 预测概率 (n_samples, n_classes)
        thresholds: 阈值字典，例如 {'TOP': 0.4, 'BOTTOM': 0.4}
        """
        if thresholds is None:
            thresholds = {'TOP': 0.4, 'BOTTOM': 0.4}

        # 获取每个类别的索引
        label_indices = {label: idx for idx, label in enumerate(self.classes_)}

        # 使用阈值进行预测
        predictions = []
        for probs in y_prob:
            top_prob = probs[label_indices['TOP']]
            bottom_prob = probs[label_indices['BOTTOM']]

            if top_prob >= thresholds['TOP']:
                pred = 'TOP'
            elif bottom_prob >= thresholds['BOTTOM']:
                pred = 'BOTTOM'
            else:
                pred = 'ING'
            predictions.append(pred)

        # 转换为数值标签
        y_pred = self.le.transform(predictions)

        # 计算评估指标
        report = classification_report(y_true, y_pred,
                                    labels=range(len(self.classes_)),
                                    target_names=self.classes_)
        conf_matrix = confusion_matrix(y_true, y_pred)

        return {
            'predictions': predictions,
            'probabilities': y_prob,
            'report': report,
            'confusion_matrix': conf_matrix
        }

    def train_model(self, model_type: str = 'xgb', thresholds=None, sampling_method='weight', smote_target_ratios=None):
        """
        训练模型并评估性能
        """
        X_train, y_train, X_oot, y_oot, dates_train, dates_oot = self.generate_samples()

        if len(X_train) == 0 or len(X_oot) == 0:
            raise ValueError("生成的样本为空，请检查窗口大小设置")

        # 保存原始训练数据的索引
        original_train_indices = np.arange(len(X_train))

        # 打印原始类别分布
        print("\n原始训练集类别分布:")
        original_dist = pd.Series(self.le.inverse_transform(y_train)).value_counts()
        print(original_dist)

        # 根据选择的方法处理类别不平衡
        if sampling_method == 'smote':
            if smote_target_ratios is None:
                # 默认设置：将少数类的样本量增加到多数类的70%
                majority_class_count = original_dist.max()
                smote_target_ratios = {}
                for label in original_dist.index:
                    if label != original_dist.index[0]:  # 假设第一个是多数类
                        smote_target_ratios[label] = 0.7

            # 构建SMOTE的采样策略字典
            sampling_strategy = {}
            for label, ratio in smote_target_ratios.items():
                target_count = int(original_dist.max() * ratio)
                current_count = original_dist[label]
                if target_count > current_count:  # 只有需要增加样本时才设置
                    sampling_strategy[self.le.transform([label])[0]] = target_count

            print("\nSMOTE采样策略:")
            print(sampling_strategy)

            # 使用SMOTE进行重采样
            smote = SMOTE(random_state=42, sampling_strategy=sampling_strategy)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

            # 创建重采样后的日期数组
            # 对于原始样本，保持原始日期
            # 对于合成样本，使用最近的原始样本的日期
            n_synthetic = len(X_train_resampled) - len(X_train)
            dates_train_resampled = np.empty(len(X_train_resampled), dtype=object)

            # 复制原始日期
            dates_train_resampled[:len(dates_train)] = dates_train

            # 为合成样本分配日期（使用最近的原始样本的日期）
            for i in range(len(dates_train), len(dates_train_resampled)):
                # 找到最近的原始样本
                distances = np.linalg.norm(X_train_resampled[i] - X_train, axis=1)
                nearest_idx = np.argmin(distances)
                dates_train_resampled[i] = dates_train[nearest_idx]

            # 更新训练数据
            X_train = X_train_resampled
            y_train = y_train_resampled
            dates_train = dates_train_resampled

            # 打印重采样后的类别分布
            print("\nSMOTE重采样后的类别分布:")
            print(pd.Series(self.le.inverse_transform(y_train)).value_counts())

            # SMOTE后使用平衡的类别权重
            class_weights = 'balanced'

        else:  # sampling_method == 'weight'
            # 计算类别权重，给予少数类更高的权重
            class_weights = compute_class_weight(
                class_weight='balanced',
                classes=np.unique(y_train),
                y=y_train
            )
            class_weights = dict(zip(np.unique(y_train), class_weights))

            # 进一步调整权重，可以根据需要修改这些系数
            weight_multipliers = {
                self.le.transform(['TOP'])[0]: 2.0,    # 进一步增加TOP类的权重
                self.le.transform(['BOTTOM'])[0]: 2.0  # 进一步增加BOTTOM类的权重
            }

            for k, v in weight_multipliers.items():
                if k in class_weights:
                    class_weights[k] *= v

            print("\n使用的类别权重:")
            for k, v in class_weights.items():
                print(f"{self.le.inverse_transform([k])[0]}: {v:.2f}")

        # 创建和训练模型
        if model_type == 'rf':
            self.model = RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                class_weight=class_weights if sampling_method == 'weight' else None,
                n_jobs=-1
            )
        elif model_type == 'xgb':
            if sampling_method == 'weight':
                sample_weights = np.ones(len(y_train))
                for k, v in class_weights.items():
                    sample_weights[y_train == k] = v

                self.model = xgb.XGBClassifier(
                    n_estimators=100,
                    max_depth=10,
                    learning_rate=0.1,
                    random_state=42,
                    n_jobs=-1
                )
            else:
                self.model = xgb.XGBClassifier(
                    n_estimators=100,
                    max_depth=10,
                    learning_rate=0.1,
                    random_state=42,
                    n_jobs=-1
                )

        # 训练模型
        if model_type == 'xgb' and sampling_method == 'weight':
            self.model.fit(X_train, y_train, sample_weight=sample_weights)
        else:
            self.model.fit(X_train, y_train)

        # 获取预测概率
        train_proba = self.model.predict_proba(X_train)
        oot_proba = self.model.predict_proba(X_oot)

        # 评估训练集性能
        print("\n训练集性能:")
        train_results = self.evaluate_with_threshold(y_train, train_proba, thresholds)
        print(train_results['report'])

        # 评估OOT样本性能
        print("\nOOT样本性能:")
        oot_results = self.evaluate_with_threshold(y_oot, oot_proba, thresholds)
        print(oot_results['report'])
        print("\nOOT样本混淆矩阵:")
        print(oot_results['confusion_matrix'])

        # 特征重要性分析
        if model_type == 'xgb':
            feature_names = []
            for f in self.feature_columns:
                if self.observation_window > 1:
                    feature_names.extend([
                        f"{f}_latest",
                        f"{f}_mean",
                        f"{f}_std",
                        f"{f}_max",
                        f"{f}_min",
                        f"{f}_change"
                    ])
                else:
                    feature_names.append(f)  # 当observation_window为1时，只使用原始特征名

            importance = self.model.feature_importances_
            feature_importance = pd.DataFrame({
                'feature': feature_names[:len(importance)],
                'importance': importance
            })
            feature_importance = feature_importance.sort_values('importance', ascending=False)

            print("\n前20个最重要的特征:")
            print(feature_importance.head(20))

            # 绘制特征重要性图
            self.plot_feature_importance(feature_importance)


        # 创建包含概率的预测结果DataFrame
        train_predictions = pd.DataFrame({
            'date': dates_train,
            'predicted': train_results['predictions'],
            'true_label': self.le.inverse_transform(y_train)
        })

        for i, class_name in enumerate(self.classes_):
            train_predictions[f'prob_{class_name}'] = train_proba[:, i]

        oot_predictions = pd.DataFrame({
            'date': dates_oot,
            'predicted': oot_results['predictions'],
            'true_label': self.le.inverse_transform(y_oot)
        })

        for i, class_name in enumerate(self.classes_):
            oot_predictions[f'prob_{class_name}'] = oot_proba[:, i]

        return {
            'dates_train': dates_train,
            'dates_oot': dates_oot,
            'train_predictions': train_predictions,
            'oot_predictions': oot_predictions,
            'train_results': train_results,
            'oot_results': oot_results
        }


In [4]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime, timedelta

def process_multiple_stocks(data_directory: str,
                          observation_window: int = 1,
                          performance_window: int = 1,
                          oot_samples: int = 30,
                          initial_samples_to_skip: int = 30):
    """
    处理多个股票文件并生成合并的训练和OOT样本

    参数:
    data_directory: 存放股票文件的目录
    observation_window: 观察窗口大小
    performance_window: 性能窗口大小
    oot_samples: OOT样本数量
    initial_samples_to_skip: 开始时要跳过的样本数量
    """

    # 在创建任何实例之前初始化标签列表
    LabelEncoderSingleton.initialize(['TOP', 'BOTTOM', 'ING'])

    # 存储所有股票的训练和OOT样本
    all_train_samples = []
    all_oot_samples = []

    # 获取所有股票文件
    stock_files = [f for f in os.listdir(data_directory) if f.endswith('_features_label.csv')]

    print(f"发现 {len(stock_files)} 个股票文件")


    # 创建标签编码器单例实例
    label_encoder = LabelEncoderSingleton()
    print("\n统一的标签编码映射:")
    for i, label in enumerate(label_encoder.classes_):
        print(f"{label}: {i}")

    # 处理每个股票文件
    for stock_file in tqdm(stock_files, desc="处理股票文件"):
        try:
            # 读取数据
            stock_code = stock_file.split('_')[0]
            data = pd.read_csv(f"{data_directory}/{stock_file}", index_col='Date', parse_dates=True)

            # 删除指定列
            columns_to_remove = ['Close', 'High', 'Low', 'Open']
            for col in columns_to_remove:
                if col in data.columns:
                    data = data.drop(columns=[col])

            # 跳过开始的一些样本（处理初始NaN值）
            start_date = data.index[0] + timedelta(days=initial_samples_to_skip)
            data = data[data.index >= start_date]

            # 获取特征列
            feature_columns = data.columns.tolist()
            feature_columns.remove('label')

            # 创建模型实例
            model = SlidingWindowModel(
                data=data,
                feature_columns=feature_columns,
                observation_window=observation_window,
                performance_window=performance_window,
                oot_samples=oot_samples
            )

            # 生成样本
            X_train, y_train, X_oot, y_oot, dates_train, dates_oot = model.generate_samples()

            # 如果生成的样本为空，跳过这个股票
            if len(X_train) == 0 or len(X_oot) == 0:
                print(f"警告: 股票 {stock_code} 没有生成有效样本，跳过")
                continue

            # 创建训练样本DataFrame
            train_df = pd.DataFrame(X_train)
            train_df['date'] = dates_train
            train_df['stock_code'] = stock_code
            train_df['label'] = model.le.inverse_transform(y_train)

            # 创建OOT样本DataFrame
            oot_df = pd.DataFrame(X_oot)
            oot_df['date'] = dates_oot
            oot_df['stock_code'] = stock_code
            oot_df['label'] = model.le.inverse_transform(y_oot)

            # 添加到总样本列表
            all_train_samples.append(train_df)
            all_oot_samples.append(oot_df)

        except Exception as e:
            print(f"处理股票 {stock_code} 时出错: {str(e)}")
            continue

    # 合并所有样本
    if not all_train_samples or not all_oot_samples:
        raise ValueError("没有生成有效的样本")

    final_train = pd.concat(all_train_samples, axis=0)
    final_oot = pd.concat(all_oot_samples, axis=0)

    # 按日期排序
    final_train = final_train.sort_values('date')
    final_oot = final_oot.sort_values('date')

    print("\n最终样本统计:")
    print(f"训练样本数量: {len(final_train)}")
    print(f"OOT样本数量: {len(final_oot)}")
    print("\n训练样本标签分布:")
    print(final_train['label'].value_counts())
    print("\nOOT样本标签分布:")
    print(final_oot['label'].value_counts())

    return final_train, final_oot

# 使用示例
if __name__ == "__main__":
    try:
        # 设置参数
        data_directory = "/content/hstech_stock_data"  # 存放股票文件的目录
        observation_window = 2  # 使用t和t-1的指标
        performance_window = 1  # 预测t的label
        oot_samples = 30  # OOT样本数量
        initial_samples_to_skip = 30  # 跳过开始的30个样本

        # 处理所有股票文件
        final_train, final_oot = process_multiple_stocks(
            data_directory=data_directory,
            observation_window=observation_window,
            performance_window=performance_window,
            oot_samples=oot_samples,
            initial_samples_to_skip=initial_samples_to_skip
        )

        # 保存处理后的样本
        final_train.to_csv('hstech_combined_t_train_samples.csv', index=False)
        final_oot.to_csv('hstech_combined_t_oot_samples.csv', index=False)

        print("\n样本已保存到文件")

    except Exception as e:
        print(f"错误: {str(e)}")
        import traceback
        traceback.print_exc()

发现 30 个股票文件

统一的标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2


处理股票文件:   0%|          | 0/30 [00:00<?, ?it/s]


处理前的标签分布:
label
ING       453
TOP       127
BOTTOM    127
Name: count, dtype: int64

处理后的标签分布:
label
ING       453
TOP       127
BOTTOM    127
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:   3%|▎         | 1/30 [00:18<09:00, 18.62s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       428
TOP       123
BOTTOM    123
Name: count, dtype: int64

OOT集标签分布:
ING       23
TOP        4
BOTTOM     4
Name: count, dtype: int64

处理前的标签分布:
label
ING       425
TOP       141
BOTTOM    141
Name: count, dtype: int64

处理后的标签分布:
label
ING       425
TOP       141
BOTTOM    141
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:   7%|▋         | 2/30 [00:38<09:02, 19.36s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       402
TOP       136
BOTTOM    136
Name: count, dtype: int64

OOT集标签分布:
ING       22
BOTTOM     5
TOP        4
Name: count, dtype: int64

处理前的标签分布:
label
ING       425
TOP       141
BOTTOM    141
Name: count, dtype: int64

处理后的标签分布:
label
ING       425
TOP       141
BOTTOM    141
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  10%|█         | 3/30 [00:57<08:33, 19.02s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       400
TOP       137
BOTTOM    137
Name: count, dtype: int64

OOT集标签分布:
ING       23
TOP        4
BOTTOM     4
Name: count, dtype: int64

处理前的标签分布:
label
ING       453
BOTTOM    127
TOP       127
Name: count, dtype: int64

处理后的标签分布:
label
ING       453
BOTTOM    127
TOP       127
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  13%|█▎        | 4/30 [01:16<08:14, 19.03s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       436
TOP       119
BOTTOM    119
Name: count, dtype: int64

OOT集标签分布:
ING       16
TOP        8
BOTTOM     7
Name: count, dtype: int64

处理前的标签分布:
label
ING       461
TOP       123
BOTTOM    123
Name: count, dtype: int64

处理后的标签分布:
label
ING       461
TOP       123
BOTTOM    123
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  17%|█▋        | 5/30 [01:35<07:57, 19.10s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       440
TOP       117
BOTTOM    117
Name: count, dtype: int64

OOT集标签分布:
ING       19
TOP        6
BOTTOM     6
Name: count, dtype: int64

处理前的标签分布:
label
ING       426
TOP       141
BOTTOM    140
Name: count, dtype: int64

处理后的标签分布:
label
ING       426
TOP       141
BOTTOM    140
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  20%|██        | 6/30 [01:54<07:34, 18.95s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       404
TOP       135
BOTTOM    135
Name: count, dtype: int64

OOT集标签分布:
ING       20
TOP        6
BOTTOM     5
Name: count, dtype: int64

处理前的标签分布:
label
ING       469
BOTTOM    119
TOP       119
Name: count, dtype: int64

处理后的标签分布:
label
ING       469
BOTTOM    119
TOP       119
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  23%|██▎       | 7/30 [02:13<07:22, 19.24s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       449
BOTTOM    113
TOP       112
Name: count, dtype: int64

OOT集标签分布:
ING       19
BOTTOM     6
TOP        6
Name: count, dtype: int64

处理前的标签分布:
label
ING       459
TOP       124
BOTTOM    124
Name: count, dtype: int64

处理后的标签分布:
label
ING       459
TOP       124
BOTTOM    124
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  27%|██▋       | 8/30 [02:32<06:58, 19.03s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       435
TOP       120
BOTTOM    119
Name: count, dtype: int64

OOT集标签分布:
ING       22
BOTTOM     5
TOP        4
Name: count, dtype: int64

处理前的标签分布:
label
ING       434
TOP       137
BOTTOM    136
Name: count, dtype: int64

处理后的标签分布:
label
ING       434
TOP       137
BOTTOM    136
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  30%|███       | 9/30 [02:50<06:36, 18.87s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       411
TOP       132
BOTTOM    131
Name: count, dtype: int64

OOT集标签分布:
ING       21
BOTTOM     5
TOP        5
Name: count, dtype: int64

处理前的标签分布:
label
ING       434
TOP       137
BOTTOM    136
Name: count, dtype: int64

处理后的标签分布:
label
ING       434
TOP       137
BOTTOM    136
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  33%|███▎      | 10/30 [03:11<06:25, 19.26s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       412
TOP       131
BOTTOM    131
Name: count, dtype: int64

OOT集标签分布:
ING       20
TOP        6
BOTTOM     5
Name: count, dtype: int64

处理前的标签分布:
label
ING       466
TOP       121
BOTTOM    120
Name: count, dtype: int64

处理后的标签分布:
label
ING       466
TOP       121
BOTTOM    120
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  37%|███▋      | 11/30 [03:29<06:02, 19.08s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       441
TOP       117
BOTTOM    116
Name: count, dtype: int64

OOT集标签分布:
ING       23
BOTTOM     4
TOP        4
Name: count, dtype: int64

处理前的标签分布:
label
ING       431
TOP       138
BOTTOM    138
Name: count, dtype: int64

处理后的标签分布:
label
ING       431
TOP       138
BOTTOM    138
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  40%|████      | 12/30 [03:48<05:43, 19.07s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       409
TOP       133
BOTTOM    132
Name: count, dtype: int64

OOT集标签分布:
ING       21
TOP        5
BOTTOM     5
Name: count, dtype: int64

处理前的标签分布:
label
ING       449
TOP       129
BOTTOM    129
Name: count, dtype: int64

处理后的标签分布:
label
ING       449
TOP       129
BOTTOM    129
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  43%|████▎     | 13/30 [04:08<05:25, 19.16s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       426
TOP       124
BOTTOM    124
Name: count, dtype: int64

OOT集标签分布:
ING       21
TOP        5
BOTTOM     5
Name: count, dtype: int64

处理前的标签分布:
label
ING       457
TOP       125
BOTTOM    125
Name: count, dtype: int64

处理后的标签分布:
label
ING       457
TOP       125
BOTTOM    125
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  47%|████▋     | 14/30 [04:26<05:03, 18.98s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       434
TOP       120
BOTTOM    120
Name: count, dtype: int64

OOT集标签分布:
ING       22
BOTTOM     5
TOP        4
Name: count, dtype: int64

处理前的标签分布:
label
ING       444
TOP       132
BOTTOM    131
Name: count, dtype: int64

处理后的标签分布:
label
ING       444
TOP       132
BOTTOM    131
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  50%|█████     | 15/30 [04:47<04:50, 19.37s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       422
TOP       126
BOTTOM    126
Name: count, dtype: int64

OOT集标签分布:
ING       20
TOP        6
BOTTOM     5
Name: count, dtype: int64

处理前的标签分布:
label
ING       480
TOP       114
BOTTOM    113
Name: count, dtype: int64

处理后的标签分布:
label
ING       480
TOP       114
BOTTOM    113
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  53%|█████▎    | 16/30 [05:06<04:30, 19.33s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       458
TOP       108
BOTTOM    108
Name: count, dtype: int64

OOT集标签分布:
ING       20
TOP        6
BOTTOM     5
Name: count, dtype: int64

处理前的标签分布:
label
ING       420
TOP       144
BOTTOM    143
Name: count, dtype: int64

处理后的标签分布:
label
ING       420
TOP       144
BOTTOM    143
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  57%|█████▋    | 17/30 [05:26<04:15, 19.63s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       398
TOP       138
BOTTOM    138
Name: count, dtype: int64

OOT集标签分布:
ING       20
TOP        6
BOTTOM     5
Name: count, dtype: int64

处理前的标签分布:
label
ING       431
TOP       138
BOTTOM    138
Name: count, dtype: int64

处理后的标签分布:
label
ING       431
TOP       138
BOTTOM    138
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  60%|██████    | 18/30 [05:45<03:54, 19.54s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       412
TOP       131
BOTTOM    131
Name: count, dtype: int64

OOT集标签分布:
ING       17
TOP        7
BOTTOM     7
Name: count, dtype: int64

处理前的标签分布:
label
ING       461
TOP       123
BOTTOM    123
Name: count, dtype: int64

处理后的标签分布:
label
ING       461
TOP       123
BOTTOM    123
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  63%|██████▎   | 19/30 [06:07<03:41, 20.16s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       436
TOP       119
BOTTOM    119
Name: count, dtype: int64

OOT集标签分布:
ING       23
TOP        4
BOTTOM     4
Name: count, dtype: int64

处理前的标签分布:
label
ING       461
TOP       123
BOTTOM    123
Name: count, dtype: int64

处理后的标签分布:
label
ING       461
TOP       123
BOTTOM    123
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  67%|██████▋   | 20/30 [06:27<03:22, 20.20s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       444
TOP       115
BOTTOM    115
Name: count, dtype: int64

OOT集标签分布:
ING       15
TOP        8
BOTTOM     8
Name: count, dtype: int64

处理前的标签分布:
label
ING       450
TOP       129
BOTTOM    128
Name: count, dtype: int64

处理后的标签分布:
label
ING       450
TOP       129
BOTTOM    128
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  70%|███████   | 21/30 [06:47<02:59, 19.96s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       424
TOP       125
BOTTOM    125
Name: count, dtype: int64

OOT集标签分布:
ING       24
TOP        4
BOTTOM     3
Name: count, dtype: int64

处理前的标签分布:
label
ING       428
TOP       140
BOTTOM    139
Name: count, dtype: int64

处理后的标签分布:
label
ING       428
TOP       140
BOTTOM    139
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  73%|███████▎  | 22/30 [07:07<02:40, 20.10s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       403
TOP       136
BOTTOM    135
Name: count, dtype: int64

OOT集标签分布:
ING       24
TOP        4
BOTTOM     3
Name: count, dtype: int64

处理前的标签分布:
label
ING       471
BOTTOM    118
TOP       118
Name: count, dtype: int64

处理后的标签分布:
label
ING       471
BOTTOM    118
TOP       118
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  77%|███████▋  | 23/30 [07:26<02:19, 19.86s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       447
BOTTOM    114
TOP       113
Name: count, dtype: int64

OOT集标签分布:
ING       22
TOP        5
BOTTOM     4
Name: count, dtype: int64

处理前的标签分布:
label
ING       460
TOP       124
BOTTOM    123
Name: count, dtype: int64

处理后的标签分布:
label
ING       460
TOP       124
BOTTOM    123
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  80%|████████  | 24/30 [07:47<02:00, 20.09s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       438
TOP       118
BOTTOM    118
Name: count, dtype: int64

OOT集标签分布:
ING       21
BOTTOM     5
TOP        5
Name: count, dtype: int64

处理前的标签分布:
label
ING       436
TOP       136
BOTTOM    135
Name: count, dtype: int64

处理后的标签分布:
label
ING       436
TOP       136
BOTTOM    135
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  83%|████████▎ | 25/30 [08:06<01:39, 19.84s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       415
TOP       130
BOTTOM    129
Name: count, dtype: int64

OOT集标签分布:
ING       19
BOTTOM     6
TOP        6
Name: count, dtype: int64

处理前的标签分布:
label
ING       404
TOP       152
BOTTOM    151
Name: count, dtype: int64

处理后的标签分布:
label
ING       404
TOP       152
BOTTOM    151
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  87%|████████▋ | 26/30 [08:26<01:19, 19.87s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       382
TOP       146
BOTTOM    146
Name: count, dtype: int64

OOT集标签分布:
ING       20
TOP        6
BOTTOM     5
Name: count, dtype: int64

处理前的标签分布:
label
ING       468
TOP       120
BOTTOM    119
Name: count, dtype: int64

处理后的标签分布:
label
ING       468
TOP       120
BOTTOM    119
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  90%|█████████ | 27/30 [08:46<00:59, 19.88s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       445
TOP       115
BOTTOM    114
Name: count, dtype: int64

OOT集标签分布:
ING       22
TOP        5
BOTTOM     4
Name: count, dtype: int64

处理前的标签分布:
label
ING       438
TOP       135
BOTTOM    134
Name: count, dtype: int64

处理后的标签分布:
label
ING       438
TOP       135
BOTTOM    134
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  93%|█████████▎| 28/30 [09:05<00:39, 19.69s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       415
TOP       130
BOTTOM    129
Name: count, dtype: int64

OOT集标签分布:
ING       22
TOP        5
BOTTOM     4
Name: count, dtype: int64

处理前的标签分布:
label
ING       454
TOP       127
BOTTOM    126
Name: count, dtype: int64

处理后的标签分布:
label
ING       454
TOP       127
BOTTOM    126
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件:  97%|█████████▋| 29/30 [09:26<00:19, 19.96s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       432
TOP       121
BOTTOM    121
Name: count, dtype: int64

OOT集标签分布:
ING       20
TOP        6
BOTTOM     5
Name: count, dtype: int64

处理前的标签分布:
label
ING       458
TOP       125
BOTTOM    124
Name: count, dtype: int64

处理后的标签分布:
label
ING       458
TOP       125
BOTTOM    124
Name: count, dtype: int64

数据集大小: 707
特征数量: 514

标签编码映射:
BOTTOM: 0
ING: 1
TOP: 2

训练数据起止日期: 2022-05-16 00:00:00 到 2025-02-12 00:00:00
OOT数据起止日期: 2025-02-13 00:00:00 到 2025-03-28 00:00:00


处理股票文件: 100%|██████████| 30/30 [09:45<00:00, 19.53s/it]


训练样本形状: X_train: (674, 1028), y_train: (674,)
OOT样本形状: X_oot: (31, 1028), y_oot: (31,)

训练集标签分布:
ING       436
TOP       119
BOTTOM    119
Name: count, dtype: int64

OOT集标签分布:
ING       20
TOP        6
BOTTOM     5
Name: count, dtype: int64






最终样本统计:
训练样本数量: 20220
OOT样本数量: 930

训练样本标签分布:
label
ING       12734
TOP        3746
BOTTOM     3740
Name: count, dtype: int64

OOT样本标签分布:
label
ING       621
TOP       160
BOTTOM    149
Name: count, dtype: int64

样本已保存到文件


In [5]:
from google.colab import drive
drive.mount('/content/drive')

!cp -rf /content/hstech_combined_t_train_samples.csv /content/drive/MyDrive/
!cp -rf /content/hstech_combined_t_oot_samples.csv /content/drive/MyDrive/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
def train_and_evaluate_with_samples(final_train: pd.DataFrame,
                                  final_oot: pd.DataFrame,
                                  model_type: str = 'xgb',
                                  thresholds: dict = {'TOP': 0.4, 'BOTTOM': 0.4}):
    """
    使用合并好的样本训练模型并评估性能
    """
    print(f"\n=== 开始训练模型 ===")
    print(f"模型类型: {model_type}")
    print(f"预测阈值: {thresholds}")

    try:
        # 获取已初始化的标签编码器实例
        le = LabelEncoderSingleton()

        # 准备特征和标签
        feature_cols = [col for col in final_train.columns if col not in ['label', 'date', 'stock_code']]

        X_train = final_train[feature_cols].values
        y_train = le.transform(final_train['label'].values)

        X_oot = final_oot[feature_cols].values
        y_oot = le.transform(final_oot['label'].values)

        print(f"\n=== 样本统计 ===")
        print(f"特征数量: {len(feature_cols)}")
        print(f"训练集样本数: {len(X_train)}")
        print(f"OOT集样本数: {len(X_oot)}")
        print("\n训练集标签分布:")
        print(final_train['label'].value_counts())
        print("\nOOT集标签分布:")
        print(final_oot['label'].value_counts())

        # 创建模型
        if model_type == 'rf':
            model = RandomForestClassifier(
                n_estimators=100,
                max_depth=10,
                random_state=42,
                class_weight='balanced',
                n_jobs=-1
            )
        else:  # xgb
            model = xgb.XGBClassifier(
                n_estimators=100,
                max_depth=10,
                learning_rate=0.1,
                random_state=42,
                n_jobs=-1
            )

        print("\n开始训练模型...")
        model.fit(X_train, y_train)

        # 获取预测概率
        train_proba = model.predict_proba(X_train)
        oot_proba = model.predict_proba(X_oot)

        # 使用阈值进行预测
        def predict_with_threshold(probas, thresholds):
            predictions = []
            for proba in probas:
                if proba[0] >= thresholds['TOP']:  # TOP的概率
                    predictions.append(0)  # TOP的编码
                elif proba[1] >= thresholds['BOTTOM']:  # BOTTOM的概率
                    predictions.append(1)  # BOTTOM的编码
                else:
                    predictions.append(2)  # ING的编码
            return np.array(predictions)

        # 获取预测结果
        y_train_pred = predict_with_threshold(train_proba, thresholds)
        y_oot_pred = predict_with_threshold(oot_proba, thresholds)

        # 打印分类报告
        print("\n=== 模型性能评估 ===")
        print("\n训练集性能:")
        print(classification_report(
            y_train,
            y_train_pred,
            target_names=le.classes_
        ))

        print("\nOOT集性能:")
        print(classification_report(
            y_oot,
            y_oot_pred,
            target_names=le.classes_
        ))

        # 打印混淆矩阵
        print("\nOOT集混淆矩阵:")
        print(confusion_matrix(y_oot, y_oot_pred))

        # 特征重要性分析
        feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)

        print("\n=== 特征重要性（前10个） ===")
        print(feature_importance.head(10))

        # 保存预测结果
        train_predictions = pd.DataFrame({
            'date': final_train['date'],
            'true_label': final_train['label'],
            'predicted_label': le.inverse_transform(y_train_pred),
            'prob_TOP': train_proba[:, 0],
            'prob_BOTTOM': train_proba[:, 1],
            'prob_ING': train_proba[:, 2]
        })

        oot_predictions = pd.DataFrame({
            'date': final_oot['date'],
            'true_label': final_oot['label'],
            'predicted_label': le.inverse_transform(y_oot_pred),
            'prob_TOP': oot_proba[:, 0],
            'prob_BOTTOM': oot_proba[:, 1],
            'prob_ING': oot_proba[:, 2]
        })

        return {
            'model': model,
            'feature_importance': feature_importance,
            'train_predictions': train_predictions,
            'oot_predictions': oot_predictions,
            'train_metrics': classification_report(y_train, y_train_pred, output_dict=True),
            'oot_metrics': classification_report(y_oot, y_oot_pred, output_dict=True),
            'confusion_matrix': confusion_matrix(y_oot, y_oot_pred)
        }

    except Exception as e:
        print(f"\n训练过程出错: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

# 使用示例
if __name__ == "__main__":
    # 假设final_train和final_oot已经准备好

    # 训练和评估模型
    results = train_and_evaluate_with_samples(
        final_train=final_train,
        final_oot=final_oot,
        model_type='xgb',
        thresholds={'TOP': 0.4, 'BOTTOM': 0.4}
    )

    if results is not None:
        # 保存预测结果
        results['train_predictions'].to_csv('train_predictions.csv', index=False)
        results['oot_predictions'].to_csv('oot_predictions.csv', index=False)

        # 查看每个类别的具体预测效果
        print("\n=== 各类别预测统计 ===")
        oot_preds = results['oot_predictions']

        for label in ['TOP', 'BOTTOM', 'ING']:
            print(f"\n{label}类别统计:")
            mask = oot_preds['true_label'] == label
            subset = oot_preds[mask]
            print(f"样本数量: {len(subset)}")
            print("预测分布:")
            print(subset['predicted_label'].value_counts())
            print(f"平均预测概率:")
            print(f"TOP: {subset['prob_TOP'].mean():.3f}")
            print(f"BOTTOM: {subset['prob_BOTTOM'].mean():.3f}")
            print(f"ING: {subset['prob_ING'].mean():.3f}")


=== 开始训练模型 ===
模型类型: xgb
预测阈值: {'TOP': 0.4, 'BOTTOM': 0.4}

=== 样本统计 ===
特征数量: 1028
训练集样本数: 20220
OOT集样本数: 930

训练集标签分布:
label
ING       12734
TOP        3746
BOTTOM     3740
Name: count, dtype: int64

OOT集标签分布:
label
ING       621
TOP       160
BOTTOM    149
Name: count, dtype: int64

开始训练模型...

=== 模型性能评估 ===

训练集性能:
              precision    recall  f1-score   support

      BOTTOM       1.00      1.00      1.00      3740
         ING       1.00      1.00      1.00     12734
         TOP       1.00      1.00      1.00      3746

    accuracy                           1.00     20220
   macro avg       1.00      1.00      1.00     20220
weighted avg       1.00      1.00      1.00     20220


OOT集性能:
              precision    recall  f1-score   support

      BOTTOM       0.46      0.46      0.46       149
         ING       0.72      0.82      0.76       621
         TOP       0.50      0.22      0.30       160

    accuracy                           0.66       930
   macro avg    

In [7]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def analyze_threshold_performance(oot_predictions: pd.DataFrame,
                               threshold_range: np.ndarray = np.arange(0.1, 0.9, 0.1)):
    """
    分析不同阈值下的模型性能
    """
    print("\n=== 不同阈值下的性能分析 ===")

    # 为每个标签创建性能指标存储
    metrics = {
        'TOP': {'precision': [], 'recall': [], 'f1': [], 'support': []},
        'BOTTOM': {'precision': [], 'recall': [], 'f1': [], 'support': []},
        'ING': {'precision': [], 'recall': [], 'f1': [], 'support': []}
    }

    for threshold in threshold_range:
        print(f"\n阈值 = {threshold:.1f}")

        # 根据阈值生成预测
        predictions = []
        for _, row in oot_predictions.iterrows():
            if row['prob_TOP'] >= threshold:
                predictions.append('TOP')
            elif row['prob_BOTTOM'] >= threshold:
                predictions.append('BOTTOM')
            else:
                predictions.append('ING')

        # 计算每个类别的指标
        for label in ['TOP', 'BOTTOM', 'ING']:
            true_positives = sum((oot_predictions['true_label'] == label) &
                               (np.array(predictions) == label))
            false_positives = sum((oot_predictions['true_label'] != label) &
                                (np.array(predictions) == label))
            false_negatives = sum((oot_predictions['true_label'] == label) &
                                (np.array(predictions) != label))

            # 计算指标
            precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
            recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            support = sum(oot_predictions['true_label'] == label)

            # 存储指标
            metrics[label]['precision'].append(precision)
            metrics[label]['recall'].append(recall)
            metrics[label]['f1'].append(f1)
            metrics[label]['support'].append(support)

            print(f"\n{label}类别:")
            print(f"Precision: {precision:.3f}")
            print(f"Recall: {recall:.3f}")
            print(f"F1-score: {f1:.3f}")
            print(f"Support: {support}")

        # 创建混淆矩阵
        true_labels = oot_predictions['true_label'].values
        pred_labels = np.array(predictions)
        unique_labels = ['TOP', 'BOTTOM', 'ING']

        conf_matrix = np.zeros((len(unique_labels), len(unique_labels)), dtype=int)
        label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}

        for true, pred in zip(true_labels, pred_labels):
            conf_matrix[label_to_idx[true]][label_to_idx[pred]] += 1

        print("\n混淆矩阵:")
        matrix_df = pd.DataFrame(
            conf_matrix,
            index=unique_labels,
            columns=unique_labels
        )
        matrix_df['Total'] = matrix_df.sum(axis=1)
        total_row = matrix_df.sum()
        matrix_df.loc['Total'] = total_row
        print(matrix_df)

    # 使用plotly创建性能曲线图
    fig = make_subplots(
        rows=3, cols=1,
        subplot_titles=('TOP类别性能指标', 'BOTTOM类别性能指标', 'ING类别性能指标'),
        vertical_spacing=0.1
    )

    # 颜色设置
    colors = {'precision': '#1f77b4', 'recall': '#ff7f0e', 'f1': '#2ca02c'}

    # 为每个类别添加曲线
    for idx, label in enumerate(['TOP', 'BOTTOM', 'ING'], 1):
        # 添加Precision曲线
        fig.add_trace(
            go.Scatter(
                x=threshold_range,
                y=metrics[label]['precision'],
                name=f'{label} Precision',
                line=dict(color=colors['precision']),
                showlegend=True if idx==1 else False
            ),
            row=idx, col=1
        )

        # 添加Recall曲线
        fig.add_trace(
            go.Scatter(
                x=threshold_range,
                y=metrics[label]['recall'],
                name=f'{label} Recall',
                line=dict(color=colors['recall']),
                showlegend=True if idx==1 else False
            ),
            row=idx, col=1
        )

        # 添加F1-score曲线
        fig.add_trace(
            go.Scatter(
                x=threshold_range,
                y=metrics[label]['f1'],
                name=f'{label} F1-score',
                line=dict(color=colors['f1']),
                showlegend=True if idx==1 else False
            ),
            row=idx, col=1
        )

    # 更新布局
    fig.update_layout(
        height=900,  # 增加图形高度
        title_text="不同阈值下的模型性能",
        showlegend=True,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        )
    )

    # 更新x轴和y轴标签
    fig.update_xaxes(title_text="阈值", row=3, col=1)
    for i in range(1, 4):
        fig.update_yaxes(title_text="指标值", row=i, col=1)
        fig.update_xaxes(gridcolor='lightgrey', row=i, col=1)
        fig.update_yaxes(gridcolor='lightgrey', row=i, col=1)

    # 显示图形
    fig.show()

    # 返回最佳阈值
    best_thresholds = {}
    for label in ['TOP', 'BOTTOM', 'ING']:
        best_idx = np.argmax(metrics[label]['f1'])
        best_thresholds[label] = {
            'threshold': threshold_range[best_idx],
            'precision': metrics[label]['precision'][best_idx],
            'recall': metrics[label]['recall'][best_idx],
            'f1': metrics[label]['f1'][best_idx]
        }

    print("\n=== 最佳阈值 ===")
    for label, metrics_dict in best_thresholds.items():
        print(f"\n{label}类别:")
        print(f"最佳阈值: {metrics_dict['threshold']:.2f}")
        print(f"对应的Precision: {metrics_dict['precision']:.3f}")
        print(f"对应的Recall: {metrics_dict['recall']:.3f}")
        print(f"对应的F1-score: {metrics_dict['f1']:.3f}")

    return best_thresholds

# 使用示例
if results is not None:
    # 分析不同阈值下的性能
    threshold_range = np.arange(0.1, 0.9, 0.1)  # 从0.1到0.8，步长0.1
    best_thresholds = analyze_threshold_performance(
        results['oot_predictions'],
        threshold_range
    )


=== 不同阈值下的性能分析 ===

阈值 = 0.1

TOP类别:
Precision: 0.041
Recall: 0.100
F1-score: 0.058
Support: 160

BOTTOM类别:
Precision: 0.037
Recall: 0.134
F1-score: 0.058
Support: 149

ING类别:
Precision: 0.000
Recall: 0.000
F1-score: 0.000
Support: 621

混淆矩阵:
        TOP  BOTTOM  ING  Total
TOP      16     141    3    160
BOTTOM  129      20    0    149
ING     244     377    0    621
Total   389     538    3    930

阈值 = 0.2

TOP类别:
Precision: 0.029
Recall: 0.050
F1-score: 0.037
Support: 160

BOTTOM类别:
Precision: 0.063
Recall: 0.268
F1-score: 0.102
Support: 149

ING类别:
Precision: 0.500
Recall: 0.013
F1-score: 0.025
Support: 621

混淆矩阵:
        TOP  BOTTOM  ING  Total
TOP       8     144    8    160
BOTTOM  109      40    0    149
ING     160     453    8    621
Total   277     637   16    930

阈值 = 0.3

TOP类别:
Precision: 0.019
Recall: 0.025
F1-score: 0.022
Support: 160

BOTTOM类别:
Precision: 0.075
Recall: 0.349
F1-score: 0.124
Support: 149

ING类别:
Precision: 0.514
Recall: 0.029
F1-score: 0.055
Support:


=== 最佳阈值 ===

TOP类别:
最佳阈值: 0.10
对应的Precision: 0.041
对应的Recall: 0.100
对应的F1-score: 0.058

BOTTOM类别:
最佳阈值: 0.50
对应的Precision: 0.135
对应的Recall: 0.638
对应的F1-score: 0.223

ING类别:
最佳阈值: 0.80
对应的Precision: 0.568
对应的Recall: 0.488
对应的F1-score: 0.525
