In [None]:
!pip install gplearn

Collecting gplearn
  Downloading gplearn-0.4.2-py3-none-any.whl.metadata (4.3 kB)
Downloading gplearn-0.4.2-py3-none-any.whl (25 kB)
Installing collected packages: gplearn
Successfully installed gplearn-0.4.2


In [None]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

from gplearn.genetic import SymbolicTransformer, SymbolicClassifier, SymbolicRegressor
from gplearn.functions import make_function
from gplearn.fitness import make_fitness
from google.colab import drive

import hashlib

In [None]:
# Mount Google Drive; the dataset read below lives under /content/drive/MyDrive.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Read the panel from Drive and keep only rows whose 'date' lies in
# 2004-01-01 .. 2024-12-31 (both ends inclusive).
stock_data = pd.read_csv("/content/drive/MyDrive/final_data.csv")
in_sample_window = stock_data['date'].between('2004-01-01', '2024-12-31')
stock_data = stock_data[in_sample_window]

In [None]:
# Column-name groups referenced by the rest of the notebook.
id_col = ['PERMNO', 'CUSIP', 'Ticker', 'SICCD', 'NAICS']
time_col = ['MthCalDt']
factor_col = [
    'mom_12', 'mom_6', 'vol_12', 'vol_6', 'rev_1',
    'rvol_1', 'beta', 'rsi_6', 'trend_strength',
]
market_col = [
    'qmj_safety', 'seas_11_15na', 'ret_3_1', 'iskew_ff3_21d', 'rskew_21d',
    'sti_gr1a', 'earnings_variability', 'nfna_gr1a', 'seas_16_20an', 'corr_1260d',
]
fin_col = [
    'capxy', 'chechy', 'cshfdy', 'cshpry', 'dltry',
    'dpcy', 'epspxy', 'oibdpy', 'txty',
]

In [None]:
# Every model input: factor columns, then market columns, then financial-statement columns.
all_features = [*factor_col, *market_col, *fin_col]

In [None]:

# Disabled earlier regression-based generator, kept for reference as ONE string literal.
# The nested triple-quote pairs that used to sit inside this block closed the string early
# and made the cell fail with an IndentationError; they have been removed. The trailing
# semicolon stops the notebook from echoing the string as the cell's output.
"""
def safe_div(x1, x2):
    return np.where(np.abs(x2) < 1e-8, 0, x1 / x2)

def abs_func(x):
    return np.abs(x)

def sign_func(x):
    return np.sign(x)

def max_func(x1, x2):
    return np.maximum(x1, x2)

def min_func(x1, x2):
    return np.minimum(x1, x2)


safe_division = make_function(function=safe_div, name='safe_div', arity=2)
absolute = make_function(function=abs_func, name='abs', arity=1)
sign = make_function(function=sign_func, name='sign', arity=1)
maximum = make_function(function=max_func, name='max', arity=2)
minimum = make_function(function=min_func, name='min', arity=2)


function_set = ['add', 'sub', 'mul', safe_division, 'sqrt', 'log', 'abs',
                'neg', 'inv', maximum, minimum, sign]

class PanelGeneticFactorGenerator:
    def __init__(self, population_size=500, generations=10,
                 tournament_size=20, stopping_criteria=0.01,
                 const_range=(-1., 1.), init_depth=(2, 6),
                 init_method='half and half', function_set=None,
                 parsimony_coefficient=0.01, random_state=42):

        self.population_size = population_size
        self.generations = generations
        self.tournament_size = tournament_size
        self.stopping_criteria = stopping_criteria
        self.const_range = const_range
        self.init_depth = init_depth
        self.init_method = init_method
        self.function_set = function_set or ['add', 'sub', 'mul', 'div']
        self.parsimony_coefficient = parsimony_coefficient
        self.random_state = random_state

        self.generated_factors = {}
        self.factor_expressions = {}

    def prepare_data(self, df, feature_cols, target_col):

        X = df[feature_cols].copy()
        X = X.fillna(X.median())
        X = X.replace([np.inf, -np.inf], np.nan).fillna(X.median())

        y = df[target_col].fillna(df[target_col].median())
        y = y.replace([np.inf, -np.inf], np.nan).fillna(y.median())

        return X, y

    def generate_supervised_factors(self, df, feature_cols, target_col, n_factors=10):

        print(f"Start generating {n_factors} supervision factors")

        X, y = self.prepare_data(df, feature_cols, target_col)

        print(f"Data Shape: X={X.shape}, y={y.shape}")

        for i in range(n_factors):
            print(f"生成第 {i+1}/{n_factors} 个监督因子...")

            try:
                regressor = SymbolicRegressor(
                    population_size=self.population_size,
                    generations=self.generations,
                    tournament_size=self.tournament_size,
                    stopping_criteria=self.stopping_criteria,
                    const_range=self.const_range,
                    init_depth=self.init_depth,
                    init_method=self.init_method,
                    function_set=self.function_set,
                    parsimony_coefficient=self.parsimony_coefficient,
                    random_state=self.random_state + i,  # 不同的随机种子
                    verbose=0,
                    n_jobs=1  # 避免并行计算问题
                )

                regressor.fit(X, y)
                factor_values = regressor.predict(X)

                factor_name = f'genetic_factor_{i+1}'
                self.generated_factors[factor_name] = factor_values
                self.factor_expressions[factor_name] = str(regressor._program)

                print(f"{factor_name} generate successfully")

            except Exception as e:
                print(f"{i+1} failed {e}")
                continue

        print(f"{len(self.generated_factors)} factors generate successfully")
        return self.generated_factors

    def get_factor_dataframe(self, original_df):

        if not self.generated_factors:
            print("No factor was generated")
            return None

        factor_df = pd.DataFrame(self.generated_factors, index=original_df.index)

        for col in id_col + time_col:
            if col in original_df.columns:
                factor_df[col] = original_df[col]

        return factor_df

    def print_factor_expressions(self):

        print("\n=== factor expressions ===")
        for factor_name, expression in self.factor_expressions.items():
            print(f"{factor_name}: {expression}")

    def evaluate_factors(self, df, target_col):

        if not self.generated_factors:
            print("No factor was generated")
            return

        print("\n=== factor evaluate ===")
        target = df[target_col]

        for factor_name, factor_values in self.generated_factors.items():

            corr = np.corrcoef(factor_values, target)[0, 1]

            factor_returns = pd.Series(factor_values)
            ic_mean = corr
            ic_std = np.std(factor_returns)
            ir = ic_mean / ic_std if ic_std != 0 else 0

            print(f"{factor_name}:")
            print(f"  correlation: {corr:.4f}")
            print(f"  IC: {ir:.4f}")
            print(f"  length of expression: {len(self.factor_expressions[factor_name])}")
            print()

def run_genetic_factor_generation(df, target_col, n_factors=10):

    print("Start generation")

    if target_col is None:
        raise ValueError("Must be a target_col")

    if isinstance(target_col, list):
        target_col = target_col[0]

    print(f"Shape of data: {df.shape}")
    print(f"Target: '{target_col}'")


    generator = PanelGeneticFactorGenerator(
        population_size=500,
        generations=10,
        tournament_size=20,
        random_state=42
    )

    try:
        generated_factors = generator.generate_supervised_factors(
            df, all_features, target_col, n_factors=n_factors
        )
    except Exception as e:
        print(f"Falied: {e}")
        return None, None

    factor_df = generator.get_factor_dataframe(df)
    generator.print_factor_expressions()

    print("Factor evaluate...")
    try:
        generator.evaluate_factors(df, target_col)
    except Exception as e:
        print(f"Evaluation failed: {e}")

    print(f"{len(generator.generated_factors)} new factor")

    return generator, factor_df
""";

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 120)

In [None]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``.

    The input is clipped to [-500, 500] before exponentiation so ``np.exp``
    cannot overflow.
    """
    bounded = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-bounded))

def tanh_func(x):
    """Element-wise hyperbolic tangent; a thin wrapper around ``np.tanh``."""
    squashed = np.tanh(x)
    return squashed

def safe_log(x):
    """Natural log of ``|x| + 1e-8``.

    Taking the absolute value and adding the small offset keeps the argument
    strictly positive, so zero and negative inputs do not yield -inf or NaN.
    """
    magnitude = np.abs(x) + 1e-8
    return np.log(magnitude)

def safe_div(x1, x2):
    """Protected division: ``x1 / x2``, or 0 where ``|x2| < 1e-8``.

    ``np.where`` evaluates the quotient for every element, including those
    whose denominator is (near) zero, so the division runs inside
    ``np.errstate`` to keep those discarded elements from emitting
    divide-by-zero / invalid-value RuntimeWarnings. ``np.divide`` is used so
    plain Python scalars are accepted as well as arrays.

    Returns:
        numpy array (0-d for scalar inputs) of the protected quotient.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(np.abs(x2) < 1e-8, 0, np.divide(x1, x2))

def safe_sqrt(x):
    """Square root of ``|x|``, so negative inputs do not produce NaN."""
    magnitude = np.abs(x)
    return np.sqrt(magnitude)

def safe_exp(x):
    """Exponential of ``x`` after clipping ``x`` to [-10, 10], which bounds the result."""
    bounded = np.clip(x, -10, 10)
    return np.exp(bounded)

# Wrap the helpers above as gplearn function objects so they can be listed in a function_set.
sigmoid_func = make_function(name='sigmoid', function=sigmoid, arity=1)
tanh_function = make_function(name='tanh', function=tanh_func, arity=1)
safe_log_func = make_function(name='safe_log', function=safe_log, arity=1)
safe_div_func = make_function(name='safe_div', function=safe_div, arity=2)
safe_sqrt_func = make_function(name='safe_sqrt', function=safe_sqrt, arity=1)
safe_exp_func = make_function(name='safe_exp', function=safe_exp, arity=1)

# Built-in gplearn operators (given by name) followed by the custom wrappers.
classification_functions = [
    'add',
    'sub',
    'mul',
    safe_div_func,
    sigmoid_func,
    tanh_function,
    safe_sqrt_func,
    safe_exp_func,
]

class ClassificationGeneticFactorGenerator:
    """Evolve symbolic-regression factors on a training slice, then score every row.

    Programs are fitted with gplearn's ``SymbolicRegressor`` on the first
    ``train_ratio`` share of the rows (a positional split) and afterwards
    evaluated on the whole frame. The target is treated as ``'binary'`` when it
    has exactly 2 distinct values, ``'multiclass'`` when it has 3-10, and
    ``'regression'`` otherwise.
    """

    def __init__(self, population_size=300, generations=10,
                 tournament_size=20, stopping_criteria=0.01,
                 const_range=(-2., 2.), init_depth=(2, 5),
                 random_state=42, train_ratio=0.7):
        """Store the SymbolicRegressor hyper-parameters and create empty result stores."""
        self.population_size = population_size
        self.generations = generations
        self.tournament_size = tournament_size
        self.stopping_criteria = stopping_criteria
        self.const_range = const_range
        self.init_depth = init_depth
        self.random_state = random_state
        self.train_ratio = train_ratio

        self.generated_factors = {}   # factor name -> values predicted on the full frame
        self.factor_expressions = {}  # factor name -> str() of the fitted program
        self.target_info = {}         # result of the most recent analyze_target call
        self.trained_models = {}      # factor name -> fitted SymbolicRegressor
        self.label_encoder = None     # fitted lazily on the first binary/multiclass target

    def split_data(self, df):
        """Return ``(train_df, full_df)``: a copy of the first ``train_ratio`` rows and a copy of all rows.

        The split is positional (``iloc``), so it follows whatever row order ``df`` has.
        """
        n_train = int(len(df) * self.train_ratio)
        train_df = df.iloc[:n_train].copy()
        full_df = df.copy()

        print(f"Training set size: {len(train_df)} rows ({self.train_ratio*100:.0f}%)")
        print(f"Full dataset size: {len(full_df)} rows (100%)")

        return train_df, full_df

    def analyze_target(self, y):
        """Summarise the target: distinct values, their count, binary/multiclass flags, value counts."""
        unique_vals = np.unique(y)
        n_unique = len(unique_vals)

        return {
            'unique_values': unique_vals,
            'n_unique': n_unique,
            'is_binary': n_unique == 2,
            'is_multiclass': 2 < n_unique <= 10,
            'distribution': pd.Series(y).value_counts().to_dict()
        }

    def prepare_classification_target(self, y):
        """Encode the target and report the task type.

        Returns:
            ``(y_encoded, task_type)``. For ``'binary'`` / ``'multiclass'`` the
            target is label-encoded to floats; the encoder is fitted on the
            first such call and only applied (``transform``) on later calls.
            For ``'regression'`` the target is just cast to float.
        """
        target_info = self.analyze_target(y)
        self.target_info = target_info

        if target_info['is_binary']:
            task_type = 'binary'
        elif target_info['is_multiclass']:
            task_type = 'multiclass'
        else:
            return y.astype(float), 'regression'

        if self.label_encoder is None:
            self.label_encoder = LabelEncoder()
            y_encoded = self.label_encoder.fit_transform(y)
        else:
            y_encoded = self.label_encoder.transform(y)
        return y_encoded.astype(float), task_type

    def prepare_data(self, df, feature_cols, target_col):
        """Return ``(X, y)`` with missing and infinite feature values imputed.

        ±inf is converted to NaN *before* the column medians are computed, so
        the medians are taken over finite values only and every NaN/inf cell
        receives that finite median. (Computing the median first lets infs skew
        it, or even makes the median itself inf, leaving infs in ``X``.)
        Missing target values are filled with the target's mode.
        """
        X = df[feature_cols].replace([np.inf, -np.inf], np.nan)
        X = X.fillna(X.median())

        y = df[target_col].copy()
        modes = y.mode()
        y = y.fillna(modes[0] if not modes.empty else y.iloc[0])

        return X, y

    def apply_factor_to_data(self, X, model, factor_name, task_type):
        """Predict factor values for ``X``; binary factors are passed through ``sigmoid``.

        Best-effort: on any prediction error the error is printed and an
        all-zero array of length ``len(X)`` is returned.
        """
        try:
            factor_values = model.predict(X)
            if 'binary' in factor_name or task_type == 'binary':
                factor_values = sigmoid(factor_values)
            return factor_values
        except Exception as e:
            print(f"Error applying factor {factor_name}: {e}")
            return np.zeros(len(X))

    def _new_regressor(self, function_set, random_state, parsimony_coefficient=0.01,
                       population_size=None, generations=None, init_depth=None):
        """Build a SymbolicRegressor from the instance settings; ``None`` overrides fall back to them."""
        return SymbolicRegressor(
            population_size=self.population_size if population_size is None else population_size,
            generations=self.generations if generations is None else generations,
            tournament_size=self.tournament_size,
            stopping_criteria=self.stopping_criteria,
            const_range=self.const_range,
            init_depth=self.init_depth if init_depth is None else init_depth,
            function_set=function_set,
            parsimony_coefficient=parsimony_coefficient,
            random_state=random_state,
            verbose=0,
            n_jobs=1
        )

    def _register_model(self, factor_name, regressor):
        """Remember a fitted regressor and the text form of its program under ``factor_name``."""
        self.trained_models[factor_name] = regressor
        self.factor_expressions[factor_name] = str(regressor._program)

    @staticmethod
    def _auc_or_correlation(y_true, scores):
        """Return ``('AUC', auc)``, or ``('correlation', r)`` when the AUC cannot be computed."""
        try:
            return 'AUC', roc_auc_score(y_true, scores)
        except Exception:
            return 'correlation', np.corrcoef(scores, y_true)[0, 1]

    def generate_binary_classification_factors(self, X_train, y_train, n_factors=5):
        """Fit ``n_factors`` programs against a 0/1 target; each uses a different random seed."""
        print(f"Generating {n_factors} binary classification factors...")

        for i in range(n_factors):
            try:
                regressor = self._new_regressor(
                    function_set=['add', 'sub', 'mul', safe_div_func, sigmoid_func],
                    random_state=self.random_state + i,
                )
                regressor.fit(X_train, y_train)

                factor_name = f'genetic_binary_{i+1}'
                self._register_model(factor_name, regressor)

                factor_values = sigmoid(regressor.predict(X_train))
                metric, value = self._auc_or_correlation(y_train, factor_values)
                print(f"  {factor_name} training {metric}: {value:.4f}")

            except Exception as e:
                print(f"  Binary factor {i+1} training failed: {e}")
                continue

    def generate_multiclass_factors(self, X_train, y_train, n_factors=5):
        """Fit one-vs-rest programs: ``max(1, n_factors // n_classes)`` per distinct class."""
        print(f"Generating {n_factors} multiclass factors...")

        unique_classes = np.unique(y_train)
        n_classes = len(unique_classes)
        factors_per_class = max(1, n_factors // n_classes)

        for class_idx, target_class in enumerate(unique_classes):
            # One-vs-rest indicator for the current class.
            y_binary = (y_train == target_class).astype(float)

            for i in range(factors_per_class):
                try:
                    regressor = self._new_regressor(
                        function_set=['add', 'sub', 'mul', safe_div_func, sigmoid_func],
                        random_state=self.random_state + class_idx * 100 + i,
                        parsimony_coefficient=0.02,
                        population_size=max(200, self.population_size // 2),
                        generations=max(10, self.generations // 2),
                        init_depth=(1, 4),
                    )
                    regressor.fit(X_train, y_binary)

                    factor_name = f'genetic_class_{target_class}_{i+1}'
                    self._register_model(factor_name, regressor)

                    factor_values = sigmoid(regressor.predict(X_train))
                    metric, value = self._auc_or_correlation(y_binary, factor_values)
                    print(f"  {factor_name} training {metric}: {value:.4f}")

                except Exception as e:
                    print(f"  Class {target_class} factor {i+1} training failed: {e}")
                    continue

    def generate_ordinal_factors(self, X_train, y_train, n_factors=5):
        """Fit ``n_factors`` programs directly against the (encoded or numeric) target."""
        print(f"Generating {n_factors} ordinal factors...")

        for i in range(n_factors):
            try:
                regressor = self._new_regressor(
                    function_set=['add', 'sub', 'mul', safe_div_func, tanh_function],
                    random_state=self.random_state + i + 1000,
                )
                regressor.fit(X_train, y_train)

                factor_name = f'genetic_ordinal_{i+1}'
                self._register_model(factor_name, regressor)

                factor_values = regressor.predict(X_train)

                # Rescale predictions onto the target's range before reporting the correlation.
                y_min, y_max = y_train.min(), y_train.max()
                factor_min, factor_max = factor_values.min(), factor_values.max()
                if factor_max > factor_min:
                    factor_values = (factor_values - factor_min) / (factor_max - factor_min) * (y_max - y_min) + y_min

                corr = np.corrcoef(factor_values, y_train)[0, 1]
                print(f"  {factor_name} training correlation: {corr:.4f}")

            except Exception as e:
                print(f"  Ordinal factor {i+1} training failed: {e}")
                continue

    def train_factors(self, train_df, feature_cols, target_col, n_factors=10):
        """Fit factors on ``train_df`` and return the detected task type.

        Binary targets get binary factors; multiclass targets get a half/half
        mix of one-vs-rest and ordinal factors; anything else gets ordinal factors.
        """
        print("=" * 60)
        print("Step 1: Training factors on training set")
        print("=" * 60)

        X_train, y_train = self.prepare_data(train_df, feature_cols, target_col)
        y_processed, task_type = self.prepare_classification_target(y_train)

        if task_type == 'binary':
            self.generate_binary_classification_factors(X_train, y_processed, n_factors)
        elif task_type == 'multiclass':
            multiclass_factors = max(1, n_factors // 2)
            ordinal_factors = n_factors - multiclass_factors
            self.generate_multiclass_factors(X_train, y_processed, multiclass_factors)
            self.generate_ordinal_factors(X_train, y_processed, ordinal_factors)
        else:
            self.generate_ordinal_factors(X_train, y_processed, n_factors)

        print(f"\nTraining completed! Successfully trained {len(self.trained_models)} factors")
        return task_type

    def apply_factors_to_full_data(self, full_df, feature_cols, target_col, task_type):
        """Evaluate every trained model on ``full_df``, store the values and print a score for each."""
        print("Step 2: Applying factors to full dataset")

        X_full, y_full = self.prepare_data(full_df, feature_cols, target_col)
        y_processed, _ = self.prepare_classification_target(y_full)

        for factor_name, model in self.trained_models.items():
            try:
                factor_values = self.apply_factor_to_data(X_full, model, factor_name, task_type)
                self.generated_factors[factor_name] = factor_values

                if task_type == 'binary' and 'binary' in factor_name:
                    metric, value = self._auc_or_correlation(y_processed, factor_values)
                else:
                    metric, value = 'correlation', np.corrcoef(factor_values, y_processed)[0, 1]
                print(f"  {factor_name} full set {metric}: {value:.4f}")

            except Exception as e:
                print(f"  Failed to apply factor {factor_name} to full set: {e}")
                continue

        print(f"\nFactor application completed! Successfully generated {len(self.generated_factors)} factors on full dataset")

    def generate_classification_factors(self, df, feature_cols, target_col, n_factors=10):
        """Split ``df``, train on the training slice, apply to all rows; return the factor dict."""
        train_df, full_df = self.split_data(df)

        task_type = self.train_factors(train_df, feature_cols, target_col, n_factors)

        self.apply_factors_to_full_data(full_df, feature_cols, target_col, task_type)

        return self.generated_factors

    def get_factor_dataframe(self, original_df, id_col=None, time_col=None):
        """Return the generated factors as a DataFrame indexed like ``original_df``.

        ``id_col`` / ``time_col`` (a name or a list of names) are copied over
        from ``original_df`` when present there. Returns ``None`` when no
        factors have been generated.
        """
        if not self.generated_factors:
            return None

        factor_df = pd.DataFrame(self.generated_factors, index=original_df.index)

        for cols in (id_col, time_col):
            if not cols:
                continue
            for col in cols if isinstance(cols, list) else [cols]:
                if col in original_df.columns:
                    factor_df[col] = original_df[col]

        return factor_df

    def print_factor_expressions(self):
        """Print the program text of every trained factor."""
        if not self.factor_expressions:
            print("No factor expressions generated")
            return

        print("Factor expressions:")

        for factor_name, expression in self.factor_expressions.items():
            print(f"{factor_name}: {expression}")

    def evaluate_factors_on_splits(self, df, feature_cols, target_col):
        """Print each factor's score on the training slice next to its score on the full frame.

        Binary factors are scored with AUC when both AUCs can be computed;
        otherwise (and for all other factors) Pearson correlation is used.
        """
        if not self.generated_factors:
            print("No factors generated for evaluation")
            return

        print("\n" + "=" * 60)
        print("Factor performance evaluation (training set vs full set)")
        print("=" * 60)

        train_df, full_df = self.split_data(df)

        X_train, y_train = self.prepare_data(train_df, feature_cols, target_col)
        y_train_processed, task_type = self.prepare_classification_target(y_train)

        X_full, y_full = self.prepare_data(full_df, feature_cols, target_col)
        y_full_processed, _ = self.prepare_classification_target(y_full)

        for factor_name, model in self.trained_models.items():
            try:
                train_factor_values = self.apply_factor_to_data(X_train, model, factor_name, task_type)
                full_factor_values = self.generated_factors[factor_name]

                metric, scores = 'correlation', None
                if task_type == 'binary' and 'binary' in factor_name:
                    try:
                        scores = (
                            roc_auc_score(y_train_processed, train_factor_values),
                            roc_auc_score(y_full_processed, full_factor_values),
                        )
                        metric = 'AUC'
                    except Exception:
                        scores = None  # fall back to correlation for both splits
                if scores is None:
                    scores = (
                        np.corrcoef(train_factor_values, y_train_processed)[0, 1],
                        np.corrcoef(full_factor_values, y_full_processed)[0, 1],
                    )

                train_score, full_score = scores
                print(f"{factor_name}:")
                print(f"  Training set {metric}: {train_score:.4f}")
                print(f"  Full set {metric}: {full_score:.4f}")
                print(f"  Performance difference: {abs(full_score - train_score):.4f}")

            except Exception as e:
                print(f"{factor_name}: Evaluation failed - {e}")

def run_classification_genetic_generation(df, feature_cols, target_col, n_factors=10, train_ratio=0.7):
    """Run the classification factor pipeline end to end.

    Builds a ``ClassificationGeneticFactorGenerator`` with fixed search
    settings, generates ``n_factors`` factors for ``target_col`` from
    ``feature_cols``, prints the evolved expressions and the train-vs-full
    evaluation.

    Returns:
        ``(generator, factor_df)`` on success, ``(None, None)`` when generation
        raises or yields no factors.
    """
    settings = dict(
        population_size=300,
        generations=8,
        tournament_size=15,
        random_state=42,
        train_ratio=train_ratio,
    )
    generator = ClassificationGeneticFactorGenerator(**settings)
    failure = (None, None)

    try:
        print(f"Starting to generate {n_factors} classification factors...")
        print(f"Using first {train_ratio*100:.0f}% of data for training, then applying to full dataset")

        generated_factors = generator.generate_classification_factors(
            df, feature_cols, target_col, n_factors=n_factors
        )
    except Exception as e:
        print(f"Generation process failed: {e}")
        return failure

    # An empty result means every factor failed to train or apply.
    if not generated_factors:
        print("Factor generation failed")
        return failure

    factor_df = generator.get_factor_dataframe(df)
    generator.print_factor_expressions()
    generator.evaluate_factors_on_splits(df, feature_cols, target_col)

    return generator, factor_df

In [None]:
# Generate 10 factors for 'pred_ret'; the feature set is all_features plus 'MthRet'.
generator, new_factors = run_classification_genetic_generation(
    df=stock_data[[*all_features, "pred_ret", "MthRet"]],
    feature_cols=[*all_features, "MthRet"],
    target_col='pred_ret',
    n_factors=10,
)


开始生成 10 个分类因子...
使用前 70% 数据进行训练，然后应用到全数据集
训练集大小: 1065615 行 (70%)
全数据集大小: 1522308 行 (100%)
第一步: 在训练集上训练因子
生成 10 个有序因子...
  genetic_ordinal_1 训练集相关性: 0.0040
  genetic_ordinal_2 训练集相关性: nan
  genetic_ordinal_3 训练集相关性: 0.0040
  genetic_ordinal_4 训练集相关性: 0.0040
  genetic_ordinal_5 训练集相关性: 0.0040
  genetic_ordinal_6 训练集相关性: 0.0040
  genetic_ordinal_7 训练集相关性: 0.0040
  genetic_ordinal_8 训练集相关性: 0.0040
  genetic_ordinal_9 训练集相关性: nan
  genetic_ordinal_10 训练集相关性: 0.0024

训练完成! 成功训练了 10 个因子
第二步: 将因子应用到全数据集
  genetic_ordinal_1 全集相关性: 0.0014
  genetic_ordinal_2 全集相关性: nan
  genetic_ordinal_3 全集相关性: 0.0014
  genetic_ordinal_4 全集相关性: 0.0014
  genetic_ordinal_5 全集相关性: 0.0014
  genetic_ordinal_6 全集相关性: 0.0014
  genetic_ordinal_7 全集相关性: 0.0014
  genetic_ordinal_8 全集相关性: 0.0014
  genetic_ordinal_9 全集相关性: nan
  genetic_ordinal_10 全集相关性: 0.0001

因子应用完成! 成功生成了 10 个因子到全数据集
因子表达式:
genetic_ordinal_1: tanh(tanh(X25))
genetic_ordinal_2: sub(X28, X28)
genetic_ordinal_3: tanh(tanh(X25))
genetic_ordinal_4: tanh(tan

In [None]:
# Remove factor columns that hold at most one distinct (non-NaN) value.
constant_cols = new_factors.columns[new_factors.nunique() <= 1].tolist()
new_factors.drop(columns=constant_cols, inplace=True)

In [None]:
# Keep the first occurrence of every distinct factor column, then z-score the survivors.
#
# A candidate is kept only after it has been compared against ALL columns kept so far.
# Appending inside the comparison loop (while iterating the same list) could add a column
# twice or add it even though it matches a later kept column. Series.equals also treats
# NaNs in the same positions as equal, so a column containing NaN still matches itself.
uni_col = []
for i in new_factors.columns:
    is_duplicate = any(new_factors[i].equals(new_factors[j]) for j in uni_col)
    if not is_duplicate:
        uni_col.append(i)

# Standardise each retained factor to zero mean and unit (sample) standard deviation.
gp_factor = new_factors[uni_col].apply(lambda x: (x - x.mean()) / x.std())

In [None]:
class FastRandomForestFactorGenerator:
    """Build derived "factor" columns from one random forest fitted on a leading slice of the data.

    Workflow (see ``generate_all_rf_factors``): the first ``train_ratio`` share of
    rows is used to fit a single forest, a set of factor definitions is registered
    in ``factor_info``, and every definition is then evaluated on the complete
    frame and stored in ``generated_factors``.
    """

    def __init__(self, n_estimators=200, random_state=42, n_jobs=-1, train_ratio=0.7):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.train_ratio = train_ratio

        # factor name -> float64 array of factor values on the full dataset
        self.generated_factors = {}
        # factor name -> dict describing how the factor is computed
        self.factor_info = {}
        # 'main' (fitted forest), 'task_type', 'main_importance'
        self.rf_models = {}
        # fitted in prepare_data() when the target is treated as categorical
        self.label_encoder = None

        self._X_train_processed = None
        self._y_train_processed = None
        # 'classification' or 'regression', decided in prepare_data(is_training=True)
        self._task_type = None

    def split_data(self, df):
        """Return ``(train_df, full_df)``: the first ``train_ratio`` share of rows and a full copy.

        The split is positional (``iloc``).
        NOTE(review): this is only an out-of-time split if ``df`` is sorted
        chronologically - confirm at the call site.
        """
        n_train = int(len(df) * self.train_ratio)
        train_df = df.iloc[:n_train].copy()
        full_df = df.copy()

        print(f"Training set size: {len(train_df)} rows ({self.train_ratio*100:.0f}%)")
        print(f"Full dataset size: {len(full_df)} rows (100%)")

        return train_df, full_df

    @staticmethod
    def _fill_numeric_target(y):
        """Median-fill NaN in a numeric target, then replace +/-inf with that series' median."""
        y = y.fillna(y.median())
        return y.replace([np.inf, -np.inf], np.nan).fillna(y.median())

    def _encode_labels(self, y):
        """Encode ``y`` with the fitted label encoder; unseen labels map to the first known class."""
        try:
            return self.label_encoder.transform(y)
        except ValueError:
            known_classes = set(self.label_encoder.classes_)
            fallback = self.label_encoder.classes_[0]
            y = y.apply(lambda x: x if x in known_classes else fallback)
            return self.label_encoder.transform(y)

    def prepare_data(self, df, feature_cols, target_col=None, is_training=True):
        """Impute features and (optionally) prepare the target.

        Numeric feature columns have NaN and +/-inf replaced by column medians
        computed on ``df`` itself (so the full-data pass uses full-data medians).

        With ``is_training=True`` the task type is decided here: an object-dtype
        target or one with at most 10 distinct values is label-encoded and
        treated as classification, anything else as regression. The processed
        training inputs are cached on the instance.

        Returns ``X`` when ``target_col`` is None, otherwise ``(X, y)``.
        """
        print("Processing data...")
        X = df[feature_cols].copy()

        numeric_cols = X.select_dtypes(include=[np.number]).columns
        X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())
        X[numeric_cols] = X[numeric_cols].replace([np.inf, -np.inf], np.nan)
        X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

        if target_col is None:
            return X

        y = df[target_col].copy()

        if is_training:
            # The dtype check comes first so np.unique is never run on object data.
            if y.dtype == 'object' or len(np.unique(y)) <= 10:
                self.label_encoder = LabelEncoder()
                y = self.label_encoder.fit_transform(y)
                self._task_type = 'classification'
            else:
                y = self._fill_numeric_target(y)
                self._task_type = 'regression'

            self._X_train_processed = X
            self._y_train_processed = y
        elif self._task_type == 'classification' and self.label_encoder is not None:
            y = self._encode_labels(y)
        else:
            y = self._fill_numeric_target(y)

        return X, y

    def _train_main_model(self, X_train, y_train):
        """Fit the single forest (classifier or regressor per task type) and store it under 'main'."""
        print(f"Training main model for {self._task_type}...")

        if self._task_type == 'classification':
            forest_cls = RandomForestClassifier
        else:
            forest_cls = RandomForestRegressor

        rf = forest_cls(
            n_estimators=self.n_estimators,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            max_depth=8,
            min_samples_split=50
        )

        rf.fit(X_train, y_train)
        self.rf_models['main'] = rf
        self.rf_models['task_type'] = self._task_type

        print(f"Main model trained ({self._task_type})")
        return rf

    def _get_or_train_main_model(self, X_train, y_train):
        """Return the stored main forest, fitting it first if it does not exist yet."""
        rf = self.rf_models.get('main')
        if rf is None:
            rf = self._train_main_model(X_train, y_train)
        return rf

    def generate_tree_prediction_factors(self, X_train, y_train, n_factors=10):
        """Register one factor per tree (first ``n_factors`` trees): that tree's own prediction."""
        print(f"Generating {n_factors} tree prediction factors...")

        rf = self._get_or_train_main_model(X_train, y_train)
        n_factors = min(n_factors, len(rf.estimators_))

        for i in range(n_factors):
            self.factor_info[f'rf_tree_pred_{i+1}'] = {
                'type': 'tree_prediction',
                'tree_index': i,
                'task_type': self._task_type
            }

        print(f"Configured {n_factors} tree prediction factors")

    def generate_leaf_index_factors(self, X_train, y_train, n_factors=8):
        """Register one factor per tree (first ``n_factors`` trees): the id of the leaf a row lands in."""
        print(f"Generating {n_factors} leaf index factors...")

        rf = self._get_or_train_main_model(X_train, y_train)
        # Only the tree count is needed here, so read it from the forest instead
        # of running rf.apply() over the whole training set.
        n_factors = min(n_factors, len(rf.estimators_))

        for i in range(n_factors):
            self.factor_info[f'rf_leaf_idx_{i+1}'] = {
                'type': 'leaf_index',
                'tree_index': i
            }

        print(f"Configured {n_factors} leaf index factors")

    def generate_feature_importance_factors(self, X_train, y_train):
        """Register three importance-weighted feature sums ('main', 'squared', 'log' weightings)."""
        print("Generating feature importance factors...")

        rf = self._get_or_train_main_model(X_train, y_train)

        main_importance = rf.feature_importances_
        self.rf_models['main_importance'] = main_importance

        # Names of the five most important features, most important first.
        top_features = list(X_train.columns[np.argsort(main_importance)[-5:][::-1]])
        for config in ['main', 'squared', 'log']:
            self.factor_info[f'rf_importance_{config}'] = {
                'type': 'feature_importance',
                'config': config,
                'top_features': list(top_features)
            }

        print("Configured 3 feature importance factors")

    def generate_bootstrap_factors(self, X_train, y_train, n_factors=6):
        """Register ``n_factors`` 'bootstrap' factors, each with its own seed (``random_state + i``)."""
        print(f"Generating {n_factors} bootstrap factors...")

        # Called for its side effect: guarantees the main model exists.
        self._get_or_train_main_model(X_train, y_train)

        for i in range(n_factors):
            self.factor_info[f'rf_bootstrap_{i+1}'] = {
                'type': 'bootstrap',
                'bootstrap_seed': self.random_state + i,
                'sample_ratio': 0.7
            }

        print(f"Configured {n_factors} bootstrap factors")

    def generate_tree_depth_factors(self, X_train, y_train, n_factors=4):
        """Register ``n_factors`` 'tree_depth' factors.

        Despite the name, the applied value is the min-max scaled leaf id of the
        given tree (see ``apply_factors_to_full_data``), not a depth.
        """
        print(f"Generating {n_factors} tree depth factors...")

        rf = self._get_or_train_main_model(X_train, y_train)
        # As for the leaf-index factors, only the tree count is needed.
        n_factors = min(n_factors, len(rf.estimators_))

        for i in range(n_factors):
            self.factor_info[f'rf_depth_{i+1}'] = {
                'type': 'tree_depth',
                'tree_index': i
            }

        print(f"Configured {n_factors} tree depth factors")

    def train_factors(self, train_df, feature_cols, target_col):
        """Fit the forest on ``train_df`` and register every factor definition; returns the task type."""
        print("=" * 60)
        print("Step 1: Training random forest factors on training set")
        print("=" * 60)

        X_train, y_train = self.prepare_data(train_df, feature_cols, target_col, is_training=True)

        self._train_main_model(X_train, y_train)

        self.generate_tree_prediction_factors(X_train, y_train, n_factors=10)
        self.generate_leaf_index_factors(X_train, y_train, n_factors=8)
        self.generate_feature_importance_factors(X_train, y_train)
        self.generate_bootstrap_factors(X_train, y_train, n_factors=6)
        self.generate_tree_depth_factors(X_train, y_train, n_factors=4)

        print(f"\nTraining completed! Successfully configured {len(self.factor_info)} factors")
        return self._task_type

    @staticmethod
    def _select_score_column(probabilities):
        """Pick column 1 of a probability matrix when it has several columns, else column 0."""
        if probabilities.shape[1] > 1:
            return probabilities[:, 1]
        return probabilities[:, 0]

    def _tree_scores(self, rf, tree_idx, X, task_type):
        """Score ``X`` with a single tree of the forest (probability column or raw prediction)."""
        tree = rf.estimators_[tree_idx]
        if task_type == 'classification':
            return self._select_score_column(tree.predict_proba(X))
        return tree.predict(X)

    def _ensemble_scores(self, rf, X, task_type):
        """Score ``X`` with the whole forest (probability column or raw prediction)."""
        if task_type == 'classification':
            return self._select_score_column(rf.predict_proba(X))
        return rf.predict(X)

    def _importance_weights(self, config):
        """Turn the stored feature importances into the weight vector for ``config``."""
        main_importance = self.rf_models['main_importance']

        if config == 'main':
            return main_importance
        if config == 'squared':
            squared = main_importance ** 2
            return squared / squared.sum()
        if config == 'log':
            logged = np.log(main_importance + 1e-8)
            return (logged - logged.min()) / (logged.max() - logged.min() + 1e-8)
        raise ValueError(f"Unknown importance config: {config}")

    @staticmethod
    def _bootstrap_factor(factor_name, info, base_predictions):
        """Ensemble prediction with the rows *not* drawn by a seeded resample scaled down.

        Rows missed by the resample are multiplied by ``0.95 + k * 0.01`` where
        ``k`` is the zero-based number parsed from the factor name. Re-predicting
        the drawn rows (as earlier code did) returns the values already present
        in ``base_predictions``, so that step is omitted.
        """
        n_rows = len(base_predictions)
        # A local RandomState draws the same stream as np.random.seed(seed) followed
        # by np.random.choice(...), without disturbing the global generator.
        rng = np.random.RandomState(info['bootstrap_seed'])
        sample_size = int(info['sample_ratio'] * n_rows)
        sample_idx = rng.choice(n_rows, size=sample_size, replace=True)

        # astype() copies, so the shared base prediction is never modified.
        factor_values = base_predictions.astype(np.float64)

        unsampled_mask = np.ones(n_rows, dtype=bool)
        unsampled_mask[sample_idx] = False
        bootstrap_idx = int(factor_name.split('_')[-1]) - 1
        factor_values[unsampled_mask] *= (0.95 + bootstrap_idx * 0.01)
        return factor_values

    def apply_factors_to_full_data(self, full_df, feature_cols, target_col, task_type):
        """Evaluate every registered factor on ``full_df`` and store it in ``generated_factors``.

        A factor that fails is reported and skipped; the others are still
        produced. After each factor a quick diagnostic is printed (AUC for
        classification tree predictions, Pearson correlation otherwise).
        """
        print("\n" + "=" * 60)
        print("Step 2: Applying factors to full dataset")
        print("=" * 60)

        X_full, y_full = self.prepare_data(full_df, feature_cols, target_col, is_training=False)
        rf = self.rf_models['main']

        # Forest-wide results are identical for every factor, so compute each at
        # most once (lazily) instead of once per factor.
        shared = {}

        def leaf_matrix():
            if 'leaves' not in shared:
                shared['leaves'] = rf.apply(X_full)
            return shared['leaves']

        def ensemble_prediction():
            if 'scores' not in shared:
                shared['scores'] = self._ensemble_scores(rf, X_full, task_type)
            return shared['scores']

        for factor_name, info in self.factor_info.items():
            try:
                factor_type = info['type']

                if factor_type == 'tree_prediction':
                    factor_values = self._tree_scores(rf, info['tree_index'], X_full, task_type)

                elif factor_type == 'leaf_index':
                    factor_values = leaf_matrix()[:, info['tree_index']].astype(float)

                elif factor_type == 'feature_importance':
                    weighted_importance = self._importance_weights(info['config'])
                    factor_values = np.dot(X_full, weighted_importance)

                elif factor_type == 'bootstrap':
                    factor_values = self._bootstrap_factor(factor_name, info, ensemble_prediction())

                elif factor_type == 'tree_depth':
                    depths = leaf_matrix()[:, info['tree_index']].astype(float)
                    factor_values = (depths - depths.min()) / (depths.max() - depths.min() + 1e-8)

                else:
                    # Without this, the previous iteration's values would be stored
                    # under this factor's name.
                    raise ValueError(f"Unknown factor type: {factor_type}")

                factor_values = factor_values.astype(np.float64)
                self.generated_factors[factor_name] = factor_values

                try:
                    if task_type == 'classification' and 'tree_pred' in factor_name:
                        auc = roc_auc_score(y_full, factor_values)
                        print(f"  {factor_name} full set AUC: {auc:.4f}")
                    else:
                        corr = np.corrcoef(factor_values, y_full)[0, 1]
                        if not np.isnan(corr):
                            print(f"  {factor_name} full set correlation: {corr:.4f}")
                except Exception:
                    # Diagnostics are best effort (e.g. AUC is undefined for some targets).
                    print(f"  {factor_name} generated")

            except Exception as e:
                print(f"  Factor {factor_name} application failed: {e}")
                continue

        print(f"\nFactor application completed! Successfully generated {len(self.generated_factors)} factors on full dataset")

    def generate_all_rf_factors(self, df, feature_cols, target_col):
        """Run the whole pipeline (split, train, apply) and return the factor dict."""
        print("=" * 60)
        print("Starting FAST Random Forest Factor Generation")
        print("=" * 60)

        train_df, full_df = self.split_data(df)

        task_type = self.train_factors(train_df, feature_cols, target_col)

        self.apply_factors_to_full_data(full_df, feature_cols, target_col, task_type)

        print("=" * 60)
        print(f"Generated {len(self.generated_factors)} factors successfully!")
        print(f"Models trained: 1 (vs ~3-5 in slow version)")
        print("=" * 60)

        return self.generated_factors

    def get_factor_dataframe(self, original_df):
        """Return the generated factors as a DataFrame indexed like ``original_df`` (None if empty)."""
        if not self.generated_factors:
            print("No factors were generated")
            return None

        factor_df = pd.DataFrame(self.generated_factors, index=original_df.index)
        return factor_df

    def get_factor_summary(self):
        """Return one row of descriptive statistics per generated factor (None if empty)."""
        if not self.generated_factors:
            return None

        summary_data = []
        for factor_name, factor_values in self.generated_factors.items():
            info = self.factor_info.get(factor_name, {})
            summary_data.append({
                'factor_name': factor_name,
                'type': info.get('type', 'unknown'),
                'mean': np.mean(factor_values),
                'std': np.std(factor_values),
                'min': np.min(factor_values),
                'max': np.max(factor_values),
                'unique_values': len(np.unique(factor_values)),
                'info': str(info)
            })

        summary_df = pd.DataFrame(summary_data)
        return summary_df

    @staticmethod
    def _print_split_correlation(factor_name, train_values, y_train, full_values, y_full):
        """Print train vs. full-set Pearson correlation unless either one is NaN."""
        train_corr = np.corrcoef(train_values, y_train)[0, 1]
        full_corr = np.corrcoef(full_values, y_full)[0, 1]
        if not (np.isnan(train_corr) or np.isnan(full_corr)):
            print(f"{factor_name}:")
            print(f"  Training set correlation: {train_corr:.4f}")
            print(f"  Full set correlation: {full_corr:.4f}")
            print(f"  Performance difference: {abs(full_corr - train_corr):.4f}")

    def evaluate_factors_on_splits(self, df, feature_cols, target_col):
        """Print training-slice vs. full-set performance for every generated factor.

        Training-slice values are recomputed only for tree-prediction and
        leaf-index factors; the remaining types report the full-set correlation.
        Note that this re-runs ``prepare_data(..., is_training=True)`` on the
        training slice, which refreshes the cached training inputs.
        """
        if not self.generated_factors:
            print("No factors generated for evaluation")
            return

        print("\n" + "=" * 60)
        print("Factor performance evaluation (training set vs full set)")
        print("=" * 60)

        train_df, full_df = self.split_data(df)

        X_train, y_train = self.prepare_data(train_df, feature_cols, target_col, is_training=True)

        X_full, y_full = self.prepare_data(full_df, feature_cols, target_col, is_training=False)

        rf = self.rf_models['main']
        task_type = self._task_type

        # Leaf ids on the training slice, computed on first use and then reused.
        train_leaves = None

        for factor_name in self.generated_factors.keys():
            try:
                info = self.factor_info[factor_name]
                factor_type = info['type']

                if factor_type == 'tree_prediction':
                    train_factor_values = self._tree_scores(rf, info['tree_index'], X_train, task_type)

                elif factor_type == 'leaf_index':
                    if train_leaves is None:
                        train_leaves = rf.apply(X_train)
                    train_factor_values = train_leaves[:, info['tree_index']].astype(float)

                else:
                    train_factor_values = None

                full_factor_values = self.generated_factors[factor_name]

                if train_factor_values is not None:
                    if task_type == 'classification' and 'tree_pred' in factor_name:
                        try:
                            train_auc = roc_auc_score(y_train, train_factor_values)
                            full_auc = roc_auc_score(y_full, full_factor_values)
                            print(f"{factor_name}:")
                            print(f"  Training set AUC: {train_auc:.4f}")
                            print(f"  Full set AUC: {full_auc:.4f}")
                            print(f"  Performance difference: {abs(full_auc - train_auc):.4f}")
                        except Exception:
                            # AUC unavailable: fall back to correlation.
                            self._print_split_correlation(
                                factor_name, train_factor_values, y_train, full_factor_values, y_full
                            )
                    else:
                        self._print_split_correlation(
                            factor_name, train_factor_values, y_train, full_factor_values, y_full
                        )
                else:
                    full_corr = np.corrcoef(full_factor_values, y_full)[0, 1]
                    if not np.isnan(full_corr):
                        print(f"{factor_name}: Full set correlation: {full_corr:.4f}")

            except Exception as e:
                print(f"{factor_name}: Evaluation failed - {e}")

    def evaluate_factors(self, df, target_col):
        """Rank the generated factors by absolute Pearson correlation with the target.

        An object-dtype target is label-encoded with the fitted encoder (taking a
        median of string labels raises ``TypeError``); any other target has NaN
        filled with its median. A correlation that is NaN or cannot be computed
        is recorded as 0.

        Returns a DataFrame sorted by ``abs_correlation`` (descending), or None
        when no factors exist.
        """
        if not self.generated_factors:
            return None

        target = df[target_col]
        if self.label_encoder is not None and target.dtype == 'object':
            target = self._encode_labels(target)
        else:
            target = target.fillna(target.median())

        evaluation_results = []

        for factor_name, factor_values in self.generated_factors.items():
            try:
                corr = np.corrcoef(factor_values, target)[0, 1]
                if np.isnan(corr):
                    corr = 0
            except Exception:
                corr = 0

            factor_std = np.std(factor_values)
            unique_ratio = len(np.unique(factor_values)) / len(factor_values)

            evaluation_results.append({
                'factor_name': factor_name,
                'type': self.factor_info.get(factor_name, {}).get('type', 'unknown'),
                'correlation': corr,
                'abs_correlation': abs(corr),
                'std': factor_std,
                'unique_ratio': unique_ratio
            })

        eval_df = pd.DataFrame(evaluation_results)
        eval_df = eval_df.sort_values('abs_correlation', ascending=False)

        print("\nTop 10 factors by correlation:")
        print(eval_df.head(10)[['factor_name', 'type', 'correlation', 'unique_ratio']])

        return eval_df

def run_fast_rf_factor_generation(df, target_col, feature_cols=None, n_jobs=-1, train_ratio=0.7):
    """Generate, summarise and evaluate random-forest factors for ``df``.

    Args:
        df: Frame holding the feature columns and ``target_col``.
        target_col: Name of the target column.
        feature_cols: Feature column names; defaults to every column of ``df``
            other than ``target_col``.
        n_jobs: Forwarded to the generator.
        train_ratio: Leading share of rows used for training.

    Returns:
        ``(generator, factor_df, summary, evaluation_results)``, or four ``None``
        values when generation raises or yields no factors.
    """
    failure = (None, None, None, None)

    if feature_cols is None:
        feature_cols = [name for name in df.columns if name != target_col]

    print(f"Target column: {target_col}")
    print(f"Found {len(feature_cols)} feature columns")
    print(f"Data shape: {df.shape}")
    print(f"Train ratio: {train_ratio*100:.0f}%")

    generator = FastRandomForestFactorGenerator(
        n_estimators=200,
        random_state=42,
        n_jobs=n_jobs,
        train_ratio=train_ratio
    )

    try:
        print(f"Starting random forest factor generation...")
        print(f"Using first {train_ratio*100:.0f}% of data for training, then applying to full dataset")
        generated_factors = generator.generate_all_rf_factors(df, feature_cols, target_col)
    except Exception as e:
        # Best effort: report the problem and hand back the "nothing produced" tuple.
        print(f"Generation process failed: {e}")
        return failure

    if not generated_factors:
        print("Factor generation failed")
        return failure

    factor_df = generator.get_factor_dataframe(df)

    summary = generator.get_factor_summary()
    if summary is not None:
        print("\nFactor types summary:")
        print(summary.groupby('type').size())

    generator.evaluate_factors_on_splits(df, feature_cols, target_col)
    evaluation_results = generator.evaluate_factors(df, target_col)

    return generator, factor_df, summary, evaluation_results

In [None]:
# Random-forest factors for the `pred_cat` target. `feature_cols` is left at its
# default, so every column of the frame other than `pred_cat` (including `MthRet`)
# is used as a feature.
# NOTE(review): this rebinds `generator`, replacing the GP generator from the earlier cell.
generator, factor_df, summary, evaluation = run_fast_rf_factor_generation(
    stock_data[[*all_features, 'pred_cat', "MthRet"]],
    target_col='pred_cat',
)

🎯 Target column: pred_cat
📊 Found 29 feature columns
📈 Data shape: (1522308, 30)
🔄 Train ratio: 70%
开始生成随机森林因子...
使用前 70% 数据进行训练，然后应用到全数据集
🚀 Starting FAST Random Forest Factor Generation
训练集大小: 1065615 行 (70%)
全数据集大小: 1522308 行 (100%)
第一步: 在训练集上训练随机森林因子
📊 Processing data...
🎯 Training main model for classification...
✅ Main model trained (classification)
🌳 Generating 10 tree prediction factors...
✅ Configured 10 tree prediction factors
🍃 Generating 8 leaf index factors...
✅ Configured 8 leaf index factors
⚖️ Generating feature importance factors...
✅ Configured 3 feature importance factors
🎲 Generating 6 bootstrap factors...
✅ Configured 6 bootstrap factors
📏 Generating 4 tree depth factors...
✅ Configured 4 tree depth factors

训练完成! 成功配置了 31 个因子

第二步: 将因子应用到全数据集
📊 Processing data...
  rf_tree_pred_1 全集 AUC: 0.6714
  rf_tree_pred_2 全集 AUC: 0.6566
  rf_tree_pred_3 全集 AUC: 0.6573
  rf_tree_pred_4 全集 AUC: 0.6268
  rf_tree_pred_5 全集 AUC: 0.6780
  rf_tree_pred_6 全集 AUC: 0.6541
  rf_tree_pre

In [None]:
# Place the random-forest factors and the standardized GP factors side by side;
# rows are aligned on the index.
extra_factor = pd.concat([factor_df, gp_factor], axis="columns")

In [None]:
# Preview the combined factor table (pandas truncates the rendering of large frames).
extra_factor

Unnamed: 0,rf_tree_pred_1,rf_tree_pred_2,rf_tree_pred_3,rf_tree_pred_4,rf_tree_pred_5,rf_tree_pred_6,rf_tree_pred_7,rf_tree_pred_8,rf_tree_pred_9,rf_tree_pred_10,...,rf_depth_1,rf_depth_2,rf_depth_3,rf_depth_4,rf_residual_1,rf_residual_2,rf_residual_3,rf_residual_4,genetic_ordinal_1,genetic_ordinal_10
0,0.573348,1.411724,1.364777,-0.988676,0.905704,1.419675,0.989516,1.421544,-1.088521,0.877164,...,0.850267,0.397930,1.050530,-0.007817,0.662695,-0.884270,-0.877767,0.804800,7.779292,0.464701
1,0.573348,1.411724,1.364777,-0.988676,0.905704,1.419675,0.989516,1.421544,-1.088521,0.877164,...,0.850267,0.397930,1.050530,-0.007817,0.662695,-0.884270,-0.877767,0.804800,7.779292,0.125753
2,0.573348,0.346282,1.167933,-0.988676,0.905704,1.824160,0.989516,1.421544,-1.088521,0.877164,...,0.850267,0.373333,-1.253436,-0.007817,0.677820,-0.837223,-0.846042,0.814299,7.779292,-0.188610
3,0.573348,0.959177,1.167933,-0.988676,0.905704,1.824160,0.989516,1.421544,-1.088521,0.877164,...,0.850267,0.213455,-1.253436,-0.007817,0.677733,-0.837495,-0.846227,0.814244,7.779292,-0.394928
4,1.020689,0.831203,1.167933,0.990808,1.358280,0.885778,0.989516,1.421544,0.940734,1.608291,...,1.065179,1.332604,-1.253436,0.623461,0.667617,-0.868960,-0.867521,0.807903,-14.603992,0.018230
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1522303,-1.294664,-1.554286,-1.900849,-1.952891,-0.123315,0.131028,-1.921689,0.459393,-1.259734,-1.276827,...,-1.398046,-1.348435,-0.485447,-1.428192,-0.817636,-0.401882,-0.518514,-0.961912,-0.720313,-0.000249
1522304,-0.156288,-0.960481,-1.900849,-0.928778,-0.123315,0.131028,-1.502551,-0.719297,-0.150601,-1.562804,...,-0.637587,-0.795009,-0.485447,-0.439744,1.280044,1.035977,0.998884,1.131085,-0.674258,-0.001130
1522305,-0.935492,0.412662,-1.900849,-0.748495,0.299333,0.131028,0.520201,0.459393,-0.143008,-0.518277,...,0.800672,0.484019,-0.485447,0.274597,-1.059333,0.349910,0.191443,-1.090437,-0.644826,-0.000089
1522306,-2.259776,-1.561753,-1.900849,-1.687048,-0.123315,0.131028,-1.710091,-0.719297,-1.259734,-1.562804,...,-0.835968,-1.274645,-0.485447,-0.589257,-0.775701,-0.532319,-0.623081,-0.937758,-0.674483,-0.000625


In [None]:
# Persist the combined factors to Google Drive; the row index is written as the first CSV column.
extra_factor.to_csv(path_or_buf="/content/drive/MyDrive/extra_factor.csv")