In [1]:
import pandas as pd
import numpy as np
import os
# Print the full path of every file under the Kaggle input directory so the
# available datasets are visible in the notebook output.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

class FastXGBPredictor:
    """XGBoost pipeline: rank features, tune regularization, fit, size positions.

    The attributes are filled in as the pipeline steps are called:

    * ``feature_groups`` -- set by :meth:`create_feature_groups_by_importance`
    * ``best_params`` -- set by :meth:`find_best_regularization_combinations`
    * ``model`` -- set by :meth:`train_final_model`
    """

    # Identifier / target columns that must never be fed to the model.
    NON_FEATURE_COLS = ('date_id', 'forward_returns', 'risk_free_rate',
                        'market_forward_excess_returns', 'is_scored')

    def __init__(self):
        self.model = None
        self.feature_groups = {}
        self.best_params = None

    def load_cleaned_data(self, file_path):
        """Read an already-cleaned CSV file and return it as a DataFrame.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            The loaded ``pandas.DataFrame``.
        """
        print("Loading cleaned data...")
        df = pd.read_csv(file_path)
        print(f"Data shape: {df.shape}")
        return df

    @classmethod
    def _select_feature_columns(cls, train_df, target_col):
        """Return the model-input columns of ``train_df`` in their original order."""
        # The target is excluded explicitly so that a non-default ``target_col``
        # can never leak into the feature matrix.
        excluded = set(cls.NON_FEATURE_COLS) | {target_col}
        return [col for col in train_df.columns if col not in excluded]

    def create_feature_groups_by_importance(self, train_df, target_col='forward_returns', top_pct=0.1, mid_pct=0.5):
        """Split the features into three groups ranked by XGBoost importance.

        A small XGBoost regressor is fitted on all features (missing values
        filled with 0) and its ``feature_importances_`` are used for ranking.

        Args:
            train_df: Training data containing the features and ``target_col``.
            target_col: Name of the regression target column.
            top_pct: Fraction of features that go into the top group.
            mid_pct: Cumulative fraction covered by the top and middle groups.

        Returns:
            Dict with keys ``'top_10pct'``, ``'mid_10_50pct'`` and
            ``'low_50pct'`` mapping to lists of feature names. The same dict
            is stored in ``self.feature_groups``.
        """
        print("Creating feature groups by importance...")

        feature_cols = self._select_feature_columns(train_df, target_col)

        X = train_df[feature_cols].fillna(0)
        y = train_df[target_col]

        # Fit a quick model only to obtain the importance ranking.
        initial_model = xgb.XGBRegressor(
            n_estimators=100,
            max_depth=4,
            learning_rate=0.1,
            random_state=42
        )
        initial_model.fit(X, y)

        importance_df = pd.DataFrame({
            'feature': feature_cols,
            'importance': initial_model.feature_importances_
        }).sort_values('importance', ascending=False)

        n_features = len(importance_df)
        n_top = int(n_features * top_pct)
        # Clamp at zero: a negative size (mid_pct < top_pct) would make the
        # slices below overlap with the top group.
        n_mid = max(int(n_features * mid_pct) - n_top, 0)

        ranked = importance_df['feature'].tolist()
        top_features = ranked[:n_top]
        mid_features = ranked[n_top:n_top + n_mid]
        low_features = ranked[n_top + n_mid:]

        self.feature_groups = {
            'top_10pct': top_features,
            'mid_10_50pct': mid_features,
            'low_50pct': low_features
        }

        print(f"Top {len(top_features)} features (前10%)")
        print(f"Mid {len(mid_features)} features (前10-50%)")
        print(f"Low {len(low_features)} features (剩余50%)")

        return self.feature_groups

    def tune_regularization_params(self, X_train, y_train, feature_group_name, features, max_combinations=10):
        """Pick the best regularization setting for one feature group.

        Candidates are drawn from a 5 x 5 x 3 grid over ``reg_alpha``,
        ``reg_lambda`` and ``max_depth`` and scored by mean MSE over a
        3-fold ``TimeSeriesSplit``.

        Args:
            X_train: DataFrame that contains at least the columns in ``features``.
            y_train: Target Series aligned row-by-row with ``X_train``.
            feature_group_name: Label used in progress messages.
            features: Column names that make up this group.
            max_combinations: Maximum number of grid points to evaluate
                (kept small for speed).

        Returns:
            Dict with ``reg_alpha``, ``reg_lambda``, ``max_depth`` and the
            achieved ``score`` (mean CV MSE), or ``None`` when the group is
            empty or every candidate failed.
        """
        print(f"\nTuning {feature_group_name} with {len(features)} features...")

        if len(features) == 0:
            # Nothing to fit on; avoid ten guaranteed failures below.
            print(f"Skipping {feature_group_name}: no features in this group")
            return None

        X_subset = X_train[features].fillna(0)

        # Full regularization grid (L1 x L2 x tree depth).
        param_grid = [
            {'reg_alpha': reg_alpha, 'reg_lambda': reg_lambda, 'max_depth': max_depth}
            for reg_alpha in [0, 0.1, 0.5, 1, 2]
            for reg_lambda in [0.1, 0.5, 1, 2, 5]
            for max_depth in [3, 4, 5]
        ]

        # Evaluate an evenly spaced subset of the grid. Taking simply the first
        # N entries would only ever test reg_alpha=0 with small reg_lambda, so
        # the L1 term would never actually be tuned.
        n_candidates = max(1, min(int(max_combinations), len(param_grid)))
        picked = np.unique(
            np.linspace(0, len(param_grid) - 1, n_candidates).round().astype(int)
        )
        candidates = [param_grid[i] for i in picked]

        tscv = TimeSeriesSplit(n_splits=3)
        best_score = float('inf')
        best_params = None

        for params in candidates:
            try:
                model = xgb.XGBRegressor(
                    n_estimators=100,
                    learning_rate=0.1,
                    reg_alpha=params['reg_alpha'],
                    reg_lambda=params['reg_lambda'],
                    max_depth=params['max_depth'],
                    random_state=42,
                    n_jobs=-1
                )

                cv_scores = []
                for train_idx, val_idx in tscv.split(X_subset):
                    X_tr, X_val = X_subset.iloc[train_idx], X_subset.iloc[val_idx]
                    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

                    model.fit(X_tr, y_tr)
                    preds = model.predict(X_val)
                    cv_scores.append(mean_squared_error(y_val, preds))

                avg_score = np.mean(cv_scores)
            except Exception as e:
                # Best effort: a failing candidate is skipped, but reported so
                # that systematic failures do not go unnoticed.
                print(f"Skipping {params} for {feature_group_name}: {e}")
                continue

            if avg_score < best_score:
                best_score = avg_score
                # Build a new dict so the grid entry itself is not mutated.
                best_params = {**params, 'score': avg_score}

        return best_params

    def find_best_regularization_combinations(self, train_df, target_col='forward_returns'):
        """Tune every feature group and keep the three best-scoring results.

        Args:
            train_df: Training data containing the features and ``target_col``.
            target_col: Name of the regression target column.

        Returns:
            List of at most three dicts (tuned parameters plus ``score`` and
            ``feature_group``), sorted by ascending CV MSE. Also stored in
            ``self.best_params``.
        """
        print("Finding best regularization combinations...")

        y_train = train_df[target_col]

        all_combinations = []
        for group_name, features in self.feature_groups.items():
            best_params = self.tune_regularization_params(train_df, y_train, group_name, features)
            if best_params:
                print(f"{group_name}: {best_params}")
                all_combinations.append({**best_params, 'feature_group': group_name})

        # Lower MSE is better.
        all_combinations.sort(key=lambda combo: combo['score'])
        best_combinations = all_combinations[:3]

        print("\nBest 3 regularization combinations:")
        for i, combo in enumerate(best_combinations, 1):
            print(f"{i}. {combo}")

        self.best_params = best_combinations
        return best_combinations

    def train_final_model(self, train_df, test_df, target_col='forward_returns'):
        """Fit the final model with the best tuned setting and predict on test data.

        Args:
            train_df: Training data containing the features and ``target_col``.
            test_df: Data to predict on; must contain the selected features.
            target_col: Name of the regression target column.

        Returns:
            Array of predictions for ``test_df``.

        Raises:
            RuntimeError: If no tuned parameters are available yet.
            KeyError: If ``test_df`` lacks some of the selected feature columns.
        """
        print("\nTraining final model with best parameters...")

        if not self.best_params:
            raise RuntimeError(
                "No tuned parameters available; run "
                "find_best_regularization_combinations() first"
            )

        # The first entry is the combination with the lowest CV error.
        params = self.best_params[0]
        best_group = params['feature_group']
        features = self.feature_groups[best_group]

        print(f"Using feature group: {best_group} with {len(features)} features")
        print(f"Parameters: {params}")

        # Fail before the (expensive) fit if the test set cannot be scored.
        missing = [col for col in features if col not in test_df.columns]
        if missing:
            raise KeyError(f"test_df is missing feature columns: {missing}")

        X_train = train_df[features].fillna(0)
        y_train = train_df[target_col]
        X_test = test_df[features].fillna(0)

        self.model = xgb.XGBRegressor(
            n_estimators=200,
            learning_rate=0.1,
            reg_alpha=params['reg_alpha'],
            reg_lambda=params['reg_lambda'],
            max_depth=params['max_depth'],
            random_state=42,
            n_jobs=-1
        )

        self.model.fit(X_train, y_train)

        train_pred = self.model.predict(X_train)
        test_pred = self.model.predict(X_test)

        # In-sample error only; this is not a validation score.
        train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
        print(f"Training RMSE: {train_rmse:.6f}")

        return test_pred

    def calculate_positions(self, predictions, method='adaptive_sigmoid'):
        """Map predictions to position weights in the range [0, 2].

        Args:
            predictions: Array-like of predicted values.
            method: ``'adaptive_sigmoid'`` squashes the centred predictions
                through a sigmoid whose slope adapts to their standard
                deviation; ``'volatility_adjusted'`` uses ``1 + 0.5 * z`` on
                the z-scores; any other value uses ``5 * prediction + 1``.
                All results are clipped to [0, 2].

        Returns:
            Float array of position weights with the same shape as
            ``predictions``.
        """
        # Accept lists / Series as well as arrays.
        predictions = np.asarray(predictions, dtype=float)

        if predictions.size == 0:
            # min()/max() are undefined on empty input; nothing to size.
            print("Position stats - no predictions")
            return predictions

        if method == 'adaptive_sigmoid':
            pred_mean = np.mean(predictions)
            pred_std = np.std(predictions)

            # Slope adapts to the spread; fixed fallback when all values are equal.
            if pred_std > 0:
                scale_factor = 1 / (pred_std * 2)
            else:
                scale_factor = 10

            # Sigmoid in (0, 1), re-centred and stretched to (0, 2).
            positions = 1 + (1 / (1 + np.exp(-(predictions - pred_mean) * scale_factor)) - 0.5) * 2

        elif method == 'volatility_adjusted':
            pred_vol = np.std(predictions)
            if pred_vol > 0:
                z_scores = (predictions - np.mean(predictions)) / pred_vol
                positions = np.clip(1 + z_scores * 0.5, 0, 2)
            else:
                # No dispersion: neutral weight everywhere.
                positions = np.ones_like(predictions)

        else:
            # Simple linear scaling around the neutral weight of 1.
            positions = np.clip(predictions * 5 + 1, 0, 2)

        # Enforce the [0, 2] bound regardless of the method used.
        positions = np.clip(positions, 0, 2)

        print(f"Position stats - Min: {positions.min():.3f}, Max: {positions.max():.3f}, Mean: {positions.mean():.3f}")
        return positions

def main():
    """Run the whole pipeline and write ``submission.parquet``.

    Steps: load the data, group features by importance, tune regularization
    per group, fit the final model, convert predictions to position weights
    and save them.

    Returns:
        Tuple ``(predictor, submission)`` with the fitted ``FastXGBPredictor``
        and the submission DataFrame (columns ``date_id`` and ``weight``).
    """
    predictor = FastXGBPredictor()

    # Load data.
    train_df = predictor.load_cleaned_data('/kaggle/input/cleaned/train_cleaned.csv')
    # Assumes the test file exposes the same feature columns as the cleaned
    # training file.
    test_df = predictor.load_cleaned_data('/kaggle/input/hull-tactical-market-prediction/test.csv')

    # Each step stores its result on the predictor, so the return values are
    # not needed here.
    predictor.create_feature_groups_by_importance(train_df)
    predictor.find_best_regularization_combinations(train_df)

    # Fit the final model and predict.
    test_predictions = predictor.train_final_model(train_df, test_df)

    # Turn predictions into position weights.
    positions = predictor.calculate_positions(test_predictions, method='adaptive_sigmoid')

    submission = pd.DataFrame({
        'date_id': test_df['date_id'],
        'weight': positions
    })

    # Write real Parquet: to_csv() would put CSV text into a file with a
    # .parquet extension, which Parquet readers cannot open.
    submission.to_parquet('submission.parquet', index=False)
    print(f"\nSubmission saved with {len(submission)} predictions")

    # Show the distribution of the position weights.
    print("\nPosition distribution:")
    print(submission['weight'].describe())

    return predictor, submission



if __name__ == "__main__":
    # Full version: run the whole pipeline and keep both results at module level.
    predictor, submission = main()
    


/kaggle/input/cleaned/train_cleaned.csv
/kaggle/input/hull-tactical-market-prediction/train.csv
/kaggle/input/hull-tactical-market-prediction/test.csv
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_inference_server.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/genera