# 🌟 SolarPV项目 - 全模型训练

在Project1140上训练所有模型，支持GPU加速


In [None]:
# 安装依赖包
!pip install torch torchvision xgboost lightgbm scikit-learn pandas numpy matplotlib seaborn pyyaml tqdm openpyxl xlsxwriter

# Check for a GPU
import torch

if not torch.cuda.is_available():
    print("⚠️ 未检测到GPU，将使用CPU训练")
else:
    # Only device index 0 is queried; total memory is reported in units of 1e9 bytes.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"🎮 GPU可用: {gpu_name}")
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"🎮 GPU内存: {gpu_total_bytes / 1e9:.1f} GB")


In [None]:
# 上传项目文件（如果还没有）
from google.colab import files
import zipfile
import os

# Upload and extract the project archive, then switch into the project directory.
PROJECT_DIR = 'SolarPV-Prediction'

# After a successful first run the working directory is already the project
# directory, so a re-run must neither prompt for another upload nor chdir again.
# (If a nested directory of the same name exists we still descend into it,
# which is what happened before this check was added.)
already_inside = (
    os.path.basename(os.getcwd()) == PROJECT_DIR
    and not os.path.isdir(PROJECT_DIR)
)

if not already_inside and not os.path.exists(PROJECT_DIR):
    print("📁 请上传项目zip文件")
    uploaded = files.upload()

    # Only .zip uploads are extracted; any other uploaded file is ignored.
    for filename in uploaded.keys():
        if filename.endswith('.zip'):
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall('.')
            print(f"✅ 解压完成: {filename}")

# Enter the project directory
if not already_inside:
    if not os.path.isdir(PROJECT_DIR):
        # Fail with an actionable message instead of a bare chdir error.
        raise FileNotFoundError(
            f"未找到项目目录 '{PROJECT_DIR}'，请确认上传的zip文件解压后包含该目录"
        )
    os.chdir(PROJECT_DIR)
print(f"📂 当前目录: {os.getcwd()}")


In [None]:
# 测试单个模型（验证环境）
import subprocess
import sys

print("🧪 测试环境...")

# Smoke-test the pipeline with a single LSTM config before launching the full run.
test_config = "config/projects/1140/LSTM_low_PV_24h_TE.yaml"

try:
    result = subprocess.run(
        [sys.executable, "main.py", "--config", test_config],
        capture_output=True,
        text=True,
        timeout=300,
    )
except subprocess.TimeoutExpired:
    # Report the timeout like the batch training loops do, rather than
    # surfacing a raw traceback in the notebook.
    print("⏰ 环境测试超时 (5分钟)")
else:
    print(f"返回码: {result.returncode}")
    if result.returncode == 0:
        print("✅ 环境测试成功!")
        print("输出:", result.stdout[-500:])  # show only the last 500 characters
    else:
        print("❌ 环境测试失败!")
        # Fall back to stdout when stderr is empty so the failure is not silent.
        print("错误:", result.stderr[-500:] or result.stdout[-500:])


In [None]:
# 运行所有模型训练（修复版）
import glob
import os
import subprocess
import sys
import time
from datetime import datetime

import yaml

def _find_configs_for_models(model_names, config_dir="config/projects/1140"):
    """Return the config files in *config_dir* whose ``model`` key is in *model_names*.

    Every ``*.yaml`` file except ``config_index.yaml`` is parsed with
    ``yaml.safe_load``. Files whose top-level content is not a mapping
    (for example an empty file, which loads as ``None``) are skipped instead
    of raising. Paths are returned in sorted order so runs are reproducible.
    """
    selected = []
    for config_file in sorted(glob.glob(os.path.join(config_dir, "*.yaml"))):
        if config_file.endswith("config_index.yaml"):
            continue
        with open(config_file, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
        if isinstance(config, dict) and config.get('model') in model_names:
            selected.append(config_file)
    return selected


def _run_experiments(config_files, timeout_seconds, finished_message):
    """Run ``main.py --config <file>`` once per config and report progress.

    Args:
        config_files: Config paths to train, in execution order.
        timeout_seconds: Per-experiment limit passed to ``subprocess.run``.
        finished_message: Line printed (after a blank line) once all runs end.

    Returns:
        A ``(successful, failed)`` tuple of counts. A run counts as failed on
        a non-zero return code, a timeout, or any exception while launching.
    """
    total_experiments = len(config_files)
    successful = 0
    failed = 0
    start_time = time.time()

    for i, config_file in enumerate(config_files, 1):
        print(f"\n{'='*60}")
        print(f"🔥 训练 {i}/{total_experiments}: {os.path.basename(config_file)}")
        print(f"⏰ 时间: {datetime.now().strftime('%H:%M:%S')}")
        print(f"{'='*60}")

        try:
            result = subprocess.run(
                [sys.executable, "main.py", "--config", config_file],
                capture_output=True,
                text=True,
                timeout=timeout_seconds,
            )
        except subprocess.TimeoutExpired:
            print(f"⏰ 超时 ({timeout_seconds // 60}分钟)")
            failed += 1
        except Exception as e:
            # Batch boundary: one broken launch must not abort the remaining runs.
            print(f"💥 异常: {str(e)}")
            failed += 1
        else:
            if result.returncode == 0:
                print("✅ 成功!")
                successful += 1
                # Show the first metrics line. Testing for "rmse=" alone is
                # enough, because any line containing it also contains "mse=".
                metrics_line = next(
                    (line for line in result.stdout.split('\n') if "rmse=" in line),
                    None,
                )
                if metrics_line is not None:
                    print(f"📊 {metrics_line.strip()}")
            else:
                print(f"❌ 失败! 返回码: {result.returncode}")
                print(f"错误: {result.stderr[-200:]}")
                failed += 1

        # Progress report; the ETA extrapolates the mean time per finished run.
        elapsed = time.time() - start_time
        progress = i / total_experiments * 100
        eta = elapsed / i * (total_experiments - i)

        print(f"\n📈 进度: {progress:.1f}% | 成功: {successful} | 失败: {failed}")
        print(f"⏱️ 已用: {elapsed/60:.1f}分钟 | 剩余: {eta/60:.1f}分钟")

    # Final statistics; guard the percentage against an empty config list.
    total_elapsed = time.time() - start_time
    success_rate = successful / total_experiments * 100 if total_experiments else 0.0
    print(f"\n{finished_message}")
    print(f"📊 成功: {successful}/{total_experiments} ({success_rate:.1f}%)")
    print(f"⏱️ 总用时: {total_elapsed/60:.1f} 分钟")

    return successful, failed


def run_ml_models_only():
    """Train every classical ML config (RF, XGB, LGBM, LSR) for project 1140.

    Each experiment runs in a subprocess with a 10-minute timeout.

    Returns:
        ``(successful, failed)`` counts; ``(0, 0)`` when no config matches.
    """
    print("🤖 运行机器学习模型...")

    ml_configs = _find_configs_for_models(('RF', 'XGB', 'LGBM', 'LSR'))
    print(f"📁 找到 {len(ml_configs)} 个机器学习模型配置")

    return _run_experiments(
        ml_configs,
        timeout_seconds=600,
        finished_message="🎉 机器学习模型训练完成!",
    )


def run_dl_models_with_fallback():
    """Train every deep-learning config (LSTM, GRU, Transformer, TCN) for project 1140.

    Each experiment runs in a subprocess with a 20-minute timeout.
    NOTE(review): despite the name, no fallback/degradation logic exists in
    this function; it differs from the ML runner only in the model list,
    the timeout, and the printed messages.

    Returns:
        ``(successful, failed)`` counts; ``(0, 0)`` when no config matches.
    """
    print("🧠 运行深度学习模型（带降级处理）...")

    dl_configs = _find_configs_for_models(('LSTM', 'GRU', 'Transformer', 'TCN'))
    print(f"📁 找到 {len(dl_configs)} 个深度学习模型配置")

    return _run_experiments(
        dl_configs,
        timeout_seconds=1200,
        finished_message="🎉 深度学习模型训练完成!",
    )

# Phase 1: classical machine-learning models
print("🤖 第一阶段：机器学习模型")
ml_success, ml_failed = run_ml_models_only()

# Phase 2: deep-learning models
print("\n🧠 第二阶段：深度学习模型")
dl_success, dl_failed = run_dl_models_with_fallback()

# Combined statistics across both phases
total_success = ml_success + dl_success
total_failed = ml_failed + dl_failed
total_experiments = total_success + total_failed

# Avoid dividing by zero when no experiment ran at all.
overall_rate = total_success / total_experiments * 100 if total_experiments else 0.0

print("\n🎉 所有训练完成!")
print("=" * 80)
print("📊 最终统计:")
print(f"  机器学习模型: {ml_success} 成功, {ml_failed} 失败")
print(f"  深度学习模型: {dl_success} 成功, {dl_failed} 失败")
print(f"  总计: {total_success}/{total_experiments} 成功 ({overall_rate:.1f}%)")

# Any deep-learning failure triggers the troubleshooting hints below.
if dl_failed > 0:
    print("\n⚠️ 深度学习模型失败较多，可能是PyTorch版本兼容性问题")
    print("建议：")
    print("  1. 检查PyTorch版本")
    print("  2. 尝试不同的PyTorch版本")
    print("  3. 使用CPU训练")
    print("  4. 检查内存使用")


In [None]:
# 查看结果
import os
import glob

print("📁 查看训练结果...")

# Inspect the results directory for project 1140
results_dir = "temp_results/1140"
if not os.path.exists(results_dir):
    print("❌ 未找到结果目录")
else:
    result_dirs = os.listdir(results_dir)
    total_found = len(result_dirs)
    print(f"✅ 找到 {total_found} 个结果目录")

    # Print at most the first ten entries, in sorted order, numbered from 1
    for position, dir_name in enumerate(sorted(result_dirs)[:10], start=1):
        print(f"  {position}. {dir_name}")

    # Summarise whatever was left out of the listing
    remaining = total_found - 10
    if remaining > 0:
        print(f"  ... 还有 {remaining} 个结果")


In [None]:
# 下载结果（可选）
from google.colab import files
import zipfile
import shutil

def download_results():
    """Zip the ``temp_results`` directory and hand the archive to Colab for download.

    The archive is written to the current working directory under a
    timestamped name, and entries are stored with paths relative to
    ``temp_results``. If ``temp_results`` does not exist, a message is
    printed and nothing else happens.

    Returns:
        None.
    """
    # Local imports keep this cell usable even when the cells that import
    # these names were not (re-)run in the current session.
    import os
    from datetime import datetime

    if not os.path.exists("temp_results"):
        print("❌ 未找到结果目录")
        return

    # Build the zip archive
    zip_filename = f"solarpv_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.zip"

    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # The loop variable must NOT be named ``files``. Binding that name
        # here would make it local to the whole function and shadow the
        # ``google.colab.files`` module, breaking ``files.download`` below.
        for root, _dirs, filenames in os.walk("temp_results"):
            for name in filenames:
                file_path = os.path.join(root, name)
                arcname = os.path.relpath(file_path, "temp_results")
                zipf.write(file_path, arcname)

    print(f"📦 结果已打包: {zip_filename}")
    files.download(zip_filename)
    print("✅ 下载完成!")

# 取消注释以下行来下载结果
# download_results()
