In [2]:
import os
import math
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
from scipy.special import logsumexp
from collections import OrderedDict
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from scipy.signal import savgol_filter
from scipy.ndimage import gaussian_filter1d

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset

import wandb

import warnings
warnings.filterwarnings('ignore')

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

SEED = 42
seed_everything(SEED)

In [4]:
def init_wandb(project_name="geology-forecast-challenge-sweep-gpu-bayes-30-server-HybridCNNLSTM", config=None):
    # Если run уже существует, просто возвращаем его
    if wandb.run is not None:
        return wandb.run
    
    try:
        wandb_api_key = os.environ['WANDB_API_KEY']
        
        wandb.login(key=wandb_api_key)
        
        run = wandb.init(
            project=project_name,
            config=config,
            tags=["LSTM", "Geology Forecast Challenge", "Feature Engineering"],
            reinit=True
        )
        
        print("W&B successfully initialized")
        return run
    
    except Exception as e:
        print(f"Error initializing W&B: {str(e)}")
        return None

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [6]:
train = pd.read_csv("data/train.csv").fillna(0)
test = pd.read_csv("data/test.csv").fillna(0)
sub = pd.read_csv('data/sample_submission.csv')

In [7]:
FEATURES = [c for c in test.columns if c != 'geology_id']
TARGETS = [c for c in sub.columns if c != 'geology_id']
solution = train[['geology_id'] + TARGETS].copy()
train_sub = train[['geology_id'] + TARGETS].copy()

In [8]:
def engineer_features(data, is_test=False):
    feature_data = pd.DataFrame({'geology_id': data['geology_id']})
    
    historical_cols = [col for col in data.columns if col != 'geology_id' and col.startswith('-') or col == '0']
    
    historical_cols.sort(key=lambda x: int(x) if x.isdigit() else int(x))
    
    historical_data = data[historical_cols].values
    
    # 1. Calculate local slopes (first derivative)
    slopes = np.zeros_like(historical_data)
    for i in range(1, historical_data.shape[1]):
        slopes[:, i] = historical_data[:, i] - historical_data[:, i-1]
    
    # 2. Calculate curvature (second derivative)
    curvature = np.zeros_like(historical_data)
    for i in range(1, slopes.shape[1]-1):
        curvature[:, i] = slopes[:, i+1] - slopes[:, i]
    
    # 3. Create rolling statistics for last N points
    window_sizes = [5, 10, 20]
    for window in window_sizes:

        if historical_data.shape[1] >= window:
            
            feature_data[f'mean_last_{window}'] = np.mean(historical_data[:, -window:], axis=1)
            feature_data[f'std_last_{window}'] = np.std(historical_data[:, -window:], axis=1)

            x = np.arange(window)
            
            for i in range(historical_data.shape[0]):
                
                y = historical_data[i, -window:]
                
                if np.all(y == 0):
                    feature_data.loc[i, f'trend_last_{window}'] = 0
                else:
                    slope = np.polyfit(x, y, 1)[0]
                    feature_data.loc[i, f'trend_last_{window}'] = slope
    
    # 4. Calculate smoothed versions of data (different levels of smoothing)
    smooth_windows = [3, 5, 9]
    for window in smooth_windows:
        # Savitzky-Golay filter requires window_length > polyorder
        if window <= 3: 
            continue
        
        if historical_data.shape[1] >= window:
            sg_window = window if window % 2 == 1 else window + 1
            
            for i in range(historical_data.shape[0]):
                data_slice = historical_data[i, -50:]
                if len(data_slice) >= sg_window:
                    try:
                        polyorder = 2 if sg_window <= 5 else 3
                        smoothed = savgol_filter(data_slice, sg_window, polyorder, mode='nearest')
                        feature_data.loc[i, f'sg_smooth_{window}'] = smoothed[-1]
                        
                        if len(smoothed) >= 3:
                            feature_data.loc[i, f'sg_smooth_slope_{window}'] = smoothed[-1] - smoothed[-2]
                    except Exception as e:
                        feature_data.loc[i, f'sg_smooth_{window}'] = data_slice[-1]
                        if len(data_slice) >= 3:
                            feature_data.loc[i, f'sg_smooth_slope_{window}'] = data_slice[-1] - data_slice[-2]
                else:
                    feature_data.loc[i, f'sg_smooth_{window}'] = historical_data[i, -1] if historical_data.shape[1] > 0 else 0
                    feature_data.loc[i, f'sg_smooth_slope_{window}'] = 0
    
    # 5. Calculate frequency-domain features (FFT-based)
    if historical_data.shape[1] >= 32: 
        for i in range(historical_data.shape[0]):
            fft_vals = np.abs(np.fft.rfft(historical_data[i, -32:]))
            feature_data.loc[i, 'dominant_freq'] = np.argmax(fft_vals[1:]) + 1 if len(fft_vals) > 1 else 0
            feature_data.loc[i, 'dominant_power'] = np.max(fft_vals[1:]) if len(fft_vals) > 1 else 0
            feature_data.loc[i, 'total_power'] = np.sum(fft_vals[1:]) if len(fft_vals) > 1 else 0
    
    # 6. Detect potential fault indicators
    if historical_data.shape[1] >= 5:
        max_changes = []
        for i in range(historical_data.shape[0]):
            max_change = 0
            for j in range(historical_data.shape[1] - 5):
                change = np.max(historical_data[i, j:j+5]) - np.min(historical_data[i, j:j+5])
                max_change = max(max_change, change)
            max_changes.append(max_change)
        feature_data['max_change_5pt'] = max_changes
    
    # 7. Calculate geological dip angle features
    if historical_data.shape[1] >= 10:
        dips = []
        for i in range(historical_data.shape[0]):
            # Use linear regression to find dip angle
            x = np.arange(10)
            y = historical_data[i, -10:]
            slope = np.polyfit(x, y, 1)[0]
            # Convert to degrees (slope is rise/run, arctangent gives angle)
            dip_angle = np.degrees(np.arctan(slope))
            dips.append(dip_angle)
        feature_data['dip_angle'] = dips
    
    # 8. Add the raw historical data (last 50 points)
    for i in range(min(50, historical_data.shape[1])):
        feature_data[f'raw_{i}'] = historical_data[:, -(i+1)]
    
    # 9. Create interaction features from important raw features
    if 'mean_last_5' in feature_data.columns and 'trend_last_10' in feature_data.columns:
        feature_data['mean_trend_interaction'] = feature_data['mean_last_5'] * feature_data['trend_last_10']
    
    if 'dip_angle' in feature_data.columns and 'max_change_5pt' in feature_data.columns:
        feature_data['dip_change_interaction'] = feature_data['dip_angle'] * feature_data['max_change_5pt']

    # 10. Non-linear transformations of important features
    for col in feature_data.columns:
        if col != 'geology_id' and not col.startswith('raw_'):
            feature_data[f'{col}_squared'] = feature_data[col] ** 2
            # Log transform for any potentially positive-only features
            if np.all(feature_data[col] > 0):
                feature_data[f'{col}_log'] = np.log1p(feature_data[col])
    
    # Add the original features as well
    for col in historical_cols:
        feature_data[col] = data[col]
    
    return feature_data

In [9]:
class LSTMForecastModel(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size,
        num_layers,
        output_size,
        dropout=0.2,
    ):
        super().__init__()
        
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        
        self.layer_norm = nn.LayerNorm(hidden_size)
        
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.activation = nn.GELU()
        
    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        
        lstm_out = lstm_out[:, -1, :]
        
        lstm_out = self.layer_norm(lstm_out)
        
        x = self.fc1(lstm_out)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

In [10]:
class TransformerForecastModel(nn.Module):
    def __init__(
        self,
        input_size,
        d_model=512,
        nhead=8,
        num_layers=4,
        dim_feedforward=2048,
        dropout=0.1,
        output_size=300
    ):
        super().__init__()
        
        self.input_projection = nn.Linear(input_size, d_model)
        
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=nhead, 
            dim_feedforward=dim_feedforward, 
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        
        self.output_layer = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, output_size)
        )
        
    def forward(self, x):
        
        x = self.input_projection(x)
        
        x = self.pos_encoder(x)
        
        transformer_output = self.transformer_encoder(x)

        output = self.output_layer(transformer_output[:, -1, :])
        
        return output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(1)]
        return self.dropout(x)

In [11]:
class TCNBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, dilation, dropout=0.2):
        super().__init__()
        
        # Padding для сохранения размерности
        padding = (kernel_size - 1) * dilation
        
        self.conv = nn.Conv1d(
            in_channels, 
            out_channels, 
            kernel_size, 
            padding=padding, 
            dilation=dilation
        )
        
        self.relu = nn.GELU()
        self.dropout = nn.Dropout(dropout)
        
        self.residual = nn.Conv1d(in_channels, out_channels, 1) if in_channels != out_channels else nn.Identity()
        
        self.layer_norm = nn.LayerNorm(out_channels)
        
    def forward(self, x):
        residual = self.residual(x)
        
        out = self.conv(x)
        
        out = out[:, :, :-self.conv.padding[0]]
        
        out = out + residual
        
        out = self.relu(out)
        out = self.dropout(out)
        
        out = out.permute(0, 2, 1)
        out = self.layer_norm(out)
        out = out.permute(0, 2, 1) 
        
        return out

In [12]:
class TCNForecastModel(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size=128,
        kernel_size=3,
        num_layers=8,
        dropout=0.2,
        output_size=300
    ):
        super().__init__()
        
        self.input_projection = nn.Linear(input_size, hidden_size)
        
        self.tcn_blocks = nn.ModuleList()
        for i in range(num_layers):
            dilation = 2 ** i  # 1, 2, 4, 8, ...
            self.tcn_blocks.append(
                TCNBlock(hidden_size, hidden_size, kernel_size, dilation, dropout)
            )
        
        self.output_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, output_size)
        )
        
    def forward(self, x):
        x = self.input_projection(x)
        
        x = x.permute(0, 2, 1)
        
        for block in self.tcn_blocks:
            x = block(x)
        
        x = x[:, :, -1]
        
        x = self.output_layer(x)
        
        return x

In [13]:
class HybridCNNLSTMModel(nn.Module):
    def __init__(
        self,
        input_size,
        cnn_filters=[64, 128, 128, 256],
        kernel_size=3,
        lstm_hidden=512,
        lstm_layers=2,
        dropout=0.2,
        output_size=300
    ):
        super().__init__()
        
        self.cnn_layers = nn.ModuleList()
        
        self.cnn_layers.append(nn.Conv1d(input_size, cnn_filters[0], kernel_size, padding=kernel_size//2))
        
        for i in range(1, len(cnn_filters)):
            self.cnn_layers.append(
                nn.Conv1d(cnn_filters[i-1], cnn_filters[i], kernel_size, padding=kernel_size//2)
            )
        
        self.batch_norms = nn.ModuleList([
            nn.BatchNorm1d(filters) for filters in cnn_filters
        ])
        
        self.act = nn.GELU()
        self.dropout = nn.Dropout(dropout)
        
        self.lstm = nn.LSTM(
            input_size=cnn_filters[-1],
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            dropout=dropout if lstm_layers > 1 else 0,
            bidirectional=True 
        )
        
        self.layer_norm = nn.LayerNorm(lstm_hidden * 2) 
        
        self.output_layer = nn.Sequential(
            nn.Linear(lstm_hidden * 2, lstm_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(lstm_hidden, output_size)
        )
        
    def forward(self, x):
        x = x.permute(0, 2, 1)
        
        for i, (conv, bn) in enumerate(zip(self.cnn_layers, self.batch_norms)):
            x = conv(x)
            x = bn(x)
            x = self.act(x)
            x = self.dropout(x)
        
        x = x.permute(0, 2, 1)
        
        lstm_out, _ = self.lstm(x)
        
        lstm_out = lstm_out[:, -1, :]
        
        lstm_out = self.layer_norm(lstm_out)

        x = self.output_layer(lstm_out)
        
        return x

In [14]:
class GeologyDataset(Dataset):
    def __init__(self, features, targets=None, is_test=False, scale_features=True):
        self.features = features
        self.targets = targets
        self.is_test = is_test
        
        if scale_features:
            self.feature_scaler = StandardScaler()
            self.features = self.feature_scaler.fit_transform(self.features)
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        x = self.features[idx]
        
        x = x.reshape(-1, 1)
        
        if self.is_test:
            return x
        else:
            y = self.targets[idx]
            return x, y


In [15]:
def preprocess_data(df, engineered_features=None, target_cols=None, is_test=False):
    if engineered_features is None:
        # If no engineered features provided, use raw data
        feature_cols = [c for c in df.columns if c != 'geology_id' and (c.startswith('-') or c == '0')]
        X = df[feature_cols].values
    else:
        # Use the engineered features, dropping the ID column
        X = engineered_features.drop('geology_id', axis=1).values
    
    if not is_test:
        y = df[target_cols].values
        return X, y
    
    return X

In [16]:
def compute_nll_score(solution, submission, row_id_column_name='geology_id'):
    solution_copy = solution.copy()
    submission_copy = submission.copy()
    
    del solution_copy[row_id_column_name]
    del submission_copy[row_id_column_name]

    NEGATIVE_PART = -299
    LARGEST_CHUNK = 600
    SMALLEST_CHUNK = 350
    TOTAL_REALIZATIONS = 10
    INFLATION_SIGMA = 600
    
    sigma_2 = np.ones((LARGEST_CHUNK+NEGATIVE_PART-1))
    from_ranges = [1, 61, 245]
    to_ranges_excl = [61, 245, 301]
    log_slopes = [1.0406028049510443, 0.0, 7.835345062351012]
    log_offsets = [-6.430669850650689, -2.1617411566043896, -45.24876794412965]

    for growth_mode in range(len(from_ranges)):
        for i in range(from_ranges[growth_mode], to_ranges_excl[growth_mode]):
            sigma_2[i-1] = np.exp(np.log(i)*log_slopes[growth_mode]+log_offsets[growth_mode])

    sigma_2 *= INFLATION_SIGMA
  
    cov_matrix_inv_diag = 1. / sigma_2
    
    num_rows = solution_copy.shape[0]
    num_columns = LARGEST_CHUNK + NEGATIVE_PART - 1
    
    p = 1./TOTAL_REALIZATIONS
    log_p = np.log(p)
    
    solution_arr = np.zeros((num_rows, TOTAL_REALIZATIONS, num_columns))
    submission_arr = np.zeros((num_rows, TOTAL_REALIZATIONS, num_columns))
    
    for k in range(TOTAL_REALIZATIONS):
        for i in range(num_columns):
            if k == 0:
                column_name = str(i+1)
            else:
                column_name = f"r_{k}_pos_{i+1}"
            solution_arr[:, k, i] = solution_copy[column_name].values
            submission_arr[:, k, i] = submission_copy[column_name].values

    misfit = solution_arr - submission_arr
    inner_product_matrix = np.sum(cov_matrix_inv_diag * misfit * misfit, axis=2)
    
    nll = -logsumexp(log_p - inner_product_matrix, axis=1)
    
    return nll.mean()

In [17]:
def train_model_with_nll_loss(model, train_loader, optimizer, device, epoch=0, total_epochs=30):
    model.train()
    train_losses = []
    
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{total_epochs}")
    
    for data, target in pbar:
        data, target = data.to(device, dtype=torch.float32), target.to(device, dtype=torch.float32)
        
        optimizer.zero_grad()
        output = model(data)
        
        target_mean = target.mean(dim=0)
        target_std = target.std(dim=0) + 1e-6
        
        normalized_output = (output - target_mean) / target_std
        normalized_target = (target - target_mean) / target_std
        
        loss = F.mse_loss(normalized_output, normalized_target)
        
        if output.shape[1] > 1:
            smoothness_penalty = torch.mean(torch.abs(output[:, 1:] - output[:, :-1]))
            loss += 0.01 * smoothness_penalty
        
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        optimizer.step()
        
        train_losses.append(loss.item())
        pbar.set_postfix({'loss': f"{loss.item():.6f}"})
    
    return np.mean(train_losses)

In [18]:
def validate_model(model, val_loader, device):
    model.eval()
    val_losses = []
    val_preds = []
    val_targets = []
    
    with torch.no_grad():
        for data, target in tqdm(val_loader, desc="Validating"):
            data, target = data.to(device, dtype=torch.float32), target.to(device, dtype=torch.float32)
            output = model(data)
            
            target_mean = target.mean(dim=0)
            target_std = target.std(dim=0) + 1e-6
            
            normalized_output = (output - target_mean) / target_std
            normalized_target = (target - target_mean) / target_std
            
            loss = F.mse_loss(normalized_output, normalized_target)
            
            val_losses.append(loss.item())
            val_preds.append(output.cpu().numpy())
            val_targets.append(target.cpu().numpy())
    
    val_preds = np.concatenate(val_preds)
    val_targets = np.concatenate(val_targets)
    
    return np.mean(val_losses), val_preds, val_targets

In [19]:
def generate_diverse_realizations(base_predictions, num_samples=10, diversity_factor=0.2):
    num_rows, num_cols = base_predictions.shape
    realizations = np.zeros((num_samples, num_rows, num_cols))

    realizations[0] = base_predictions

    for i in range(1, num_samples):
        realization = base_predictions.copy()
        
        for j in range(num_rows):
            noise = np.random.normal(0, diversity_factor, num_cols)
            
            smoothed_noise = gaussian_filter1d(noise, sigma=5.0)
            
            position_factor = np.linspace(0.1, 1.0, num_cols)
            scaled_noise = smoothed_noise * position_factor
            
            realization[j] += scaled_noise

            for k in range(1, num_cols):

                max_change = 2.0 * (k/num_cols + 0.1)  # Allow larger changes further away
                diff = realization[j, k] - realization[j, k-1]
                if abs(diff) > max_change:
                    realization[j, k] = realization[j, k-1] + np.sign(diff) * max_change
        
        realizations[i] = realization
    
    return realizations

In [20]:
def generate_fold_realizations(base_predictions, num_realizations=10):
    realizations = generate_diverse_realizations(
        base_predictions, 
        num_samples=num_realizations,
        diversity_factor=0.15  # Control the diversity level
    )
    return realizations

In [21]:
def visualize_features(train_features, y):
    feature_cols = [col for col in train_features.columns if col != 'geology_id']
    
    target_cols = [str(i) for i in range(1, 11)]
    target_cols = [col for col in target_cols if col in train.columns]
    
    correlations = []
    for tcol in target_cols:
        if tcol in train.columns:
            for fcol in feature_cols:
                corr = np.corrcoef(train_features[fcol], train[tcol])[0, 1]
                correlations.append((fcol, tcol, abs(corr)))
    
    top_correlations = sorted(correlations, key=lambda x: x[2], reverse=True)[:15]
    
    plt.figure(figsize=(12, 8))
    plot_data = pd.DataFrame(top_correlations, columns=['Feature', 'Target', 'Correlation'])
    sns.barplot(data=plot_data, x='Correlation', y='Feature', hue='Target')
    plt.title('Top Feature Correlations with Targets')
    plt.tight_layout()
    
    try:
        wandb.log({"feature_correlations": wandb.Image(plt)})
    except:
        pass
    
    plt.close()

In [22]:
def get_default_config(model_type):    
    if model_type == 'LSTM':
        return {
            'model_type': 'LSTM',
            'hidden_size': 512,
            'num_layers': 2,
            'dropout': 0.2,
            'learning_rate': 5e-4,
            'weight_decay': 1e-5,
            'batch_size': 128,
            'epochs': 50,
            'seed': SEED,
            'feature_engineering': 'advanced',
            'optimizer': 'adamw',
            'scheduler': 'onecycle'
        }
    elif model_type == 'Transformer':
        return {
            'model_type': 'Transformer',
            'd_model': 512,
            'nhead': 8,
            'num_layers': 4,
            'dim_feedforward': 1024,
            'dropout': 0.2,
            'learning_rate': 4e-4,
            'weight_decay': 1e-5,
            'batch_size': 128,
            'epochs': 50,
            'seed': SEED,
            'feature_engineering': 'advanced',
            'optimizer': 'adamw',
            'scheduler': 'cosine'
        }
    elif model_type == 'TCN':
        return {
            'model_type': 'TCN',
            'hidden_size': 256,
            'kernel_size': 3,
            'num_layers': 8,
            'dropout': 0.2,
            'learning_rate': 5e-4,
            'weight_decay': 1e-5,
            'batch_size': 128,
            'epochs': 50,
            'seed': SEED,
            'feature_engineering': 'advanced',
            'optimizer': 'adam',
            'scheduler': 'onecycle'
        }
    elif model_type == 'HybridCNNLSTM':
        return {
            'model_type': 'HybridCNNLSTM',
            'cnn_filters': [64, 128, 128, 256],
            'kernel_size': 3,
            'hidden_size': 512,
            'num_layers': 2,
            'dropout': 0.3,
            'learning_rate': 3e-4,
            'weight_decay': 1e-5,
            'batch_size': 128,
            'epochs': 60,
            'seed': SEED,
            'feature_engineering': 'advanced',
            'optimizer': 'adamw',
            'scheduler': 'cosine'
        }
    else:
        raise ValueError(f"Неизвестный тип модели: {model_type}")

In [23]:
def create_model(config, device):
    model_type = config['model_type']
    input_features_size = 1 
    output_size = 3000
    
    if model_type == 'LSTM':
        model = LSTMForecastModel(
            input_size=input_features_size,
            hidden_size=config['hidden_size'],
            num_layers=config['num_layers'],
            output_size=output_size,
            dropout=config['dropout']
        )
    elif model_type == 'Transformer':
        model = TransformerForecastModel(
            input_size=input_features_size,
            d_model=config.get('d_model', 512),
            nhead=config.get('nhead', 8),
            num_layers=config['num_layers'],
            dim_feedforward=config.get('dim_feedforward', 2048),
            dropout=config['dropout'],
            output_size=output_size
        )
    elif model_type == 'TCN':
        model = TCNForecastModel(
            input_size=input_features_size,
            hidden_size=config['hidden_size'],
            kernel_size=config.get('kernel_size', 3),
            num_layers=config['num_layers'],
            dropout=config['dropout'],
            output_size=output_size
        )
    elif model_type == 'HybridCNNLSTM':
        model = HybridCNNLSTMModel(
            input_size=input_features_size,
            cnn_filters=config.get('cnn_filters', [64, 128, 128, 256]),
            kernel_size=config.get('kernel_size', 3),
            lstm_hidden=config['hidden_size'],
            lstm_layers=config['num_layers'],
            dropout=config['dropout'],
            output_size=output_size
        )
    else:
        raise ValueError(f"Неизвестный тип модели: {model_type}")
    
    return model.to(device)

In [24]:
def train_and_predict_single_model(
    X_train, 
    y_train,
    X_val,
    y_val,
    X_test,
    config,
    run_name=None,
    autofinish=True,
    wandb_run=None
):
    try:
        # Используем переданный wandb_run или инициализируем новый
        run = wandb_run if wandb_run is not None else init_wandb(project_name=run_name, config=config)
        
        train_dataset = GeologyDataset(X_train, y_train)
        val_dataset = GeologyDataset(X_val, y_val)
        test_dataset = GeologyDataset(X_test, is_test=True)
        
        train_loader = DataLoader(
            train_dataset, 
            batch_size=config['batch_size'], 
            shuffle=True,
            pin_memory=True, 
            num_workers=2  
        )
        val_loader = DataLoader(
            val_dataset, 
            batch_size=config['batch_size'], 
            shuffle=False,
            pin_memory=True,
            num_workers=2
        )
        test_loader = DataLoader(
            test_dataset, 
            batch_size=config['batch_size'], 
            shuffle=False,
            pin_memory=True,
            num_workers=2
        )
        
        model = create_model(config, device)
        
        if config.get('optimizer', 'adamw').lower() == 'adamw':
            optimizer = optim.AdamW(
                model.parameters(),
                lr=config['learning_rate'],
                weight_decay=config['weight_decay'],
                eps=1e-8
            )
        elif config.get('optimizer', 'adamw').lower() == 'adam':
            optimizer = optim.Adam(
                model.parameters(),
                lr=config['learning_rate'],
                weight_decay=config['weight_decay'],
                eps=1e-8
            )
        elif config.get('optimizer', 'adamw').lower() == 'sgd':
            optimizer = optim.SGD(
                model.parameters(),
                lr=config['learning_rate'],
                momentum=config.get('momentum', 0.9),
                weight_decay=config['weight_decay']
            )
        
        if config.get('scheduler', 'onecycle').lower() == 'onecycle':
            steps_per_epoch = len(train_loader)
            scheduler = optim.lr_scheduler.OneCycleLR(
                optimizer,
                max_lr=config['learning_rate'],
                epochs=config['epochs'],
                steps_per_epoch=steps_per_epoch,
                pct_start=0.3,
                div_factor=25,
                final_div_factor=1000,
            )
        elif config.get('scheduler', 'onecycle').lower() == 'cosine':
            scheduler = optim.lr_scheduler.CosineAnnealingLR(
                optimizer, 
                T_max=config['epochs'] // 2,
                eta_min=config['learning_rate'] / 1000
            )
        elif config.get('scheduler', 'onecycle').lower() == 'reduce':
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                mode='min',
                factor=0.5,
                patience=5,
                min_lr=config['learning_rate'] / 100
            )
        
        best_val_loss = float('inf')
        val_predictions = None
        
        # early stopping
        patience = 10
        counter = 0
        min_delta = 1e-4
        
        print(f"Training model {config['model_type']}...")
        for epoch in range(config['epochs']):
            train_loss = train_model_with_nll_loss(
                model, train_loader, optimizer, device, epoch, config['epochs']
            )
            
            val_loss, val_preds, val_targets = validate_model(model, val_loader, device)
            
            val_predictions = val_preds
            
            if config.get('scheduler', 'onecycle').lower() in ['onecycle', 'cosine']:
                scheduler.step()
            elif config.get('scheduler', 'onecycle').lower() == 'reduce':
                scheduler.step(val_loss)
    
            if run:
                run.log({
                    "epoch": epoch + 1,
                    "train_loss": train_loss,
                    "val_loss": val_loss,
                    "learning_rate": optimizer.param_groups[0]['lr']
                })
            
            print(f"Epoch {epoch+1}/{config['epochs']} - Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")
            
            # loss check
            if epoch + 1 == 12 and val_loss > 1.0:
                print(f"Validation loss > 1.0 at epoch 10 ({val_loss:.6f}). Stopping training.")
                if run:
                    run.log({"early_stop_reason": "high_loss_at_epoch_10", "best_val_loss": best_val_loss})
                break
            
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                model_path = f"model_{config['model_type']}.pt"
                torch.save(model.state_dict(), model_path)
                if run:
                    run.save(model_path)
                print(f"Saved new best model with validation loss: {val_loss:.6f}")
                counter = 0 
            else:
                if val_loss > best_val_loss - min_delta:
                    counter += 1
                    if counter >= patience:
                        print(f"Early stopping triggered at epoch {epoch+1}. Best val_loss: {best_val_loss:.6f}")
                        if run:
                            run.log({"early_stop_reason": "no_improvement", "best_val_loss": best_val_loss})
                        break
        
        model.load_state_dict(torch.load(f"model_{config['model_type']}.pt"))
        model.eval()
        test_preds = []
        
        with torch.no_grad():
            for data in tqdm(test_loader, desc="Predicting test data"):
                if isinstance(data, list):
                    data = data[0]
                data = data.to(device, dtype=torch.float32)
                output = model(data)
                test_preds.append(output.cpu().numpy())
        
        base_test_predictions = np.concatenate(test_preds)
        
        val_preds_df = pd.DataFrame(
            data=val_predictions,
            columns=TARGETS,
        )
        val_preds_df['geology_id'] = val_dataset.features[:, 0]
        
        val_solution_df = pd.DataFrame(
            data=y_val,
            columns=TARGETS,
        )
        val_solution_df['geology_id'] = val_dataset.features[:, 0]
        
        nll_score = compute_nll_score(val_solution_df, val_preds_df)
        
        # Завершаем run только если мы его создали и autofinish=True
        if run and autofinish and wandb_run is None:
            run.log({"val_nll_score": nll_score})
            run.finish()
        
        return base_test_predictions, val_predictions, nll_score
        
    finally:
        torch.cuda.empty_cache()
        
        from torch.utils.data import _utils
        if hasattr(_utils.worker, "_worker_pool"):
            _utils.worker._worker_pool.close()
            _utils.worker._worker_pool.join()
        elif hasattr(_utils.worker, "_shutdown_all_workers"):
            _utils.worker._shutdown_all_workers()
        
        import gc
        gc.collect()

In [25]:
def create_submission(base_predictions, config, val_nll_score):
    submission = sub.copy()
    
    for i in range(300):
        col_name = str(i+1)
        submission[col_name] = base_predictions[:, i]
    
    realizations = generate_diverse_realizations(
        base_predictions, 
        num_samples=10, 
        diversity_factor=0.15 + 0.05 * (config.get('diversity_factor', 1.0) - 1.0)
    )
    
    for r_idx in range(1, 10): 
        for i in range(300):
            col_name = f"r_{r_idx}_pos_{i+1}"
            submission[col_name] = realizations[r_idx][:, i]
    
    submission_file = f"submission_{config['model_type']}_{val_nll_score:.6f}.csv"
    submission.to_csv(submission_file, index=False)
    print(f"\nSubmission file saved: {submission_file}")
    
    expected_cols = sub.columns.tolist()
    actual_cols = submission.columns.tolist()
    
    if set(expected_cols) != set(actual_cols):
        print("WARNING: Submission columns don't match expected format!")
        missing = set(expected_cols) - set(actual_cols)
        extra = set(actual_cols) - set(expected_cols)
        if missing:
            print(f"Missing columns: {missing}")
        if extra:
            print(f"Extra columns: {extra}")
    else:
        print("Submission format validated successfully!")
    
    return submission

In [26]:
def visualize_model_predictions(submission, config, run=None):
    try:
        plt.figure(figsize=(15, 8))

        sample_indices = [0, 10, 20]
        
        for sample_idx in sample_indices:
            plt.figure(figsize=(15, 8))
            
            x_coords = np.arange(1, 301)
            plt.plot(x_coords, submission.iloc[sample_idx, 1:301].values, 
                    color='blue', label='Baseline', linewidth=2)
            
            colors = ['red', 'green', 'purple']
            for r in range(1, 4): 
                cols = [f"r_{r}_pos_{i+1}" for i in range(300)]
                plt.plot(x_coords, submission.loc[submission.index[sample_idx], cols].values,
                        color=colors[(r-1) % len(colors)], label=f'Realization {r}', alpha=0.7)
            
            plt.title(f"{config['model_type']} - Sample {sample_idx}")
            plt.xlabel("Position")
            plt.ylabel("Layer Depth (Z coordinate)")
            plt.legend()
            plt.grid(True, alpha=0.3)

            if run:
                run.log({f"predictions_sample_{sample_idx}": wandb.Image(plt)})
            
            plt.close()
        
    except Exception as e:
        print(f"Visualization error: {e}")

In [27]:
def split_train_val(X_features, y, val_size=0.2, random_state=42):
    n_samples = len(X_features)
    indices = np.arange(n_samples)
    np.random.seed(random_state)
    np.random.shuffle(indices)
    
    val_samples = int(val_size * n_samples)
    val_indices = indices[:val_samples]
    train_indices = indices[val_samples:]
    
    X_train = X_features[train_indices]
    y_train = y[train_indices]
    X_val = X_features[val_indices]
    y_val = y[val_indices]
    
    return X_train, y_train, X_val, y_val, train_indices, val_indices

In [28]:
def run_experiment(config, autofinish=True, wandb_run=None):
    try:
        seed_everything(config['seed'])
        
        print(f"\n{'='*50}\nRunning experiment with {config['model_type']}\n{'='*50}")
        print(f"Config: {config}")
        
        X_train, y_train, X_val, y_val, train_indices, val_indices = split_train_val(
            X_features, y, val_size=0.2, random_state=config['seed']
        )

        base_predictions, val_predictions, val_nll_score = train_and_predict_single_model(
        X_train, y_train, X_val, y_val, X_features_test, config, 
        run_name="geology-forecast-challenge", 
        autofinish=autofinish, wandb_run=wandb_run)
        
        train_sub.loc[val_indices, TARGETS] = val_predictions
        
        submission = create_submission(base_predictions, config, val_nll_score)
    
        visualize_model_predictions(submission, config)
        
        print(f"Finished experiment with {config['model_type']}, validation NLL: {val_nll_score:.6f}")
        
        return val_nll_score
    
    except RuntimeError as e:
        if "CUDA out of memory" in str(e):
            print(f"CUDA OOM error with config: {config}")
            
            if wandb.run is not None:
                wandb.log({"cuda_oom_error": True, "error_message": str(e)})
                
            # Большое значение метрики, чтобы байесовский оптимизатор избегал таких конфигураций
            return float('inf')
        else:
            raise e

In [29]:
def visualize_predictions(test_idx=0, num_realizations=3):
    plt.figure(figsize=(15, 8))
    
    x_coords = np.arange(1, 301)
    plt.plot(x_coords, submission.iloc[test_idx, 1:301].values, 
             color='blue', label='Realization 0', linewidth=2)
    
    colors = ['red', 'green', 'purple']
    for r in range(1, min(num_realizations+1, 10)):
        cols = [f"r_{r}_pos_{i+1}" for i in range(300)]
        plt.plot(x_coords, submission.loc[submission.index[test_idx], cols].values,
                color=colors[(r-1) % len(colors)], label=f'Realization {r}', alpha=0.7)
    
    plt.title(f"Multiple Geological Sequence Realizations for Sample {test_idx}")
    plt.xlabel("Position")
    plt.ylabel("Layer Depth (Z coordinate)")
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    try:
        wandb.log({"prediction_visualization": wandb.Image(plt)})
    except:
        pass
    
    plt.close()

In [30]:
def analyze_geological_patterns(submission_df, sample_indices=None, num_samples=5):
    if sample_indices is None:
        sample_indices = np.random.choice(len(submission_df), min(num_samples, len(submission_df)), replace=False)
    
    plt.figure(figsize=(18, 12))
    
    for idx, i in enumerate(sample_indices):
        plt.subplot(len(sample_indices), 1, idx+1)
        
        base_pred = submission_df.iloc[i, 1:301].values
        
        slopes = np.diff(base_pred)
        
        threshold = np.std(slopes) * 2.5
        fault_indicators = np.where(np.abs(slopes) > threshold)[0]
        
        plt.plot(range(1, 301), base_pred, 'b-', linewidth=2, label='Predicted Sequence')
        
        if len(fault_indicators) > 0:
            plt.scatter([x+1 for x in fault_indicators], 
                       [base_pred[x] for x in fault_indicators],
                       color='red', s=80, marker='x', label='Potential Fault/Change')
        
        segment_size = 50
        for seg_start in range(0, 300, segment_size):
            seg_end = min(seg_start + segment_size, 300)
            if seg_end - seg_start > 10:  # Only fit if enough points
                x_seg = np.arange(seg_start, seg_end)
                y_seg = base_pred[seg_start:seg_end]
                # Fit a line to this segment
                z = np.polyfit(x_seg, y_seg, 1)
                p = np.poly1d(z)
                plt.plot(x_seg+1, p(x_seg), '--', linewidth=1.5, 
                         alpha=0.7, label=f'Trend (Seg {seg_start}-{seg_end})')
        
        plt.title(f"Geological Analysis for Sample {i}")
        plt.xlabel("Position")
        plt.ylabel("Layer Depth (Z)")
        plt.grid(True, alpha=0.3)
        if idx == 0:
            plt.legend(loc='upper right')
    
    plt.tight_layout()

    try:
        wandb.log({"geological_analysis": wandb.Image(plt)})
    except:
        pass
    
    plt.close()

In [31]:
def calculate_prediction_uncertainty(submission_df, num_samples=5):
    sample_indices = np.random.choice(len(submission_df), min(num_samples, len(submission_df)), replace=False)
    
    plt.figure(figsize=(15, 10))
    
    for idx, i in enumerate(sample_indices):
        plt.subplot(num_samples, 1, idx+1)
        
        realizations = []
        
        base_realization = submission_df.iloc[i, 1:301].values
        realizations.append(base_realization)
        
        for r in range(1, 10):
            cols = [f"r_{r}_pos_{i+1}" for i in range(300)]
            realization = submission_df.loc[submission_df.index[i], cols].values
            realizations.append(realization)
        
        realizations = np.array(realizations)
        
        mean_prediction = np.mean(realizations, axis=0)
        std_prediction = np.std(realizations, axis=0)
        
        x_coords = np.arange(1, 301)
        plt.plot(x_coords, mean_prediction, 'b-', label='Mean Prediction')
        
        plt.fill_between(x_coords, 
                         mean_prediction - 2*std_prediction,
                         mean_prediction + 2*std_prediction,
                         alpha=0.3, color='blue',
                         label='95% Confidence Interval')
        
        high_uncertainty = np.where(std_prediction > np.mean(std_prediction) + np.std(std_prediction))[0]
        if len(high_uncertainty) > 0:
            plt.scatter(high_uncertainty+1, 
                       mean_prediction[high_uncertainty],
                       color='red', s=50, alpha=0.7, 
                       label='High Uncertainty Regions')
        
        plt.title(f"Prediction Uncertainty Analysis for Sample {i}")
        plt.xlabel("Position")
        plt.ylabel("Layer Depth (Z)")
        plt.grid(True, alpha=0.3)
        if idx == 0:
            plt.legend(loc='upper right')
    
    plt.tight_layout()
    
    try:
        wandb.log({"uncertainty_analysis": wandb.Image(plt)})
    except:
        pass
    
    plt.close()

In [32]:
def create_sweep_config():
    sweep_config = {
        'method': 'bayes', 
        'metric': {
            'name': 'val_nll_score',
            'goal': 'minimize'
        },
        'parameters': {
            'model_type': {
                'values': ['HybridCNNLSTM']
            },
            'learning_rate': {
                'distribution': 'log_uniform_values', 
                'min': 3e-4,
                'max': 1e-2
            },
            'batch_size': {
                'values': [128]
            },
            'dropout': {
                'distribution': 'uniform',
                'min': 0.1,
                'max': 0.3 
            },
            'hidden_size': {
                'values': [128]
            },
            'num_layers': {
                'values': [2, 3, 4]
            },
            'weight_decay': {
                'values': [1e-5, 1e-4]
            },
            'optimizer': {
                'values': ['adamw', 'adamw', 'sgd']
            },
            'scheduler': {
                'values': ['cosine', 'reduce', 'onecycle']
            },
            'diversity_factor': {
                'distribution': 'uniform',
                'min': 0.8,
                'max': 1.5
            }
        }
    }
    
    sweep_config['parameters']['d_model'] = {
        'values': [128]
    }
    sweep_config['parameters']['nhead'] = {
        'values': [4, 8, 12]
    }
    sweep_config['parameters']['dim_feedforward'] = {
        'values': [128, 256]
    }
    
    sweep_config['parameters']['kernel_size'] = {
        'values': [3, 5]
    }
    
    return sweep_config

In [33]:
def sweep_agent():
    global X_features, y, X_features_test
    
    # Инициализируем wandb run и сохраняем объект run
    run = wandb.init(reinit=True)
    
    config = wandb.config
    model_type = config['model_type']
    
    default_config = get_default_config(model_type)
    
    experiment_config = dict(config)
    
    for key, value in default_config.items():
        if key not in experiment_config:
            experiment_config[key] = value
    
    transformer_params = ['d_model', 'nhead', 'dim_feedforward']
    tcn_params = ['kernel_size']
    
    if model_type != 'Transformer':
        for param in transformer_params:
            if param in experiment_config: 
                del experiment_config[param]
    
    if model_type != 'TCN':
        for param in tcn_params:
            if param in experiment_config:
                del experiment_config[param]
    
    print(f"Running experiment with config: {experiment_config}")
    
    # Передаем объект run в run_experiment
    val_nll_score = run_experiment(experiment_config, autofinish=False, wandb_run=run)
    
    # Логируем с использованием того же объекта run
    run.log({'val_nll_score': val_nll_score, 'final_val_nll_score': val_nll_score})
    
    # wandb.finish() не нужен, так как агент сам позаботится о завершении
    
    return val_nll_score

In [34]:
def run_sweep(count=10):
    # Инициализируем начальный run для создания sweep
    init_wandb(project_name="geology-forecast-challenge-sweep-gpu-bayes-30-server-HybridCNNLSTM")
    
    sweep_config = create_sweep_config()
    sweep_id = wandb.sweep(sweep_config, project="geology-forecast-challenge-sweep-gpu-bayes-30-server-HybridCNNLSTM")
    
    # Завершаем начальный run перед запуском агента
    wandb.finish()
    
    # Запускаем агента
    wandb.agent(sweep_id, function=sweep_agent, count=count)

In [35]:
def run_single_model(model_type='HybridCNNLSTM', custom_params=None):
    config = get_default_config(model_type)
    
    if custom_params:
        config.update(custom_params)
    
    val_nll_score = run_experiment(config)
    
    return val_nll_score, config

In [36]:
"""
score, config = run_single_model('HybridCNNLSTM', {
    'hidden_size': 512, 
    'num_layers': 2,
    'dropout': 0.3,
    'learning_rate': 3e-4,
    'epochs': 60 
})

print(f"Final validation NLL score: {score:.6f}")
"""

'\nscore, config = run_single_model(\'HybridCNNLSTM\', {\n    \'hidden_size\': 512, \n    \'num_layers\': 2,\n    \'dropout\': 0.3,\n    \'learning_rate\': 3e-4,\n    \'epochs\': 60 \n})\n\nprint(f"Final validation NLL score: {score:.6f}")\n'

In [37]:
print("Engineering features for train data...")
train_features = engineer_features(train)

print("Engineering features for test data...")
test_features = engineer_features(test)

X_features = train_features.drop('geology_id', axis=1).values
y = train[TARGETS].values
X_features_test = test_features.drop('geology_id', axis=1).values

print(f"Feature shape: {X_features.shape}, Target shape: {y.shape}")

Engineering features for train data...
Engineering features for test data...
Feature shape: (1510, 397), Target shape: (1510, 3000)


In [38]:
# run_sweep(count=30)

In [42]:
# =====================================
# ОПТИМИЗИРОВАННОЕ ОБУЧЕНИЕ МОДЕЛИ
# =====================================

# Импорт дополнительных библиотек
from sklearn.model_selection import KFold, StratifiedKFold
import torch.cuda.amp as amp
from torch.utils.data import ConcatDataset
import timeit
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

# Определение оптимального конфига для HybridCNNLSTM
def get_optimal_config():
    return {
        'model_type': 'HybridCNNLSTM',
        'cnn_filters': [64, 128, 128, 256],  # Увеличены размеры фильтров
        'kernel_size': 5,                    # Увеличенное окно свертки
        'hidden_size': 128,                  # Увеличен размер скрытого слоя
        'num_layers': 3,                     # Добавлен дополнительный слой LSTM
        'dropout': 0.1074051389697947,                      # Оптимальное значение дропаута
        'learning_rate': 0.001588625112561725,               # Будет адаптироваться с Ranger
        'weight_decay': 0.00001,                # Оптимизированное значение регуляризации
        'batch_size': 128,                    # Уменьшен размер батча для лучшей сходимости
        'epochs': 100,                       # Увеличенное число эпох для K-fold
        'seed': SEED,
        'feature_engineering': 'advanced',
        'optimizer': 'ranger',               # Используем Ranger вместо AdamW
        'scheduler': 'cosinewr',             # Cosine annealing with warm restarts
        'early_stopping_patience': 15,       # Увеличенное терпение для early stopping
        'use_amp': True,                     # Использование mixed precision
        'use_swa': True,                     # Stochastic Weight Averaging
        'swa_start': 50,                     # Начать SWA с эпохи 50
        'mixup_alpha': 0.2,                  # Параметр для Mixup аугментации
    }

# Реализация оптимизатора Ranger (RAdam + LookAhead)
class Ranger(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, alpha=0.5, k=6, betas=(0.95, 0.999), eps=1e-8, weight_decay=0):
        # RAdam params
        defaults = dict(lr=lr, alpha=alpha, k=k, betas=betas, eps=eps, weight_decay=weight_decay)
        self.buffer = [[None, None, None] for ind in range(10)]
        
        super(Ranger, self).__init__(params, defaults)
        
        # Lookаhead параметры
        for group in self.param_groups:
            group['step_counter'] = 0
        
        # Создаем копию параметров для LookAhead
        self.alpha = alpha
        self.k = k
        
        for group in self.param_groups:
            for p in group['params']:
                if p.requires_grad:
                    p.data = p.data.clone()
                    state = self.state[p]
                    state['slow_params'] = torch.zeros_like(p.data)
                    state['slow_params'].copy_(p.data)
    
    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()
        
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('Ranger не поддерживает разреженные градиенты')
                
                p_data_fp32 = p.data.float()
                
                state = self.state[p]
                
                if len(state) == 1:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)
                
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']
                
                # RAdam
                state['step'] += 1
                
                # Decay экспоненциально взвешенных скользящих средних
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                
                # Применяем корректировку смещения
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']
                step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1
                
                # RAdam часть
                buffered = self.buffer[int(state['step'] % 10)]
                if state['step'] == buffered[0]:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma
                    
                    if N_sma >= 5:
                        step_size = group['lr'] * math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        step_size = group['lr'] / (1 - beta1 ** state['step'])
                    buffered[2] = step_size
                
                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])
                
                if N_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size)
                else:
                    p_data_fp32.add_(exp_avg, alpha=-step_size)
                
                p.data.copy_(p_data_fp32)
                
                # LookAhead часть
                group['step_counter'] += 1
                if group['step_counter'] % self.k == 0:
                    slow_params = state['slow_params']
                    slow_params.add_(p.data - slow_params, alpha=self.alpha)
                    p.data.copy_(slow_params)
        
        return loss

# Реализация Mixup аугментации
def mixup_data(x, y, alpha=0.2):
    """Создает новые смешанные образцы."""
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Вычисляет потери для смешанных образцов."""
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)

# Модифицированная функция обучения с применением Mixup и AMP
def train_model_with_advanced_techniques(model, train_loader, optimizer, device, scaler, config, epoch=0, total_epochs=30):
    model.train()
    train_losses = []
    
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{total_epochs}")
    
    for data, target in pbar:
        data, target = data.to(device, dtype=torch.float32), target.to(device, dtype=torch.float32)
        
        # Применение Mixup аугментации
        if config.get('mixup_alpha', 0) > 0:
            data, targets_a, targets_b, lam = mixup_data(data, target, config['mixup_alpha'])
        
        optimizer.zero_grad()
        
        # Используем автоматическую mixed precision
        if config.get('use_amp', False):
            with amp.autocast():
                output = model(data)
                
                # Нормализация выхода и целевых значений
                target_mean = target.mean(dim=0)
                target_std = target.std(dim=0) + 1e-6
                
                normalized_output = (output - target_mean) / target_std
                
                if config.get('mixup_alpha', 0) > 0:
                    normalized_target_a = (targets_a - target_mean) / target_std
                    normalized_target_b = (targets_b - target_mean) / target_std
                    loss = mixup_criterion(F.mse_loss, normalized_output, normalized_target_a, normalized_target_b, lam)
                else:
                    normalized_target = (target - target_mean) / target_std
                    loss = F.mse_loss(normalized_output, normalized_target)
                
                # Добавляем штраф за гладкость прогноза
                if output.shape[1] > 1:
                    smoothness_penalty = torch.mean(torch.abs(output[:, 1:] - output[:, :-1]))
                    loss += 0.005 * smoothness_penalty
                
                # Добавляем штраф за неконсистентность наклона
                if output.shape[1] > 2:
                    slopes = output[:, 1:] - output[:, :-1]
                    slope_changes = slopes[:, 1:] - slopes[:, :-1]
                    curvature_penalty = torch.mean(torch.abs(slope_changes))
                    loss += 0.001 * curvature_penalty
            
            # Scale loss и обратное распространение
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
        else:
            # Стандартное обучение без AMP
            output = model(data)
            
            target_mean = target.mean(dim=0)
            target_std = target.std(dim=0) + 1e-6
            
            normalized_output = (output - target_mean) / target_std
            
            if config.get('mixup_alpha', 0) > 0:
                normalized_target_a = (targets_a - target_mean) / target_std
                normalized_target_b = (targets_b - target_mean) / target_std
                loss = mixup_criterion(F.mse_loss, normalized_output, normalized_target_a, normalized_target_b, lam)
            else:
                normalized_target = (target - target_mean) / target_std
                loss = F.mse_loss(normalized_output, normalized_target)
            
            if output.shape[1] > 1:
                smoothness_penalty = torch.mean(torch.abs(output[:, 1:] - output[:, :-1]))
                loss += 0.005 * smoothness_penalty
                
            if output.shape[1] > 2:
                slopes = output[:, 1:] - output[:, :-1]
                slope_changes = slopes[:, 1:] - slopes[:, :-1]
                curvature_penalty = torch.mean(torch.abs(slope_changes))
                loss += 0.001 * curvature_penalty
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
        
        train_losses.append(loss.item())
        pbar.set_postfix({'loss': f"{loss.item():.6f}"})
    
    return np.mean(train_losses)

# Реализация Stochastic Weight Averaging
class SWA:
    def __init__(self, model):
        self.model = model
        self.n_averaged = 0
        self.swa_model = copy.deepcopy(model)
        
        # Устанавливаем веса SWA модели в 0
        for param in self.swa_model.parameters():
            param.data.zero_()
    
    def update(self, model):
        self.n_averaged += 1
        for swa_param, param in zip(self.swa_model.parameters(), model.parameters()):
            swa_param.data.mul_(1.0 - 1.0 / self.n_averaged)
            swa_param.data.add_(param.data / self.n_averaged)
    
    def get_model(self):
        return self.swa_model

# Усовершенствованная функция обучения k-fold
def train_kfold_model(X_features, y, X_features_test, n_folds=5):
    print(f"\n{'='*50}\nTraining {n_folds}-fold HybridCNNLSTM\n{'='*50}")
    
    config = get_optimal_config()
    print(f"Config: {config}")
    
    # Инициализируем wandb
    wandb_run = init_wandb(project_name="geology-forecast-challenge-improved", config=config)
    
    # Подготовка для K-fold
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=config['seed'])
    fold_preds_val = np.zeros((X_features.shape[0], y.shape[1]))
    fold_preds_test = np.zeros((X_features_test.shape[0], y.shape[1], n_folds))
    oof_scores = []
    fold_models = []
    
    # Главный цикл K-fold
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_features)):
        print(f"\n{'='*30}\nFold {fold+1}/{n_folds}\n{'='*30}")
        
        # Разделение данных
        X_train_fold, X_val_fold = X_features[train_idx], X_features[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]
        
        # Создание датасетов и загрузчиков
        train_dataset = GeologyDataset(X_train_fold, y_train_fold, scale_features=True)
        val_dataset = GeologyDataset(X_val_fold, y_val_fold, scale_features=True)
        test_dataset = GeologyDataset(X_features_test, is_test=True, scale_features=True)
        
        train_loader = DataLoader(
            train_dataset, 
            batch_size=config['batch_size'], 
            shuffle=True,
            pin_memory=True, 
            num_workers=0,
            drop_last=True
        )
        
        val_loader = DataLoader(
            val_dataset, 
            batch_size=config['batch_size'] * 2, 
            shuffle=False,
            pin_memory=True,
            num_workers=0
        )
        
        test_loader = DataLoader(
            test_dataset, 
            batch_size=config['batch_size'] * 2, 
            shuffle=False,
            pin_memory=True,
            num_workers=0
        )
        
        # Создание модели
        model = create_model(config, device)
        
        # Инициализация оптимизатора
        if config.get('optimizer', 'ranger').lower() == 'ranger':
            optimizer = Ranger(
                model.parameters(),
                lr=config['learning_rate'],
                weight_decay=config['weight_decay']
            )
        elif config.get('optimizer', 'ranger').lower() == 'adamw':
            optimizer = optim.AdamW(
                model.parameters(),
                lr=config['learning_rate'],
                weight_decay=config['weight_decay'],
                eps=1e-8
            )
        else:
            optimizer = optim.Adam(
                model.parameters(),
                lr=config['learning_rate'],
                weight_decay=config['weight_decay'],
                eps=1e-8
            )
        
        # Инициализация планировщика
        if config.get('scheduler', 'cosinewr').lower() == 'cosinewr':
            scheduler = CosineAnnealingWarmRestarts(
                optimizer,
                T_0=5, 
                T_mult=2,
                eta_min=config['learning_rate'] / 100
            )
        elif config.get('scheduler', 'cosinewr').lower() == 'onecycle':
            steps_per_epoch = len(train_loader)
            scheduler = optim.lr_scheduler.OneCycleLR(
                optimizer,
                max_lr=config['learning_rate'],
                epochs=config['epochs'],
                steps_per_epoch=steps_per_epoch,
                pct_start=0.3,
                div_factor=25,
                final_div_factor=1000,
            )
        else:
            scheduler = optim.lr_scheduler.CosineAnnealingLR(
                optimizer, 
                T_max=config['epochs'],
                eta_min=config['learning_rate'] / 100
            )
        
        # Инициализация SWA
        swa_model = None
        if config.get('use_swa', False):
            swa_model = SWA(model)
        
        # Инициализация scaler для AMP
        scaler = amp.GradScaler() if config.get('use_amp', False) else None
        
        # Параметры для раннего останова
        best_val_loss = float('inf')
        patience = config.get('early_stopping_patience', 15)
        counter = 0
        min_delta = 1e-4
        best_val_preds = None
        best_epoch = 0
        
        # Обучение модели
        for epoch in range(config['epochs']):
            start_time = timeit.default_timer()
            
            # Обучение эпохи
            train_loss = train_model_with_advanced_techniques(
                model, train_loader, optimizer, device, scaler, config, epoch, config['epochs']
            )
            
            # Обновление планировщика
            if config.get('scheduler', 'cosinewr').lower() != 'reduce':
                scheduler.step()
            
            # Валидация
            val_loss, val_preds, val_targets = validate_model(model, val_loader, device)
            
            # Обновление лучших весов
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_val_preds = val_preds
                best_epoch = epoch
                
                model_path = f"model_fold_{fold}.pt"
                torch.save(model.state_dict(), model_path)
                
                if wandb_run:
                    wandb_run.save(model_path)
                    
                print(f"✅ Saved new best model with validation loss: {val_loss:.6f}")
                counter = 0
            else:
                counter += 1
                
            # Обновление SWA
            if config.get('use_swa', False) and epoch >= config.get('swa_start', 50):
                swa_model.update(model)
            
            # Вычисление времени эпохи
            elapsed = timeit.default_timer() - start_time
            
            # Логирование метрик
            if wandb_run:
                wandb_run.log({
                    f"fold_{fold}_epoch": epoch + 1,
                    f"fold_{fold}_train_loss": train_loss,
                    f"fold_{fold}_val_loss": val_loss,
                    f"fold_{fold}_learning_rate": optimizer.param_groups[0]['lr'],
                    f"fold_{fold}_epoch_time": elapsed
                })
            
            print(f"Epoch {epoch+1}/{config['epochs']} - Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}, LR: {optimizer.param_groups[0]['lr']:.8f}, Time: {elapsed:.2f}s")
            
            # Проверка раннего останова
            if counter >= patience:
                print(f"Early stopping at epoch {epoch+1}, best val_loss: {best_val_loss:.6f}")
                break
        
        # Загрузка лучшей модели для предсказаний
        print(f"Loading best model from epoch {best_epoch+1}")
        model.load_state_dict(torch.load(f"model_fold_{fold}.pt"))
        
        # Если использовали SWA, проверяем производительность SWA модели
        if config.get('use_swa', False) and best_epoch >= config.get('swa_start', 50):
            swa_model_copy = swa_model.get_model()
            swa_val_loss, swa_val_preds, _ = validate_model(swa_model_copy, val_loader, device)
            
            print(f"SWA Model Val Loss: {swa_val_loss:.6f} vs Best Model Val Loss: {best_val_loss:.6f}")
            
            # Если SWA модель лучше, используем её
            if swa_val_loss < best_val_loss:
                print("Using SWA model for predictions")
                model = swa_model_copy
                best_val_preds = swa_val_preds
                best_val_loss = swa_val_loss
        
        # Сохраняем предсказания валидации
        fold_preds_val[val_idx] = best_val_preds
        
        # Создаем предсказания для тестовых данных
        model.eval()
        test_preds = []
        
        with torch.no_grad():
            for data in tqdm(test_loader, desc="Predicting test data"):
                if isinstance(data, list):
                    data = data[0]
                data = data.to(device, dtype=torch.float32)
                output = model(data)
                test_preds.append(output.cpu().numpy())
        
        test_fold_preds = np.concatenate(test_preds)
        fold_preds_test[:, :, fold] = test_fold_preds
        
        # Вычисление NLL метрики для фолда
        val_preds_df = pd.DataFrame(
            data=best_val_preds,
            columns=TARGETS,
        )
        val_preds_df['geology_id'] = train.iloc[val_idx]['geology_id'].values
        
        val_solution_df = pd.DataFrame(
            data=y_val_fold,
            columns=TARGETS,
        )
        val_solution_df['geology_id'] = train.iloc[val_idx]['geology_id'].values
        
        fold_nll_score = compute_nll_score(val_solution_df, val_preds_df)
        oof_scores.append(fold_nll_score)
        
        print(f"Fold {fold+1} NLL Score: {fold_nll_score:.6f}")
        
        # Сохраняем модель для ансамбля
        fold_models.append(model)
        
        # Очистка после каждого фолда
        torch.cuda.empty_cache()
        import gc
        gc.collect()
    
    # Вычисление среднего OOF скора
    mean_oof_score = np.mean(oof_scores)
    print(f"\nMean OOF NLL Score: {mean_oof_score:.6f}")
    
    if wandb_run:
        wandb_run.log({"mean_oof_nll_score": mean_oof_score})
        for i, score in enumerate(oof_scores):
            wandb_run.log({f"fold_{i}_nll_score": score})
    
    # Усреднение предсказаний по всем фолдам
    test_predictions = np.mean(fold_preds_test, axis=2)
    
    # Создание сабмишна
    submission = create_optimized_submission(test_predictions, mean_oof_score)
    
    # Визуализация результатов
    visualize_predictions(submission, 0, 3)
    analyze_geological_patterns(submission)
    calculate_prediction_uncertainty(submission)
    
    if wandb_run:
        wandb_run.log({"final_submission": wandb.Table(dataframe=submission.head())})
        wandb_run.finish()
    
    return submission, mean_oof_score, fold_preds_val, fold_models

# Оптимизированное создание разнообразных реализаций
def generate_optimized_realizations(base_predictions, num_samples=10):
    num_rows, num_cols = base_predictions.shape
    realizations = np.zeros((num_samples, num_rows, num_cols))
    
    # Первая реализация - базовая
    realizations[0] = base_predictions
    
    # Параметры шума, зависящие от позиции
    position_factors = np.linspace(0.1, 1.5, num_cols) ** 1.5
    
    # Создаем оптимизированный шум для каждой реализации
    for i in range(1, num_samples):
        realization = base_predictions.copy()
        
        # Добавляем разную степень случайности к разным реализациям
        diversity_factor = 1.3818770228728243
        
        for j in range(num_rows):
            # Создаем базовый шум
            noise = np.random.normal(0, diversity_factor, num_cols)
            
            # Сглаживаем шум с разными уровнями сглаживания
            smoothed_noise = gaussian_filter1d(noise, sigma=3.0 + 2.0 * (i % 3))
            
            # Масштабируем шум в зависимости от позиции
            scaled_noise = smoothed_noise * position_factors
            
            # Добавляем периодичность к некоторым реализациям
            if i % 3 == 0:
                periodic_component = np.sin(np.linspace(0, i * np.pi, num_cols)) * diversity_factor * 0.5
                scaled_noise += periodic_component
            
            # Применяем шум
            realization[j] += scaled_noise
            
            # Обеспечиваем геологическую реалистичность - ограничиваем резкие изменения
            for k in range(1, num_cols):
                # Ограничение максимального изменения, возрастающее с расстоянием
                max_change = 1.5 * (1 + k/num_cols)
                diff = realization[j, k] - realization[j, k-1]
                if abs(diff) > max_change:
                    realization[j, k] = realization[j, k-1] + np.sign(diff) * max_change
        
        realizations[i] = realization
    
    return realizations

# Оптимизированное создание сабмишна
def create_optimized_submission(base_predictions, val_nll_score):
    submission = sub.copy()
    
    # Заполняем основные предсказания
    for i in range(300):
        col_name = str(i+1)
        submission[col_name] = base_predictions[:, i]
    
    # Создаем разнообразные реализации
    realizations = generate_optimized_realizations(base_predictions, num_samples=10)
    
    # Заполняем колонки реализаций
    for r_idx in range(1, 10): 
        for i in range(300):
            col_name = f"r_{r_idx}_pos_{i+1}"
            submission[col_name] = realizations[r_idx][:, i]
    
    # Сохраняем сабмишн
    submission_file = f"submission_HybridCNNLSTM_kfold_{val_nll_score:.6f}.csv"
    submission.to_csv(submission_file, index=False)
    print(f"\nSubmission file saved: {submission_file}")
    
    # Проверяем корректность формата
    expected_cols = sub.columns.tolist()
    actual_cols = submission.columns.tolist()
    
    if set(expected_cols) != set(actual_cols):
        print("WARNING: Submission columns don't match expected format!")
        missing = set(expected_cols) - set(actual_cols)
        extra = set(actual_cols) - set(expected_cols)
        if missing:
            print(f"Missing columns: {missing}")
        if extra:
            print(f"Extra columns: {extra}")
    else:
        print("Submission format validated successfully!")
    
    return submission

In [43]:
import copy
import multiprocessing
multiprocessing.set_start_method('spawn', force=True)

In [44]:
# train_kfold_model(X_features, y, X_features_test, n_folds=5)


Training 5-fold HybridCNNLSTM
Config: {'model_type': 'LSTM', 'cnn_filters': [64, 128, 128, 256], 'kernel_size': 5, 'hidden_size': 128, 'num_layers': 3, 'dropout': 0.1074051389697947, 'learning_rate': 0.001588625112561725, 'weight_decay': 1e-05, 'batch_size': 128, 'epochs': 100, 'seed': 42, 'feature_engineering': 'advanced', 'optimizer': 'ranger', 'scheduler': 'cosinewr', 'early_stopping_patience': 15, 'use_amp': True, 'use_swa': True, 'swa_start': 50, 'mixup_alpha': 0.2}

Fold 1/5


Epoch 1/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 1.150023
Epoch 1/100 - Train Loss: 1.180127, Val Loss: 1.150023, LR: 0.00143844, Time: 0.43s


Epoch 2/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 1.074035
Epoch 2/100 - Train Loss: 1.095573, Val Loss: 1.074035, LR: 0.00104526, Time: 0.21s


Epoch 3/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 1.062486
Epoch 3/100 - Train Loss: 1.046933, Val Loss: 1.062486, LR: 0.00055925, Time: 0.20s


Epoch 4/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 1.054982
Epoch 4/100 - Train Loss: 1.034230, Val Loss: 1.054982, LR: 0.00016607, Time: 0.19s


Epoch 5/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 1.052386
Epoch 5/100 - Train Loss: 1.032061, Val Loss: 1.052386, LR: 0.00158863, Time: 0.19s


Epoch 6/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 1.047721
Epoch 6/100 - Train Loss: 1.025307, Val Loss: 1.047721, LR: 0.00155014, Time: 0.20s


Epoch 7/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 1.031776
Epoch 7/100 - Train Loss: 1.021996, Val Loss: 1.031776, LR: 0.00143844, Time: 0.19s


Epoch 8/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 1.009884
Epoch 8/100 - Train Loss: 1.002740, Val Loss: 1.009884, LR: 0.00126447, Time: 0.20s


Epoch 9/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.968736
Epoch 9/100 - Train Loss: 0.973686, Val Loss: 0.968736, LR: 0.00104526, Time: 0.20s


Epoch 10/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.916472
Epoch 10/100 - Train Loss: 0.949311, Val Loss: 0.916472, LR: 0.00080226, Time: 0.20s


Epoch 11/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.871861
Epoch 11/100 - Train Loss: 0.906015, Val Loss: 0.871861, LR: 0.00055925, Time: 0.20s


Epoch 12/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.835332
Epoch 12/100 - Train Loss: 0.877527, Val Loss: 0.835332, LR: 0.00034004, Time: 0.20s


Epoch 13/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.813328
Epoch 13/100 - Train Loss: 0.855980, Val Loss: 0.813328, LR: 0.00016607, Time: 0.21s


Epoch 14/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.802323
Epoch 14/100 - Train Loss: 0.852071, Val Loss: 0.802323, LR: 0.00005437, Time: 0.20s


Epoch 15/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.798511
Epoch 15/100 - Train Loss: 0.831454, Val Loss: 0.798511, LR: 0.00158863, Time: 0.20s


Epoch 16/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.682341
Epoch 16/100 - Train Loss: 0.760067, Val Loss: 0.682341, LR: 0.00157894, Time: 0.20s


Epoch 17/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.590923
Epoch 17/100 - Train Loss: 0.705493, Val Loss: 0.590923, LR: 0.00155014, Time: 0.20s


Epoch 18/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.513057
Epoch 18/100 - Train Loss: 0.686112, Val Loss: 0.513057, LR: 0.00150292, Time: 0.20s


Epoch 19/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.435547
Epoch 19/100 - Train Loss: 0.540475, Val Loss: 0.435547, LR: 0.00143844, Time: 0.19s


Epoch 20/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.392738
Epoch 20/100 - Train Loss: 0.525653, Val Loss: 0.392738, LR: 0.00135830, Time: 0.19s


Epoch 21/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.347056
Epoch 21/100 - Train Loss: 0.441865, Val Loss: 0.347056, LR: 0.00126447, Time: 0.19s


Epoch 22/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.304685
Epoch 22/100 - Train Loss: 0.443825, Val Loss: 0.304685, LR: 0.00115926, Time: 0.19s


Epoch 23/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.289234
Epoch 23/100 - Train Loss: 0.377627, Val Loss: 0.289234, LR: 0.00104526, Time: 0.20s


Epoch 24/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.268708
Epoch 24/100 - Train Loss: 0.390614, Val Loss: 0.268708, LR: 0.00092527, Time: 0.21s


Epoch 25/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.256774
Epoch 25/100 - Train Loss: 0.365791, Val Loss: 0.256774, LR: 0.00080226, Time: 0.19s


Epoch 26/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.240503
Epoch 26/100 - Train Loss: 0.320772, Val Loss: 0.240503, LR: 0.00067924, Time: 0.19s


Epoch 27/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.235013
Epoch 27/100 - Train Loss: 0.335012, Val Loss: 0.235013, LR: 0.00055925, Time: 0.20s


Epoch 28/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.233199
Epoch 28/100 - Train Loss: 0.364776, Val Loss: 0.233199, LR: 0.00044525, Time: 0.19s


Epoch 29/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.224753
Epoch 29/100 - Train Loss: 0.324386, Val Loss: 0.224753, LR: 0.00034004, Time: 0.20s


Epoch 30/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 30/100 - Train Loss: 0.382484, Val Loss: 0.226647, LR: 0.00024621, Time: 0.19s


Epoch 31/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.222793
Epoch 31/100 - Train Loss: 0.323997, Val Loss: 0.222793, LR: 0.00016607, Time: 0.20s


Epoch 32/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.220525
Epoch 32/100 - Train Loss: 0.300276, Val Loss: 0.220525, LR: 0.00010160, Time: 0.20s


Epoch 33/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.219237
Epoch 33/100 - Train Loss: 0.374160, Val Loss: 0.219237, LR: 0.00005437, Time: 0.20s


Epoch 34/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 34/100 - Train Loss: 0.382908, Val Loss: 0.219927, LR: 0.00002557, Time: 0.18s


Epoch 35/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 35/100 - Train Loss: 0.355369, Val Loss: 0.219957, LR: 0.00158863, Time: 0.18s


Epoch 36/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.216451
Epoch 36/100 - Train Loss: 0.346091, Val Loss: 0.216451, LR: 0.00158620, Time: 0.20s


Epoch 37/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.203342
Epoch 37/100 - Train Loss: 0.399538, Val Loss: 0.203342, LR: 0.00157894, Time: 0.20s


Epoch 38/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 38/100 - Train Loss: 0.359113, Val Loss: 0.205732, LR: 0.00156690, Time: 0.18s


Epoch 39/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 39/100 - Train Loss: 0.322984, Val Loss: 0.204094, LR: 0.00155014, Time: 0.19s


Epoch 40/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.193306
Epoch 40/100 - Train Loss: 0.357368, Val Loss: 0.193306, LR: 0.00152877, Time: 0.20s


Epoch 41/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.185716
Epoch 41/100 - Train Loss: 0.327007, Val Loss: 0.185716, LR: 0.00150292, Time: 0.20s


Epoch 42/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 42/100 - Train Loss: 0.406792, Val Loss: 0.200083, LR: 0.00147275, Time: 0.18s


Epoch 43/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 43/100 - Train Loss: 0.335336, Val Loss: 0.193819, LR: 0.00143844, Time: 0.18s


Epoch 44/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 44/100 - Train Loss: 0.263321, Val Loss: 0.190116, LR: 0.00140022, Time: 0.18s


Epoch 45/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.185577
Epoch 45/100 - Train Loss: 0.277936, Val Loss: 0.185577, LR: 0.00135830, Time: 0.20s


Epoch 46/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.182991
Epoch 46/100 - Train Loss: 0.323251, Val Loss: 0.182991, LR: 0.00131296, Time: 0.20s


Epoch 47/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 47/100 - Train Loss: 0.386490, Val Loss: 0.196708, LR: 0.00126447, Time: 0.17s


Epoch 48/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.181884
Epoch 48/100 - Train Loss: 0.258940, Val Loss: 0.181884, LR: 0.00121313, Time: 0.19s


Epoch 49/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.180256
Epoch 49/100 - Train Loss: 0.299658, Val Loss: 0.180256, LR: 0.00115926, Time: 0.19s


Epoch 50/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 50/100 - Train Loss: 0.339309, Val Loss: 0.182858, LR: 0.00110319, Time: 0.17s


Epoch 51/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Saved new best model with validation loss: 0.177386
Epoch 51/100 - Train Loss: 0.381950, Val Loss: 0.177386, LR: 0.00104526, Time: 0.20s


Epoch 52/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 52/100 - Train Loss: 0.223389, Val Loss: 0.178215, LR: 0.00098583, Time: 0.18s


Epoch 53/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 53/100 - Train Loss: 0.251596, Val Loss: 0.184537, LR: 0.00092527, Time: 0.18s


Epoch 54/100:   0%|          | 0/9 [00:00<?, ?it/s]

Validating:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [48]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.0


In [55]:
# Импорт необходимых библиотек
import copy
import numpy as np
import pandas as pd
import timeit
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import torch  # Для проверки доступности GPU
import wandb  # Если используется Wandb для логирования

# Константы
SEED = 42
TARGETS = [f'target_{i}' for i in range(1, 6)]  # Пример названий целевых переменных

# Функция создания модели XGBoost
def create_xgboost_model(config):
    """Создание модели XGBoost с MultiOutputRegressor"""
    xgb_params = {
        'n_estimators': config.get('n_estimators', 1000),
        'max_depth': config.get('max_depth', 8),
        'learning_rate': config.get('learning_rate', 0.01),
        'subsample': config.get('subsample', 0.8),
        'colsample_bytree': config.get('colsample_bytree', 0.8),
        'reg_alpha': config.get('reg_alpha', 0.01),
        'reg_lambda': config.get('reg_lambda', 1.0),
        'min_child_weight': config.get('min_child_weight', 3),
        'gamma': config.get('gamma', 0.0),
        'random_state': config.get('seed', SEED),
        'n_jobs': -1,
        'tree_method': 'gpu_hist' if torch.cuda.is_available() else 'hist',
        'enable_categorical': False,
    }
    
    base_model = xgb.XGBRegressor(**xgb_params)
    return MultiOutputRegressor(base_model)

# Функция обучения модели
def train_xgboost_model(model, X_train, y_train, X_val=None, y_val=None, config=None):
    """Обучение модели с поддержкой ранней остановки"""
    # Преобразование данных в numpy arrays
    X_train = np.asarray(X_train, dtype=np.float32)
    y_train = np.asarray(y_train, dtype=np.float32)
    
    eval_set = None
    if X_val is not None and y_val is not None:
        X_val = np.asarray(X_val, dtype=np.float32)
        y_val = np.asarray(y_val, dtype=np.float32)
        eval_set = [(X_val, y_val)]

    # Параметры обучения
    fit_params = {
        'eval_set': eval_set,
        'early_stopping_rounds': config.get('early_stopping_patience', 20) if eval_set else None,
        'verbose': False
    }

    # Обучение модели
    model.fit(X_train, y_train, **fit_params)
    return model

# Функция предсказания
def predict_with_xgboost(model, X):
    """Предсказание для всех целевых переменных"""
    return model.predict(np.asarray(X, dtype=np.float32))

# Функция K-fold обучения
def train_kfold_xgboost_model(X_features, y, X_features_test, n_folds=5):
    """Основная функция для кросс-валидации"""
    print(f"\n{'='*50}\nTraining {n_folds}-fold XGBoost\n{'='*50}")
    
    config = get_optimal_xgboost_config()
    print(f"Config: {config}")
    
    # Инициализация Wandb
    wandb_run = init_wandb(project_name="geology-forecast-xgboost", config=config) if wandb else None
    
    # Подготовка структур данных
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=config['seed'])
    fold_preds_val = np.zeros((X_features.shape[0], y.shape[1]))
    fold_preds_test = np.zeros((X_features_test.shape[0], y.shape[1], n_folds))
    oof_scores = []
    models = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X_features)):
        print(f"\n{'='*30}\nFold {fold+1}/{n_folds}\n{'='*30}")
        
        # Разделение данных
        X_train_fold, X_val_fold = X_features[train_idx], X_features[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Масштабирование
        scaler = StandardScaler()
        X_train_fold = scaler.fit_transform(X_train_fold)
        X_val_fold = scaler.transform(X_val_fold)
        X_test_scaled = scaler.transform(X_features_test)

        # Создание и обучение модели
        model = create_xgboost_model(config)
        
        start_time = timeit.default_timer()
        model = train_xgboost_model(
            model, 
            X_train_fold, 
            y_train_fold,
            X_val_fold,
            y_val_fold,
            config
        )
        elapsed = timeit.default_timer() - start_time
        
        print(f"Training completed in {elapsed:.2f} seconds")

        # Предсказания
        val_preds = predict_with_xgboost(model, X_val_fold)
        fold_preds_val[val_idx] = val_preds
        
        test_preds = predict_with_xgboost(model, X_test_scaled)
        fold_preds_test[:, :, fold] = test_preds

        # Вычисление метрики
        fold_nll_score = compute_nll_score(y_val_fold, val_preds)  # Предполагаем существование этой функции
        oof_scores.append(fold_nll_score)
        print(f"Fold {fold+1} NLL Score: {fold_nll_score:.6f}")

        # Сохранение модели и логирование
        models.append(copy.deepcopy(model))
        if wandb_run:
            wandb_run.log({
                f"fold_{fold+1}_nll": fold_nll_score,
                f"fold_{fold+1}_time": elapsed
            })

        # Очистка памяти
        del model
        import gc
        gc.collect()

    # Финализация результатов
    mean_oof_score = np.mean(oof_scores)
    print(f"\nMean OOF NLL Score: {mean_oof_score:.6f}")
    
    # Усреднение предсказаний
    test_predictions = np.mean(fold_preds_test, axis=2)
    
    # Создание сабмишна
    submission = create_optimized_submission(test_predictions)  # Предполагаем существование этой функции
    
    if wandb_run:
        wandb_run.log({"final_score": mean_oof_score})
        wandb_run.finish()

    return submission, mean_oof_score, fold_preds_val, models

# Вспомогательные функции
def get_optimal_xgboost_config():
    """Конфигурация гиперпараметров"""
    return {
        'model_type': 'XGBoost',
        'n_estimators': 1000,
        'max_depth': 8,
        'learning_rate': 0.01,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'min_child_weight': 3,
        'reg_alpha': 0.01,
        'reg_lambda': 1.0,
        'gamma': 0,
        'seed': SEED,
        'early_stopping_patience': 20,
        'feature_engineering': 'advanced',
    }


# Функция создания признаков (пример)
def create_additional_xgboost_features(data, is_test=False):
    """Создание расширенных признаков"""
    features = pd.DataFrame()
    
    # Базовые признаки
    historical_cols = [col for col in data.columns if col.replace('-', '').isdigit()]
    if historical_cols:
        historical_data = data[historical_cols].values.astype(np.float32)
        
        # Статистики
        features['mean'] = np.mean(historical_data, axis=1)
        features['std'] = np.std(historical_data, axis=1)
        features['max'] = np.max(historical_data, axis=1)
        features['min'] = np.min(historical_data, axis=1)
        
        # Тренды
        x = np.arange(historical_data.shape[1])
        slopes = []
        for row in historical_data:
            coeffs = np.polyfit(x, row, 1)
            slopes.append(coeffs[0])
        features['trend_slope'] = slopes
    
    return features