In [1]:
!pip install /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
!pip install mordred --no-index --find-links=file:///kaggle/input/mordred-1-2-0-py3-none-any/

Processing /kaggle/input/rdkit-2025-3-3-cp311/rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl
Installing collected packages: rdkit
Successfully installed rdkit-2025.3.3
Looking in links: file:///kaggle/input/mordred-1-2-0-py3-none-any/
Processing /kaggle/input/mordred-1-2-0-py3-none-any/mordred-1.2.0-py3-none-any.whl
Processing /kaggle/input/mordred-1-2-0-py3-none-any/networkx-2.8.8-py3-none-any.whl (from mordred)
Installing collected packages: networkx, mordred
  Attempting uninstall: networkx
    Found existing installation: networkx 3.5
    Uninstalling networkx-3.5:
      Successfully uninstalled networkx-3.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
scikit-image 0.25.2 requires networkx>=3.0, but you have networkx 2.8.8 which is incompatible.
nx-cugraph-cu12 25.2.0 requires networkx>=3.2, but you have networkx 2.8.8 which is in

In [2]:
# train.py - Corrected and Refactored

import pandas as pd
import numpy as np
import gc
import warnings
import os
import random
import torch

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer

import xgboost as xgb
import lightgbm as lgb

from rdkit import Chem
from mordred import Calculator, descriptors


# --- Global Settings ---
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

class CFG:
    """Configuration class for hyperparameters and settings."""
    N_SPLITS = 5
    SEEDS = [42] # Using more seeds for robustness provides better generalization
    TARGET_COLS = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']
    
    # batch
    EPOCHS = 160
    BATCH_SIZE = 256
    LR = 1e-3
    WEIGHT_DECAY = 1e-5
    EARLY_STOPPING_PATIENCE = 40
    HIDDEN_DIM = 512
    N_BLOCKS = 2
    DROPOUT_RATE = 0.3
    
    # Feature Selection
    K_BEST_FEATURES = 450
    CORR_THRESHOLD = 0.98

def set_seed(seed):
    """Sets the seed for reproducibility across all libraries."""
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True

print("✅ Configuration and seeding function defined.")

# --- 1. Load Data ---
print("Loading data...")
# Assume data is in a standard input directory
DATA_DIR = '/kaggle/input/neurips-open-polymer-prediction-2025' # Change this to your data path, e.g., '/kaggle/input/neurips-open-polymer-prediction-2025/'
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission_df = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

# --- 2. Feature Engineering Function ---
def generate_mordred_descriptors(smiles_series, name):
    """Calculates and saves Mordred descriptors."""
    print(f"Generating Mordred descriptors for {len(smiles_series)} molecules ({name})...")
    
    # RDKit molecule conversion
    mols = [Chem.MolFromSmiles(s) for s in smiles_series]
    
    # Initialize Mordred calculator
    calc = Calculator(descriptors, ignore_3D=True)
    
    # Use all available CPU cores for calculation
    # The .pandas() method shows the progress bar you are seeing
    df_desc = calc.pandas(mols, nproc=os.cpu_count())
    
    # Post-processing: ensure numeric, fill NaNs, remove constant columns
    df_desc = df_desc.apply(pd.to_numeric, errors='coerce').fillna(0)
    df_desc = df_desc.loc[:, df_desc.nunique() > 1] # Important: remove zero-variance features
    
    print(f"Generated {df_desc.shape[1]} descriptors for {name}.")
    return df_desc

# --- 3. Generate and Save Features ---
train_features_df = pd.read_csv("/kaggle/input/polymer-dataset-525/train_features.csv") #generate_mordred_descriptors(train_df['SMILES'], 'train')
test_features_df = generate_mordred_descriptors(test_df['SMILES'], 'test')

# Align columns after generation - crucial for consistent model input
train_cols = set(train_features_df.columns)
test_cols = set(test_features_df.columns)
shared_cols = sorted(list(train_cols.intersection(test_cols)))

train_features_df = train_features_df[shared_cols]
test_features_df = test_features_df[shared_cols]


train_full_df = pd.concat([train_df, train_features_df], axis=1)
test_full_df = pd.concat([test_df, test_features_df], axis=1)

print(f"✅ Features generated. Train shape: {train_full_df.shape}, Test shape: {test_full_df.shape}")


# --- 3. Feature Selection & Target Transformation ---
def select_features_in_fold(Xtr_df, ytr, k, corr_th):
    """Performs supervised feature selection using ONLY fold training data."""
    if Xtr_df.shape[1] <= k:
        return Xtr_df.columns.tolist()

    sel_f = SelectKBest(f_regression, k=min(k, Xtr_df.shape[1] - 1)).fit(Xtr_df, ytr)
    selected_cols_kbest = Xtr_df.columns[sel_f.get_support()]
    Xtr_df_selected = Xtr_df[selected_cols_kbest]

    corr = Xtr_df_selected.corr().abs()
    f_vals, _ = f_regression(Xtr_df_selected, ytr)
    strength = pd.Series(f_vals, index=Xtr_df_selected.columns).fillna(0.0)

    ordered_features = strength.sort_values(ascending=False).index
    kept_features = []
    for feature in ordered_features:
        if not kept_features:
            kept_features.append(feature)
            continue
        # Check correlation against already kept features
        if not (corr.loc[feature, kept_features] > corr_th).any():
            kept_features.append(feature)

    return kept_features

def get_transforms(y, target):
    """Applies target transformation for better model training."""
    if target == "FFV":
        eps = 1e-4
        y_clipped = np.clip(y, eps, 1 - eps)
        transform = lambda x: np.log(x / (1 - x)) # Logit transform
        inverse = lambda z: 1.0 / (1.0 + np.exp(-z))
        return transform(y_clipped), inverse
    if target == "Density":
        transform = lambda x: np.log(np.clip(x, 1e-4, None)) # Log transform
        inverse = lambda x: np.exp(x)
        return transform(y), inverse
    return y, lambda z: z # No transform for other targets

print("✅ Feature selection and transformation functions defined.")


def run_xgb_pipeline(train_data, test_data, target, random_state):
    print(f"  Target: {target} | Model: XGBoost")
    set_seed(random_state)

    train_filtered = train_data[train_data[target].notna()].copy()
    y_raw = train_filtered[target].astype(np.float32).values
    
    # --- CORRECTED: Use shared feature columns ---
    feat_cols = shared_cols
    X_df = train_filtered[feat_cols]
    X_test_df = test_data[feat_cols]

    bins = pd.qcut(y_raw, q=10, labels=False, duplicates='drop')
    splitter = StratifiedKFold(n_splits=CFG.N_SPLITS, shuffle=True, random_state=random_state)

    test_preds = []
    fold_maes = []

    for fold, (tr_idx, va_idx) in enumerate(splitter.split(X_df, bins), 1):
        print(f"    Fold {fold}/{CFG.N_SPLITS}")
        Xtr_df, Xva_df = X_df.iloc[tr_idx], X_df.iloc[va_idx]
        ytr_raw, yva_raw = y_raw[tr_idx], y_raw[va_idx]

        imputer = SimpleImputer(strategy='median')
        Xtr_df = pd.DataFrame(imputer.fit_transform(Xtr_df), columns=Xtr_df.columns)
        Xva_df = pd.DataFrame(imputer.transform(Xva_df), columns=Xva_df.columns)
        X_test_fold_df = pd.DataFrame(imputer.transform(X_test_df), columns=X_test_df.columns)
        
        selected_cols = select_features_in_fold(Xtr_df, ytr_raw, CFG.K_BEST_FEATURES, CFG.CORR_THRESHOLD)
        Xtr, Xva, X_test_fold = Xtr_df[selected_cols].values, Xva_df[selected_cols].values, X_test_fold_df[selected_cols].values
        
        model = xgb.XGBRegressor(
            random_state=random_state, objective='reg:absoluteerror', tree_method='hist',
            n_estimators=2500, learning_rate=0.015, max_depth=6, subsample=0.8,
            colsample_bytree=0.8, reg_alpha=0.1, reg_lambda=1.0, early_stopping_rounds=150
        )
        model.fit(Xtr, ytr_raw, eval_set=[(Xva, yva_raw)], verbose=False)

        val_preds = model.predict(Xva)
        fold_test_preds = model.predict(X_test_fold)

        ytr_min, ytr_max = np.percentile(ytr_raw, [0.5, 99.5])
        test_preds.append(np.clip(fold_test_preds, ytr_min, ytr_max))
        
        fold_mae = mean_absolute_error(yva_raw, val_preds)
        fold_maes.append(fold_mae)
        print(f"      Fold {fold} MAE: {fold_mae:.5f}")

    print(f"    -> Average CV MAE for {target}: {np.mean(fold_maes):.5f} (+/- {np.std(fold_maes):.5f})")
    return np.mean(test_preds, axis=0)

def run_lgbm_pipeline(train_data, test_data, target, random_state):
    print(f"  Target: {target} | Model: LightGBM")
    set_seed(random_state)

    train_filtered = train_data[train_data[target].notna()].copy()
    y_raw = train_filtered[target].astype(np.float32).values
    
    # Use all feature columns
    feat_cols = [col for col in train_data.columns if col not in ['id', 'SMILES'] + CFG.TARGET_COLS]
    X_df = train_filtered[feat_cols]
    X_test_df = test_data[feat_cols]

    bins = pd.qcut(y_raw, q=10, labels=False, duplicates='drop')
    splitter = StratifiedKFold(n_splits=CFG.N_SPLITS, shuffle=True, random_state=random_state)

    test_preds = []
    fold_maes = []

    for fold, (tr_idx, va_idx) in enumerate(splitter.split(X_df, bins), 1):
        print(f"    Fold {fold}/{CFG.N_SPLITS}")
        Xtr_df, Xva_df = X_df.iloc[tr_idx], X_df.iloc[va_idx]
        ytr_raw, yva_raw = y_raw[tr_idx], y_raw[va_idx]

        imputer = SimpleImputer(strategy='median')
        Xtr = imputer.fit_transform(Xtr_df)
        Xva = imputer.transform(Xva_df)
        X_test_fold = imputer.transform(X_test_df)

        model = lgb.LGBMRegressor(
            random_state=random_state,
            objective='mae',
            metric='mae',
            n_estimators=2000,
            learning_rate=0.01,
            num_leaves=31,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            n_jobs=-1,
            verbose=-1
        )
        
        model.fit(Xtr, ytr_raw,
                  eval_set=[(Xva, yva_raw)],
                  callbacks=[lgb.early_stopping(100, verbose=False)])

        val_preds = model.predict(Xva)
        fold_test_preds = model.predict(X_test_fold)

        ytr_min, ytr_max = np.percentile(ytr_raw, [0.5, 99.5])
        test_preds.append(np.clip(fold_test_preds, ytr_min, ytr_max))
        
        fold_mae = mean_absolute_error(yva_raw, val_preds)
        fold_maes.append(fold_mae)
        print(f"      Fold {fold} MAE: {fold_mae:.5f}")

    print(f"    -> Average CV MAE for {target}: {np.mean(fold_maes):.5f} (+/- {np.std(fold_maes):.5f})")
    return np.mean(test_preds, axis=0)


# --- 5. Main Execution ---
final_preds = {}
ID_test = test_df['id']

# --- DEFINE WHICH MODEL TO RUN FOR EACH TARGET ---
# You can change this to run different models, e.g., 'lgbm', 'catboost', 'xgb'
# This allows for easy experimentation. Let's run CatBoost for all to start.
MODEL_CHOICE = {
    'Tg': 'lgbm',
    'FFV': 'lgbm',
    'Tc': 'lgbm',
    'Density': 'lgbm',
    'Rg': 'lgbm',
}

# --- MODEL MAPPING ---
# This maps the string choice to the actual function to run
MODEL_FUNCTIONS = {
    'xgb': run_xgb_pipeline,
    'lgbm': run_lgbm_pipeline,
}


print("\n=== Initiating Training ===")
for target in CFG.TARGET_COLS:
    print(f"\n[{target.upper()}]")
    
    model_name = MODEL_CHOICE[target] 
    model_func = MODEL_FUNCTIONS[model_name]
    
    seed_preds = []
    for seed in CFG.SEEDS:
        print(f"\n--- Training with seed: {seed} ---")
        preds = model_func(train_full_df, test_full_df, target, random_state=seed)
        seed_preds.append(preds)
        gc.collect()

    # Average predictions across different seeds
    final_preds[target] = np.mean(seed_preds, axis=0)

print("\n✅ Training complete.")

# --- 6. Create Submission File ---
submission_df = pd.DataFrame({'id': ID_test})
for target in CFG.TARGET_COLS:
    # This ensures the correct predictions are assigned to the correct column
    submission_df[target] = final_preds[target]

submission_df.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created successfully.")
print(submission_df.head())

✅ Configuration and seeding function defined.
Loading data...
Generating Mordred descriptors for 3 molecules (test)...


100%|██████████| 3/3 [00:00<00:00,  5.51it/s]


Generated 525 descriptors for test.
✅ Features generated. Train shape: (7973, 532), Test shape: (3, 527)
✅ Feature selection and transformation functions defined.

=== Initiating Training ===

[TG]

--- Training with seed: 42 ---
  Target: Tg | Model: LightGBM
    Fold 1/5
      Fold 1 MAE: 42.81160
    Fold 2/5
      Fold 2 MAE: 45.96157
    Fold 3/5
      Fold 3 MAE: 50.22105
    Fold 4/5
      Fold 4 MAE: 48.98213
    Fold 5/5
      Fold 5 MAE: 49.62732
    -> Average CV MAE for Tg: 47.52074 (+/- 2.77388)

[FFV]

--- Training with seed: 42 ---
  Target: FFV | Model: LightGBM
    Fold 1/5
      Fold 1 MAE: 0.00698
    Fold 2/5
      Fold 2 MAE: 0.00696
    Fold 3/5
      Fold 3 MAE: 0.00700
    Fold 4/5
      Fold 4 MAE: 0.00616
    Fold 5/5
      Fold 5 MAE: 0.00695
    -> Average CV MAE for FFV: 0.00681 (+/- 0.00033)

[TC]

--- Training with seed: 42 ---
  Target: Tc | Model: LightGBM
    Fold 1/5
      Fold 1 MAE: 0.02516
    Fold 2/5
      Fold 2 MAE: 0.02796
    Fold 3/5
      F