# Advanced Melting Point Prediction (Ensemble + GPU)

## Strategy
1.  **SMILES-only**: Use only the SMILES string as input, so every feature is derived from molecular structure.
2.  **Enhanced Features**: RDKit Descriptors + Morgan Fingerprints (Radius 2, 1024 bits).
3.  **Ensemble Model**: Combine **CatBoost** and **XGBoost** predictions.
4.  **GPU Acceleration**: Enabled for both models.

## Prerequisites
**Run the cell below to install the necessary libraries:**

In [None]:
# INSTALL LIBRARIES
!pip install rdkit pingouin optuna catboost lightgbm xgboost openpyxl

In [None]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import pingouin as pg
import optuna
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, GraphDescriptors, rdFingerprintGenerator
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import os

# Display configuration: show up to 50 columns and allow output 120 characters wide.
for option_name, option_value in (
    ("display.max_columns", 50),
    ("display.width", 120),
):
    pd.set_option(option_name, option_value)

## 1. Load Data (Auto-Download)

In [None]:
# Check environment and load data
if os.path.exists('/kaggle/input/melting-point/train.csv'):
    # Running on Kaggle Kernel
    print("Detected Kaggle Kernel environment.")
    data_path = '/kaggle/input/melting-point/'
elif os.path.exists('train.csv'):
    # Local file exists
    print("Detected local dataset.")
    data_path = './'
else:
    # Try downloading via Kaggle API (Requires kaggle.json)
    print("Dataset not found. Attempting download via Kaggle API...")
    print("NOTE: You need to upload your 'kaggle.json' API token if on Colab.")
    
    try:
        !pip install -q kaggle
        !kaggle competitions download -c melting-point
        !unzip -o melting-point.zip
        data_path = './'
        print("Download successful.")
    except Exception as e:
        print("\nERROR: Could not download data automatically.")
        print("Please upload 'train.csv', 'test.csv', and 'sample_submission.csv' manually.")
        data_path = './'

# Load Data
try:
    df_train = pd.read_csv(f"{data_path}train.csv", sep=",")[['SMILES', 'Tm']]
    test_df = pd.read_csv(f"{data_path}test.csv")
    submission_df = pd.read_csv(f"{data_path}sample_submission.csv")
    print(f"\nLoaded Train shape: {df_train.shape}")
except FileNotFoundError:
    print("\nCRITICAL ERROR: Data files still not found. Please check your setup.")

## 2. Feature Engineering (Descriptors + Fingerprints)

In [None]:
# Lazily created Morgan fingerprint generator, shared by all calls.
_MORGAN_GENERATOR = None


def _get_morgan_generator():
    """Return the shared Morgan generator (radius 2, 1024 bits), creating it on first use."""
    global _MORGAN_GENERATOR
    if _MORGAN_GENERATOR is None:
        _MORGAN_GENERATOR = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
    return _MORGAN_GENERATOR


def get_mol_features(smiles):
    """Compute descriptor and fingerprint features for one SMILES string.

    Args:
        smiles: SMILES representation of a molecule.

    Returns:
        A dict mapping feature name to value: eleven RDKit descriptors plus
        one ``fp_<i>`` entry per Morgan fingerprint bit. Returns ``None`` when
        ``smiles`` is not a string (e.g. a NaN from a missing CSV cell) or
        when RDKit cannot parse it.
    """
    # Missing values arrive as float NaN / None; passing them to RDKit raises.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # 1. Extensive Descriptors
    features = {
        'MolWt': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
        'RingCount': Descriptors.RingCount(mol),
        'HeavyAtomCount': Descriptors.HeavyAtomCount(mol),
        'NumValenceElectrons': Descriptors.NumValenceElectrons(mol),
        'BertzCT': GraphDescriptors.BertzCT(mol),
        'HallKierAlpha': GraphDescriptors.HallKierAlpha(mol),
    }

    # 2. Morgan Fingerprints (Radius 2, 1024 bits) via the generator API.
    # The generator is reused across molecules instead of rebuilt per call.
    fp = _get_morgan_generator().GetFingerprint(mol)
    features.update({f'fp_{i}': bit for i, bit in enumerate(fp)})

    return features

# Build each feature table in a single DataFrame construction from the list of
# per-molecule dicts. The previous `.apply(pd.Series)` created one Series per
# row, which is very slow with ~1000 feature columns.
# Molecules that could not be featurized (None) become empty dicts, i.e. rows
# of NaN, exactly as before. Columns without missing values keep their native
# integer dtype instead of being upcast to float.
print("Extracting features for Train set (this may take a minute)...")
train_records = [get_mol_features(smi) or {} for smi in df_train['SMILES']]
train_features_df = pd.DataFrame(train_records, index=df_train.index)
train_full = pd.concat([df_train, train_features_df], axis=1)

print("Extracting features for Test set...")
test_records = [get_mol_features(smi) or {} for smi in test_df['SMILES']]
test_features_df = pd.DataFrame(test_records, index=test_df.index)
test_full = pd.concat([test_df, test_features_df], axis=1)

## 3. Model Training (Ensemble: CatBoost + XGBoost) with GPU

In [None]:
# Assemble model inputs: every column except the identifiers and the target.
# Missing feature values are replaced with 0.
features = [c for c in train_full.columns if c not in ['id', 'SMILES', 'Tm']]
X = train_full[features].fillna(0)
X_test = test_full[features].fillna(0)
y = train_full['Tm']

# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
n_folds = kf.get_n_splits()

# Accumulators: out-of-fold ensemble predictions, per-model test predictions
# (averaged over folds), and the per-fold validation MAE.
oof_preds = np.zeros(len(X))
test_preds_cb = np.zeros(len(X_test))
test_preds_xgb = np.zeros(len(X_test))
mae_scores = []

# Hyper-parameters are identical for every fold, so they are defined once.
catboost_params = dict(
    iterations=5000,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=5,
    loss_function='MAE',
    verbose=0,
    random_seed=42,
    task_type="GPU",  # train on GPU
    devices='0',      # first GPU
)
xgboost_params = dict(
    n_estimators=5000,
    learning_rate=0.03,
    max_depth=7,
    reg_alpha=1,
    reg_lambda=5,
    n_jobs=-1,
    random_state=42,
    early_stopping_rounds=200,
    tree_method='hist',  # 'hist' + device='cuda' replaces the older 'gpu_hist'
    device='cuda',
)

print("Starting Cross-Validation on GPU...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # CatBoost: fit with early stopping on the validation fold.
    cb = CatBoostRegressor(**catboost_params)
    cb.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=200)
    cb_val_pred = cb.predict(X_val)
    test_preds_cb += cb.predict(X_test) / n_folds

    # XGBoost: early stopping is configured through the constructor.
    xgb = XGBRegressor(**xgboost_params)
    xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    xgb_val_pred = xgb.predict(X_val)
    test_preds_xgb += xgb.predict(X_test) / n_folds

    # Equal-weight average of the two models' validation predictions.
    ensemble_val_pred = (cb_val_pred * 0.5) + (xgb_val_pred * 0.5)
    oof_preds[val_idx] = ensemble_val_pred

    mae = mean_absolute_error(y_val, ensemble_val_pred)
    mae_scores.append(mae)
    print(f"Fold {fold+1} MAE: {mae:.4f}")

print(f"Average MAE: {np.mean(mae_scores):.4f}")

In [None]:
# Blend the two models' fold-averaged test predictions with equal weights,
# store them in the submission frame's 'Tm' column, and write the CSV.
submission_path = 'submission_ensemble_gpu.csv'
final_test_preds = (test_preds_cb * 0.5) + (test_preds_xgb * 0.5)
submission_df['Tm'] = final_test_preds
submission_df.to_csv(submission_path, index=False)
print("Saved submission_ensemble_gpu.csv")