# NeurIPS Open Polymer Prediction 2025 - EDA

Exploratory data analysis (EDA) for the polymer property prediction competition.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global plotting defaults for every figure in this notebook.
# NOTE(review): the style sheet is applied before the seaborn palette; both
# presumably touch the default color cycle, so confirm before reordering.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython/Jupyter magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

## Load Data

In [None]:
# Read the competition CSVs shipped under ../data/raw
train_df = pd.read_csv('../data/raw/train.csv')
test_df = pd.read_csv('../data/raw/test.csv')
sample_submission = pd.read_csv('../data/raw/sample_submission.csv')

# Read supplemental datasets 1-4, tagging each frame with a 'source' column
# so its origin can still be identified later on.
supplement_dfs = [
    pd.read_csv(f'../data/raw/train_supplement/dataset{idx}.csv').assign(source=f'dataset{idx}')
    for idx in range(1, 5)
]

# Quick sanity check of what was loaded
supplement_shapes = [frame.shape for frame in supplement_dfs]
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Supplemental datasets: {supplement_shapes}")

## Data Overview

In [None]:
# Preview the first 10 training rows (shown as the cell's output)
train_df.head(10)

In [None]:
# Print a concise summary of the training DataFrame (columns, dtypes, non-null counts)
train_df.info()

In [None]:
# Names of the five target property columns; reused by the cells below
target_columns = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Descriptive statistics for the target columns (shown as the cell's output)
train_df.loc[:, target_columns].describe()

## Missing Value Analysis

In [None]:
# Per-target availability: counts and percentages of missing vs. present values
targets = train_df[target_columns]
n_rows = len(train_df)
n_missing = targets.isnull().sum()
n_available = targets.notnull().sum()

missing_stats = pd.DataFrame({
    'Missing Count': n_missing,
    'Missing %': (n_missing / n_rows * 100).round(2),
    'Available Count': n_available,
    'Available %': (n_available / n_rows * 100).round(2),
})

print("\nMissing Value Statistics:")
print(missing_stats)

In [None]:
# Stacked horizontal bars: share of available (green) vs. missing (red) values per target
fig, ax = plt.subplots(figsize=(10, 6))
availability = missing_stats[['Available %', 'Missing %']]
availability.plot.barh(stacked=True, ax=ax, color=['#2ecc71', '#e74c3c'])
ax.set(xlabel='Percentage (%)', title='Data Availability by Target')
ax.legend(['Available', 'Missing'])
fig.tight_layout()
plt.show()

## Target Distribution Analysis

In [None]:
# One histogram per target on a 2x3 grid, each with a dashed line at the mean
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for panel, target in zip(axes, target_columns):
    values = train_df[target].dropna()  # only rows where this target is present
    mean_value = values.mean()
    panel.hist(values, bins=50, edgecolor='black', alpha=0.7)
    panel.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.3f}')
    panel.set_title(f'{target} Distribution (n={len(values)})')
    panel.set_xlabel(target)
    panel.set_ylabel('Frequency')
    panel.legend()

# Five targets on six panels: blank out the unused last one
axes[-1].axis('off')
fig.tight_layout()
plt.show()

## Target Correlations

In [None]:
# Correlation between targets (pandas' default method for DataFrame.corr)
corr_matrix = train_df[target_columns].corr()

# Annotated heatmap with the diverging colormap centered on zero correlation
heatmap_options = dict(
    annot=True,
    fmt='.3f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
ax.set_title('Target Correlations')
fig.tight_layout()
plt.show()

## SMILES Analysis

In [None]:
# Character length of each SMILES string, stored as a new column
train_df['smiles_length'] = train_df['SMILES'].str.len()
lengths = train_df['smiles_length']
mean_length = lengths.mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: histogram with a dashed line at the mean length
hist_ax.hist(lengths, bins=50, edgecolor='black', alpha=0.7)
hist_ax.axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.1f}')
hist_ax.set_xlabel('SMILES Length')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('SMILES Length Distribution')
hist_ax.legend()

# Right panel: boxplot of the same lengths
box_ax.boxplot(lengths)
box_ax.set_ylabel('SMILES Length')
box_ax.set_title('SMILES Length Boxplot')

fig.tight_layout()
plt.show()

print("SMILES Length Statistics:")
print(lengths.describe())

In [None]:
# Simple string-level features derived from the SMILES column
smiles = train_df['SMILES']

# With regex=False the pattern is matched literally, so it must be the bare
# '*' character; an escaped star would look for a backslash followed by '*'.
train_df['has_repeating_unit'] = smiles.str.contains('*', regex=False)

# str.count always treats its pattern as a regex, so the star is escaped here
# (raw string avoids an invalid escape sequence).
train_df['num_stars'] = smiles.str.count(r'\*')

# NOTE(review): this counts occurrences of the characters '1' and '2', which is
# only a rough proxy for ring count (presumably each ring closure contributes
# its digit twice, and higher ring labels are ignored) — confirm intent.
train_df['num_rings'] = smiles.str.count('1') + smiles.str.count('2')

# Occurrences of lowercase 'c' in the SMILES string
train_df['num_aromatic'] = smiles.str.count('c')

repeating = train_df['has_repeating_unit']
print(f"\nPolymer Features:")
print(f"Samples with repeating units (*): {repeating.sum()} ({repeating.mean()*100:.1f}%)")
print(f"Average number of * symbols: {train_df['num_stars'].mean():.2f}")
print(f"Average number of rings: {train_df['num_rings'].mean():.2f}")
print(f"Average aromatic count: {train_df['num_aromatic'].mean():.2f}")

## Sample SMILES Visualization

Let's visualize some polymer structures using RDKit.

In [None]:
# RDKit is optional: if it cannot be imported, print an install hint instead.
try:
    from rdkit import Chem
    from rdkit.Chem import Draw

    # Take the first eight training SMILES and parse each into an RDKit molecule
    sample_smiles = train_df['SMILES'].iloc[:8].tolist()
    mols = list(map(Chem.MolFromSmiles, sample_smiles))

    # Render them as a grid, four molecules per row, 300x300 px each
    img = Draw.MolsToGridImage(mols, molsPerRow=4, subImgSize=(300, 300))
    display(img)
except ImportError:
    print("RDKit not installed. Run: pip install rdkit")

## Supplemental Dataset Analysis

In [None]:
# Report shape and columns for each supplemental dataset, plus TC_mean
# statistics for the datasets that carry that column.
for df in supplement_dfs:
    source_name = df['source'].iloc[0]
    print(f"\n{source_name}:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")

    if 'TC_mean' not in df.columns:
        continue

    tc_mean = df['TC_mean']
    print(f"  TC_mean range: [{tc_mean.min():.3f}, {tc_mean.max():.3f}]")
    print(f"  TC_mean mean: {tc_mean.mean():.3f}")

## Key Insights

**Summary of findings:**

1. **Multi-task sparse targets**: Most samples don't have all 5 properties measured
2. **SMILES characteristics**: Polymer SMILES use `*` to mark the attachment points of the repeating unit
3. **Supplemental data**: Additional datasets provide extra training signal
4. **Target correlations**: Some properties may be correlated (useful for multi-task learning)

**Modeling recommendations:**
- Use multi-task learning with masked loss (ignore missing values)
- Leverage pretrained molecular transformers (ChemBERTa)
- Consider SMILES augmentation (e.g., randomized, non-canonical SMILES strings) to enlarge the training set
- Use supplemental datasets with domain adaptation