# t-SNE Analysis of PKS vs non-PKS Molecules

This notebook loads the fingerprinted train/val/test splits, combines them into a single table,
reduces the 2048-bit ECFP4 fingerprints to 2D with t-SNE, and plots the embedding colored by source (PKS vs. the non-PKS sources bio and chem).

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Pick a t-SNE backend: MulticoreTSNE first, then openTSNE, and finally
# scikit-learn. TSNE_IMPL records which one ended up bound to TSNE_CLASS so
# later cells can use the matching constructor arguments.
TSNE_IMPL = 'sklearn'
try:
    from MulticoreTSNE import MulticoreTSNE as TSNE_CLASS
except Exception:
    # Any failure while importing MulticoreTSNE falls through to openTSNE.
    try:
        from openTSNE import TSNE as TSNE_CLASS
    except Exception:
        # Last resort; if this import fails too the error propagates.
        from sklearn.manifold import TSNE as TSNE_CLASS
        TSNE_IMPL = 'sklearn'
    else:
        TSNE_IMPL = 'opentsne'
else:
    TSNE_IMPL = 'multicore'

data_dir = Path('../data')
splits = ['train', 'val', 'test']

def find_split(split):
    """Locate the data file for one split.

    Looks inside ``data_dir / split`` and returns the first path that exists,
    trying in order: ECFP4 parquet, ECFP4 csv, plain parquet, plain csv.
    Returns ``None`` when none of the candidates exist.
    """
    split_dir = data_dir / split
    filenames = (
        f'baseline_{split}_ecfp4.parquet',
        f'baseline_{split}_ecfp4.csv',
        f'baseline_{split}.parquet',
        f'baseline_{split}.csv',
    )
    # Lazily walk the candidates in priority order and stop at the first hit.
    existing = (path for path in map(split_dir.joinpath, filenames) if path.exists())
    return next(existing, None)

# Map each split name to its resolved file (or None if nothing was found).
paths = dict(zip(splits, map(find_split, splits)))
paths


{'train': PosixPath('../data/train/baseline_train_ecfp4.parquet'),
 'val': PosixPath('../data/val/baseline_val_ecfp4.parquet'),
 'test': PosixPath('../data/test/baseline_test_ecfp4.parquet')}

In [2]:
# Read every split that was found, tag its rows with the split name,
# and stack the pieces into one frame with a fresh 0..n-1 index.
dfs = []
for split_name, split_path in paths.items():
    if split_path is None:
        print(f'Warning: missing split {split_name}; skipping')
        continue
    # Parquet files get the parquet reader; anything else is read as CSV.
    reader = pd.read_parquet if split_path.suffix == '.parquet' else pd.read_csv
    split_frame = reader(split_path)
    split_frame['split'] = split_name
    dfs.append(split_frame)

df_all = pd.concat(dfs, ignore_index=True)
df_all.shape, df_all['source'].value_counts()


((4045434, 2051),
 bio     2106960
 chem    1925162
 PKS       13312
 Name: source, dtype: int64)

In [4]:
# Prepare features and labels (no subsampling)
# Fingerprint columns are the ones whose name starts with 'fp_'. They are
# ordered by the integer after the first underscore so the columns of X follow
# numeric order (a plain string sort would place 'fp_10' before 'fp_2').
fp_cols = [c for c in df_all.columns if str(c).startswith('fp_')]
fp_cols = sorted(fp_cols, key=lambda s: int(str(s).split('_')[1]))
# Fail early with a clear message instead of handing t-SNE a zero-width matrix.
assert fp_cols, "no 'fp_*' fingerprint columns found in df_all"
X = df_all[fp_cols].to_numpy(dtype=np.float32)
# Binary label: 1 where source is 'PKS', 0 for every other source.
y = (df_all['source'].astype(str) == 'PKS').astype(int).to_numpy()
# reset_index already returns a new frame, so the extra .copy() the cell used
# to make only duplicated the whole table in memory a second time.
df_plot = df_all.reset_index(drop=True)
X.shape, df_plot['source'].value_counts()


((4045434, 2048),
 bio     2106960
 chem    1925162
 PKS       13312
 Name: source, dtype: int64)

In [None]:
# Embed the fingerprint matrix X into 2D with whichever backend was imported,
# then attach the two coordinates to df_plot as 'tsne_1' / 'tsne_2'.
n_jobs = os.cpu_count() or 1
print(f'Using TSNE implementation: {TSNE_IMPL} with n_jobs={n_jobs}')
if TSNE_IMPL == 'multicore':
    # MulticoreTSNE only accepts init='random' or an explicit ndarray; passing
    # 'pca' trips an assertion in its constructor, so random init is used here.
    tsne = TSNE_CLASS(n_components=2, perplexity=30, learning_rate=200, init='random', n_iter=1000, random_state=42, n_jobs=n_jobs, verbose=True)
    emb = tsne.fit_transform(X)
elif TSNE_IMPL == 'opentsne':
    tsne = TSNE_CLASS(n_components=2, perplexity=30, n_jobs=n_jobs, random_state=42, verbose=True, initialization='pca')
    # openTSNE's fit() returns the embedding itself rather than the estimator.
    emb = tsne.fit(X)
else:
    # The iteration count is left at its default of 1000: the keyword was
    # renamed from n_iter to max_iter in scikit-learn 1.5, so omitting it keeps
    # the same behaviour on both old and new versions. n_jobs is passed so the
    # message printed above is true for this backend as well.
    tsne = TSNE_CLASS(n_components=2, perplexity=30, learning_rate='auto', init='pca', verbose=1, random_state=42, n_jobs=n_jobs)
    emb = tsne.fit_transform(X)
# Normalise to a plain ndarray (openTSNE returns an ndarray subclass) and make
# sure there is exactly one 2D point per row before writing into df_plot.
emb = np.asarray(emb)
assert emb.shape == (len(df_plot), 2), f'unexpected embedding shape {emb.shape}'
df_plot['tsne_1'] = emb[:, 0]
df_plot['tsne_2'] = emb[:, 1]
df_plot[['tsne_1', 'tsne_2', 'source']].head()


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 4045434 samples in 28.259s...


In [None]:
# Scatter the 2D embedding with one colour per source, save it as a PNG
# under ../data/processed, and display it inline.
source_colors = {'PKS': '#1f77b4', 'bio': '#2ca02c', 'chem': '#ff7f0e'}
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_plot,
    x='tsne_1',
    y='tsne_2',
    hue='source',
    palette=source_colors,
    alpha=0.6,
    s=8,
    linewidth=0,
    ax=ax,
)
ax.set_title('t-SNE of ECFP4 fingerprints: PKS vs non-PKS')
ax.legend(title='source', markerscale=2)
fig.tight_layout()

# Create the output folder on demand so saving works on a fresh checkout.
out_path = Path('../data/processed/tsne_pks_vs_nonpks.png')
out_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(out_path, dpi=200)
print('Saved figure to', out_path)
plt.show()