In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

import seaborn as sns

In [None]:


def load_and_prepare(path, label='pretrain'):
    """Read a CSV file and split it into features, labels and stage tags.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``. The
        file must contain a column named ``label``.
    label : str, default 'pretrain'
        Tag repeated once per row so that rows from different files can be
        told apart after they are combined.

    Returns
    -------
    tuple
        ``(features, labels, stages)`` where ``features`` holds the values of
        every column except ``label``, ``labels`` holds the values of the
        ``label`` column, and ``stages`` is a list with ``label`` repeated
        once per row.

    Raises
    ------
    KeyError
        If the CSV has no ``label`` column.
    """
    frame = pd.read_csv(path)

    # Everything that is not the target column is treated as a feature.
    feature_matrix = frame.drop(columns=['label']).values
    target_values = frame['label'].values

    stage_tags = [label for _ in range(len(target_values))]
    return feature_matrix, target_values, stage_tags

# --- Load both feature dumps and tag each row with the stage it came from ---
x1, y1, stage1 = load_and_prepare('cls1.csv', label='before')
x2, y2, stage2 = load_and_prepare('cls2.csv', label='after')

# --- Stack the two feature sets and standardise them together ---
# DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True produces the same row-wise stack with a fresh 0..n-1 index.
X = pd.DataFrame(
    data=StandardScaler().fit_transform(
        pd.concat([pd.DataFrame(x1), pd.DataFrame(x2)], ignore_index=True)
    )
)
Y = pd.Series(list(y1) + list(y2), name='label')
Stage = pd.Series(stage1 + stage2, name='stage')

# --- Project to 2-D with t-SNE ---
# The iteration count is left at the library default (1000, the value that was
# previously passed explicitly): the keyword was renamed from `n_iter` to
# `max_iter` in scikit-learn 1.5, so naming either one breaks on some versions.
tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, random_state=42)
X_embedded = tsne.fit_transform(X)

# --- Assemble a tidy frame for plotting: one row per sample ---
df_vis = pd.DataFrame(X_embedded, columns=['dim1', 'dim2'])
df_vis['label'] = Y
df_vis['stage'] = Stage

# --- Scatter plot: colour encodes the label, marker shape encodes the stage ---
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=df_vis,
    x='dim1', y='dim2',
    hue='label',
    style='stage',
    palette='tab10',
    alpha=0.7,
    ax=ax,
)
ax.set_title('t-SNE Visualization of [CLS] Representations (Before vs After Training)', fontsize=14)
# Anchor the legend outside the axes so it does not cover any points.
ax.legend(title='Label / Stage', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig('tsne_cls_comparison.png', dpi=300)
plt.show()
