# In [None]:
import pandas as pd
import numpy as np
import warnings
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Silence only the specific KMeans/MKL-on-Windows memory-leak warning so
# that every other warning stays visible.
warnings.filterwarnings(
    action='ignore',
    message=".*KMeans is known to have a memory leak on Windows with MKL.*",
)

# -----------------------------------------------------------
# 1. Load batch-corrected gene expression matrix
# -----------------------------------------------------------
# The "Unnamed: 0" column holds the row labels; it becomes the (unnamed)
# index, and the matrix is then transposed so that samples are rows and
# genes are columns.
df = (
    pd.read_csv("final.csv")
    .set_index("Unnamed: 0")
    .rename_axis(None)
    .T
)

# -----------------------------------------------------------
# 2. Train-test split
# -----------------------------------------------------------
# Shuffled 80/20 split of the samples; the fixed seed keeps the partition
# identical between runs.
df_train, df_test = train_test_split(
    df, test_size=0.20, shuffle=True, random_state=42
)

# -----------------------------------------------------------
# 3. Scale data
# -----------------------------------------------------------
# Scaling statistics are learned from the training samples only and then
# applied unchanged to both partitions, so the test set never influences
# the fitted scaler.
scaler = StandardScaler().fit(df_train)
X_train_scaled = scaler.transform(df_train)
X_test_scaled = scaler.transform(df_test)

# -----------------------------------------------------------
# 4. Dimensionality reduction (2-component PCA projection of the training set)
# -----------------------------------------------------------
# NOTE(review): the K-means step in this script is fitted on the full
# scaled matrix (X_train_scaled), not on this projection, so X_train_pca
# is only a 2-D view of the training samples (e.g. for plotting).
#
# random_state is fixed to match the seed used by the split and by K-means:
# PCA's default solver selection may pick a randomized SVD for large
# matrices, and without a seed the projection could differ between runs.
pca = PCA(n_components=2, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)

# -----------------------------------------------------------
# 5. K-means clustering (k=4 subtypes)
# -----------------------------------------------------------
k = 4
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)

# Centroids are learned from the scaled training samples only; the labels
# of the training samples are read back from the fitted model.
kmeans.fit(X_train_scaled)
train_clusters = kmeans.labels_

# Test samples are assigned to the nearest already-fitted centroid.
test_clusters = kmeans.predict(X_test_scaled)

# -----------------------------------------------------------
# 6. Attach cluster labels back to dataframes
# -----------------------------------------------------------
# Work on copies of the unscaled train/test frames, each extended with a
# numeric 'Cluster' column (0-based K-means labels).
X_train_df = df_train.assign(Cluster=train_clusters)
X_test_df = df_test.assign(Cluster=test_clusters)

# Combined train + test view that still carries the numeric 'Cluster'
# column (for reference / debugging).
df_clustered = pd.concat([X_train_df, X_test_df], axis=0)

# Assign subtype labels C1–C4: shift the 0-based cluster id by one, prefix
# it with 'C', and replace the numeric 'Cluster' column with 'subtype'.
for labelled in (X_train_df, X_test_df):
    labelled['subtype'] = 'C' + (labelled.pop('Cluster') + 1).astype(str)

# -----------------------------------------------------------
# 7. Load Non-TNBC dataset and assign subtype C0
# -----------------------------------------------------------
# Same layout handling as the TNBC matrix: the "Unnamed: 0" column becomes
# the (unnamed) index and the matrix is transposed so samples are rows.
non_df = pd.read_csv("final_non.csv")
non_df.set_index("Unnamed: 0", inplace=True)
non_df.index.name = None
non_df = non_df.T

# Every non-TNBC sample gets the fixed label C0.
non_df['subtype'] = "C0"

# Split the non-TNBC samples with the same proportion and seed as the TNBC
# split. Appending the *same* samples to both partitions would leak
# training data into the test set and inflate any downstream evaluation.
non_train, non_test = train_test_split(
    non_df,
    test_size=0.20,
    random_state=42,
    shuffle=True
)

# Merge the disjoint non-TNBC partitions into the train and test sets.
# pd.concat aligns on column labels, so a gene present in only one of the
# two datasets ends up as NaN for the other dataset's samples.
X_train_df = pd.concat([X_train_df, non_train])
X_test_df  = pd.concat([X_test_df,  non_test])

# -----------------------------------------------------------
# 8. Save clustering objects for ML models
# -----------------------------------------------------------
# Bundle the labelled frames, the raw cluster assignments and the fitted
# scaler / K-means model into a single dictionary.
save_objects = dict(
    X_train_df=X_train_df,
    X_test_df=X_test_df,
    train_clusters=train_clusters,
    test_clusters=test_clusters,
    scaler=scaler,
    kmeans=kmeans,
)

# Serialize the bundle to disk. NOTE: pickle files should only ever be
# loaded back from a trusted location.
with open("clustering_output.pkl", "wb") as pkl_file:
    pickle.dump(save_objects, pkl_file)

# -----------------------------------------------------------
# 9. Save subtypes and final training matrix
# -----------------------------------------------------------
# One-column frame holding the subtype label of every training sample.
subtype_df = X_train_df.loc[:, ['subtype']].copy()

# Feature matrix for the ML models: the label column is removed and the
# frame is transposed back, so the training samples become columns again.
X_train_features = X_train_df.drop(columns='subtype').transpose()

# Save final outputs
X_train_features.to_csv('final_with_subtypes_v2.csv')
subtype_df.to_csv('only_subtypes_v2.csv')

print("TNBC subtyping completed successfully.")
