In [2]:
import pandas as pd

# Read the tab-separated expression table (gzip-compressed, per the .gz extension)
# from the current working directory.
df = pd.read_csv('GSE60361_C1-3005-Expression.txt.gz', delimiter='\t')

# Report (rows, columns), then leave the first five rows as the cell's displayed output.
print(df.shape)
df.head()

(19972, 3006)


Unnamed: 0,cell_id,1772071015_C02,1772071017_G12,1772071017_A05,1772071014_B06,1772067065_H06,1772071017_E02,1772067065_B07,1772067060_B09,1772071014_E04,...,1772066110_D12,1772071017_A07,1772063071_G10,1772058148_C03,1772063061_D09,1772067059_B04,1772066097_D04,1772063068_D01,1772066098_A12,1772058148_F03
0,Tspan12,0,0,0,3,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Tshz1,3,1,0,2,2,2,2,1,0,...,0,0,0,0,0,0,0,0,0,1
2,Fnbp1l,3,1,6,4,1,2,1,0,5,...,0,0,0,0,0,0,0,0,0,0
3,Adamts15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Cldn12,1,1,1,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0


In [8]:
import pandas as pd

# Load the existing tPCA parameter table; the first CSV column becomes the row
# index, so rows can be addressed by dataset name (e.g. 'GSE82187').
param_path = '/home/yoeu/BIPN 162/data for BIPN 162 project/Topological-PCA/Tests/paramvals_tPCA_patched.csv'
param_df = pd.read_csv(param_path, index_col=0)

# Seed a 'GSE60361' entry by copying the 'GSE82187' row unchanged. Edit
# `example_row` before the assignment to use different values. If a
# 'GSE60361' row already exists, this assignment overwrites it.
example_row = param_df.loc['GSE82187'].copy()
param_df.loc['GSE60361'] = example_row  # or modify values as needed

# Save back to CSV. This overwrites the file that was just read, in place.
# NOTE(review): the path is absolute and machine-specific; consider deriving
# it from a configurable base directory.
param_df.to_csv(param_path)


In [9]:
import sys
sys.path.append('/home/yoeu/BIPN 162/data for BIPN 162 project/Topological-PCA/Model')

from tPCA import RpLSPCA_cal_projections

import pandas as pd

class ParameterWrapper:
    """Look up per-dataset parameters stored in a CSV table.

    The first CSV column is used as the row index, so each row can be
    retrieved by its dataset name.
    """

    def __init__(self, csv_path):
        """Read the parameter table at ``csv_path`` and keep it as ``self.df``."""
        table = pd.read_csv(csv_path, index_col=0)
        self.df = table

    def get_parameters(self, dataset_name):
        """Return the row of ``self.df`` whose index label is ``dataset_name``.

        A ``KeyError`` from pandas propagates when no such row exists.
        """
        row = self.df.loc[dataset_name]
        return row

# ParameterWrapper is defined inline above, so it must not also be imported
# from the `wrapper` module (i.e. no `from wrapper import ParameterWrapper`).


In [7]:
import sys
sys.path.append('/home/yoeu/BIPN 162/data for BIPN 162 project/Topological-PCA/Model')

from tPCA import RpLSPCA_cal_projections

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
import os

# Define ParameterWrapper inline
class ParameterWrapper:
    """Look up per-dataset parameters stored in a CSV table.

    The first CSV column is used as the row index, so each row can be
    retrieved by its dataset name.
    """

    def __init__(self, csv_path):
        """Read the parameter table at ``csv_path`` and keep it as ``self.df``."""
        table = pd.read_csv(csv_path, index_col=0)
        self.df = table

    def get_parameters(self, dataset_name):
        """Return the row of ``self.df`` whose index label is ``dataset_name``.

        A ``KeyError`` from pandas propagates when no such row exists.
        """
        row = self.df.loc[dataset_name]
        return row

# Utility Functions
def computeClusterScore(y, label):
    """Score a clustering against reference labels.

    Parameters
    ----------
    y : reference labels.
    label : cluster assignments to evaluate.

    Returns
    -------
    tuple
        ``(ari, nmi)``: the adjusted Rand index and the normalized mutual
        information between ``y`` and ``label``.
    """
    scores = (
        adjusted_rand_score(y, label),
        normalized_mutual_info_score(y, label),
    )
    return scores

def computeKMeans(X, y, max_state=30):
    """Average k-means clustering quality over several random seeds.

    ``X`` is standardized per column, then k-means is fitted once for each
    ``random_state`` in ``range(max_state)``, with as many clusters as there
    are distinct values in ``y``. Each fit is scored against ``y``.

    Returns
    -------
    tuple
        ``(mean_ari, mean_nmi)`` averaged over all seeds.
    """
    from sklearn.cluster import KMeans

    features = StandardScaler().fit_transform(X)
    n_clusters = np.unique(y).shape[0]

    def _score_for_seed(seed):
        # One k-means fit with a fixed seed, scored against the reference labels.
        model = KMeans(n_clusters=n_clusters, n_init=10, random_state=seed)
        return computeClusterScore(y, model.fit_predict(features))

    per_seed = [_score_for_seed(seed) for seed in range(max_state)]
    ari_values = [ari for ari, _ in per_seed]
    nmi_values = [nmi for _, nmi in per_seed]
    return np.mean(ari_values), np.mean(nmi_values)

def load_data(dataset_name, root_path):
    """Load the feature matrix and labels of one dataset from CSV files.

    Files are read from ``<root_path>/Tests/<dataset_name>/``:
    ``<dataset_name>_full_X.csv`` (all columns, cast to float) and
    ``<dataset_name>_full_labels.csv`` (its ``Label`` column, cast to int).

    Returns
    -------
    tuple
        ``(X, y)`` as NumPy arrays.
    """
    inpath = os.path.join(root_path, 'Tests', dataset_name)

    features_frame = pd.read_csv(f"{inpath}/{dataset_name}_full_X.csv")
    X = features_frame.values.astype(float)

    labels_frame = pd.read_csv(f"{inpath}/{dataset_name}_full_labels.csv")
    y = labels_frame['Label'].values.astype(int)

    return X, y

# Main analysis function
def run_tPCA_clustering(dataset_name):
    """Run the tPCA + k-means benchmark for one dataset and print ARI / NMI.

    Steps, with X treated as (cells, genes) throughout:

    1. Load X and y with ``load_data`` and keep the first 50 cells.
    2. For a fixed list of datasets, scale each cell to 1e4 total counts.
    3. Log-transform, then keep genes whose variance is at or above the median.
    4. Drop cells whose class has fewer than 15 members.
    5. Keep at most the first 50 cells and first 50 genes, then standardize.
    6. Read ``beta``, ``gamma`` and the ``zeta*`` values for ``dataset_name``
       from the parameter CSV and call ``RpLSPCA_cal_projections``.
    7. Project the data and score k-means clusters with ``computeKMeans``.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset directory under ``Tests`` and of the row in the
        parameter CSV.

    Returns
    -------
    None
        The two scores are printed, not returned.

    Raises
    ------
    ValueError
        If no cell is left after the rare-class filter.
    KeyError
        If ``dataset_name`` has no row in the parameter CSV (from pandas).
    """
    root_path = '/home/yoeu/BIPN 162'
    X, y = load_data(dataset_name, root_path)
    # NOTE(review): this keeps only the first 50 cells *before* any filtering,
    # so the class-size filter below sees 50 cells at most and the later
    # "subsample after filtering" step can never drop further cells. It looks
    # like a leftover quick-run shortcut; confirm whether it should be removed.
    X = X[:50, :]
    y = y[:50]

    # Normalize counts if needed
    if dataset_name in ['GSE67835', 'GSE84133human1', 'GSE84133human2', 'GSE84133human3', 'GSE84133mouse1', 'GSE84133mouse2']:
        # Rows are cells, so library-size scaling must use per-cell (row)
        # totals. Summing over axis=0 would rescale every gene instead.
        cell_totals = np.sum(X, axis=1, keepdims=True)
        # A cell with no counts stays all-zero instead of becoming NaN/inf.
        cell_totals[cell_totals == 0] = 1.0
        X = X * (1e4 / cell_totals)

    # Assume original X is (cells, genes)
    log_X = np.log(X + 1)
    log_X[log_X < 1e-6] = 0

    # Transpose so that genes are rows: (genes, cells)
    log_X_T = log_X.T

    # Filter low-variance genes: keep those at or above the median variance
    row_var = np.var(log_X_T, axis=1)
    threshold = np.percentile(row_var, 50)
    filtered_X_T = log_X_T[row_var >= threshold]  # shape: (filtered_genes, cells)

    # Now transpose back to (cells, filtered_genes)
    filtered_X = filtered_X_T.T

    # Filter out rare classes (fewer than 15 cells)
    unique_classes, class_counts = np.unique(y, return_counts=True)
    mask = np.isin(y, unique_classes[class_counts >= 15])

    X_filtered = filtered_X[mask, :]  # rows = filtered cells
    y_filtered = y[mask]

    if X_filtered.shape[0] == 0:
        # Fail with an actionable message instead of an opaque downstream
        # error from the scaler on an empty array.
        raise ValueError(
            f"No cells left for dataset {dataset_name!r}: "
            "no class has at least 15 cells after subsampling."
        )

    # Subsample AFTER filtering
    max_cells = min(50, X_filtered.shape[0])
    max_genes = min(50, X_filtered.shape[1])

    X_filtered = X_filtered[:max_cells, :max_genes]
    y_filtered = y_filtered[:max_cells]

    # Standardize each remaining gene column
    scaler = StandardScaler()
    X_normalized = scaler.fit_transform(X_filtered)

    # Load parameters
    param_file = '/home/yoeu/BIPN 162/data for BIPN 162 project/Topological-PCA/Tests/paramvals_tPCA_patched.csv'
    param_wrapper = ParameterWrapper(param_file)
    params = param_wrapper.get_parameters(dataset_name)
    k = len(np.unique(y_filtered))

    # Dynamically load zeta values, ordered by their numeric suffix
    # (zeta1, zeta2, ..., zeta10) rather than alphabetically.
    zeta_keys = sorted([key for key in params.index if key.startswith('zeta')],
                       key=lambda x: int(x.replace('zeta', '')))
    zeta = np.array([float(params[key]) for key in zeta_keys])

    # Other parameters
    gamma = float(params['gamma'])
    beta = float(params['beta'])

    # Run tPCA
    PDM = RpLSPCA_cal_projections(X_normalized, beta, gamma, k, zeta)
    # (PDM^T PDM)^-1 PDM^T, transposed; requires PDM^T PDM to be invertible.
    # NOTE(review): shape of PDM is not visible here; confirm it conforms
    # with X_normalized in the product below.
    TM = ((np.linalg.inv(PDM.T @ PDM)) @ (PDM.T)).T
    Q = X_normalized @ TM

    # Clustering
    ari, nmi = computeKMeans(Q, y_filtered)
    print(f"tPCA ARI: {ari:.4f}")
    print(f"tPCA NMI: {nmi:.4f}")
# Execute the pipeline for the GSE60361 dataset; the function prints the ARI/NMI scores.
run_tPCA_clustering(dataset_name='GSE60361')

tPCA ARI: -0.0224
tPCA NMI: 0.0031
