In [74]:
import numpy as np
from datasets import Dataset, concatenate_datasets
import torch
import os
import random
import pandas as pd
from rich import columns

In [75]:
def load_ds_from_dirs_flattening_timesteps(
        path: str, columns, dtype, activations_col_name: str = "values"
) -> Dataset:
    """Load every per-example dataset under ``path`` and concatenate them.

    The expected layout is ``path/<timestep>/<example>/``; each ``<example>``
    directory is loaded with ``Dataset.load_from_disk``.  Every loaded dataset
    gets two extra string columns, ``"Sequence_Id"`` (the example directory
    name) and ``"Timestep"`` (the timestep directory name), and its
    ``"values"`` column is renamed to ``activations_col_name``.

    Args:
        path: Root directory containing one sub-directory per timestep.
        columns: Column names forwarded to ``Dataset.set_format``.
        dtype: Tensor dtype forwarded to ``Dataset.set_format``.
        activations_col_name: Final name of the ``"values"`` column.  When it
            is ``"values"`` (the default) the column is left untouched.

    Returns:
        One ``Dataset`` holding the rows of all examples of all timesteps,
        ordered by timestep directory name and then example directory name.

    Raises:
        ValueError: If no example directory was found under ``path``.
    """
    datasets = []
    # os.listdir gives entries in arbitrary order; sorting makes the row order
    # of the result (and the progress log) reproducible across runs/machines.
    for timestep_dir_name in sorted(os.listdir(path)):
        timestep_dir_path = os.path.join(path, timestep_dir_name)
        if not os.path.isdir(timestep_dir_path):
            # Stray files (e.g. ".DS_Store") are not timesteps.
            continue
        for example_dir_name in sorted(os.listdir(timestep_dir_path)):
            example_dir_path = os.path.join(timestep_dir_path, example_dir_name)
            if not os.path.isdir(example_dir_path):
                continue
            ds = Dataset.load_from_disk(example_dir_path, keep_in_memory=False)
            ds.set_format(type="torch", columns=columns, dtype=dtype)
            n_rows = len(ds)
            ds = ds.add_column("Sequence_Id", [example_dir_name] * n_rows)
            ds = ds.add_column("Timestep", [timestep_dir_name] * n_rows)
            # rename_column refuses a new name that already exists, so renaming
            # "values" to "values" (the default argument) must be skipped.
            if activations_col_name != "values":
                ds = ds.rename_column("values", activations_col_name)
            datasets.append(ds)
        print(f"processed {timestep_dir_name}")
    if not datasets:
        raise ValueError(f"no datasets found under {path!r}")
    return concatenate_datasets(datasets)


In [100]:
# Block-4 "pair" activations, flattened over timesteps; the "values" column
# ends up named "activations_pair".
ds = load_ds_from_dirs_flattening_timesteps(
    path="/home/wzarzecki/ds_sae_latents_1600x/tiny_debug_activations/block4_pair",
    columns=["values"],
    dtype=torch.float32,
    activations_col_name="activations_pair",
)
df = ds.to_pandas()

processed 1
processed 2


In [101]:
# Per-sequence classification labels. The frame is later joined to the
# activations on its "Sequence_Id" column.
labels_df = pd.read_csv("/home/wzarzecki/ds_sae_latents_1600x/classifiers.csv")

In [102]:
# Load the three remaining sources. Each DataFrame must be built from its
# *own* dataset: df2/df3 previously re-used ds1 and were therefore copies of
# the non-pair activations instead of the latents.
ds1 = load_ds_from_dirs_flattening_timesteps(
    "/home/wzarzecki/ds_sae_latents_1600x/tiny_debug_activations/block4_non_pair",
    ["values"], torch.float32, "activations_non_pair")
df1 = ds1.to_pandas()

ds2 = load_ds_from_dirs_flattening_timesteps(
    "/home/wzarzecki/ds_sae_latents_1600x/tiny_debug_latents/pair",
    ["values"], torch.float32, "latents_pair")
df2 = ds2.to_pandas()

ds3 = load_ds_from_dirs_flattening_timesteps(
    "/home/wzarzecki/ds_sae_latents_1600x/tiny_debug_latents/non_pair",
    ["values"], torch.float32, "latents_non_pair")
df3 = ds3.to_pandas()

processed 1
processed 2
processed 1
processed 2
processed 1
processed 2


In [105]:
# Attach the per-sequence labels to the pair activations, then add the
# non-pair activations of the same (sequence, timestep). Both joins are
# inner joins, so rows without a match on either side are dropped.
activations_df = (
    df.merge(labels_df, on=["Sequence_Id"], how="inner")
      .merge(df1, on=["Sequence_Id", "Timestep"], how="inner")
)
activations_df.columns

Index(['activations_pair', 'Sequence_Id', 'Timestep', 'Sequence',
       'Subcellular Localization', 'Solubility/Membrane-boundness',
       'activations_non_pair'],
      dtype='object')

In [106]:
# Sanity check: shapes of the two activation vectors stored in the first row.
activations_df.iloc[0]["activations_pair"].shape, activations_df.iloc[0]["activations_non_pair"].shape

((128,), (296,))

In [107]:
# Sanity check: length of the first row's two activation vectors joined
# end to end. np.concatenate is used because the np.concat alias only
# exists from NumPy 2.0 onwards.
np.concatenate((activations_df.iloc[0]["activations_pair"], activations_df.iloc[0]["activations_non_pair"])).shape

(424,)

In [109]:
def concat_activations(row):
    """Join a row's non-pair and pair activations into a single array.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            ``"activations_non_pair"`` and ``"activations_pair"``.

    Returns:
        The two arrays concatenated along axis 0, non-pair values first.
    """
    # np.concatenate works on every NumPy version; np.concat needs NumPy >= 2.0.
    return np.concatenate((row["activations_non_pair"], row["activations_pair"]))
# Caveat (original author): the concatenated vector ignores token information !!!
# apply(axis=1) calls concat_activations once per row; the resulting arrays
# are stored in the new "concat_activations" column.
activations_df["concat_activations"] = activations_df.apply(concat_activations, axis=1)

In [116]:
subcellular_localization_values = activations_df["Subcellular Localization"].unique()


def ovr_label_row_cytoplasm(row):
    """Return True when the row's "Subcellular Localization" is "Cytoplasm"."""
    return row["Subcellular Localization"] == "Cytoplasm"


def ovr_label_row_nucleus(row):
    """Return True when the row's "Subcellular Localization" is "Nucleus"."""
    return row["Subcellular Localization"] == "Nucleus"


# One-vs-rest boolean targets. A vectorized comparison yields the same values
# as applying the row helpers above with axis=1, without a Python-level call
# per row, and still produces a boolean column when the frame is empty.
activations_df["Cytoplasm"] = activations_df["Subcellular Localization"] == "Cytoplasm"
activations_df["Nucleus"] = activations_df["Subcellular Localization"] == "Nucleus"

In [117]:
# Split the merged frame into one sub-frame per distinct timestep, keeping
# only the feature column and the label columns. np.unique returns the
# timestep values sorted, so timestep_datasets follows that order.
found_timesteps = np.unique(activations_df["Timestep"].values)
timestep_columns = ["concat_activations", "Subcellular Localization", "Cytoplasm", "Nucleus"]
timestep_datasets = []
for timestep in found_timesteps:
    in_timestep = activations_df["Timestep"] == str(timestep)
    timestep_datasets.append(activations_df.loc[in_timestep, timestep_columns])

In [118]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.metrics import roc_auc_score

In [121]:
def get_train_test_split(
        df,
        label_col: str = "Cytoplasm",
        features_col: str = "concat_activations",
        test_size: float = 0.2,
        random_state: int = 42,
        stratify: bool = False,
):
    """Build a feature matrix / label vector from ``df`` and split it.

    Args:
        df: DataFrame whose ``features_col`` holds one array per row and
            whose ``label_col`` holds the target of each row.
        label_col: Name of the target column.  Defaults to ``"Cytoplasm"``,
            the label that was previously hard-coded.
        features_col: Name of the column holding the per-row feature arrays.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``.
        stratify: When True, the split is stratified on the labels so both
            parts keep the class proportions.  Off by default to preserve
            the original split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # Each cell is flattened so the rows stack into a 2-D (samples, features)
    # matrix regardless of the stored array's own dimensionality.
    X = np.stack([np.asarray(features).flatten() for features in df[features_col]])
    y = df[label_col].values

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    return X_train, X_test, y_train, y_test


In [122]:
def train_and_eval_regression(X_train, X_test, y_train, y_test) -> float:
    """Fit a class-balanced logistic regression and score it by ROC AUC.

    The model is trained on ``(X_train, y_train)`` with the
    ``newton-cholesky`` solver (at most 100 iterations, fixed seed 42).

    Returns:
        ROC AUC of the predicted probabilities on ``(X_test, y_test)``.
    """
    model = LogisticRegression(
        solver='newton-cholesky',
        max_iter=100,
        random_state=42,
        class_weight='balanced',
    ).fit(X_train, y_train)
    # Column 1 of predict_proba is the probability of the second entry of
    # the fitted model's classes_.
    test_scores = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, test_scores)
# Smoke test on the first per-timestep frame only; the cell's displayed
# value is the ROC AUC returned by train_and_eval_regression.
X_train, X_test, y_train, y_test =get_train_test_split(timestep_datasets[0])
train_and_eval_regression(X_train, X_test, y_train, y_test)

Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.6318e-14): result may not be accurate.


np.float64(1.0)

In [123]:
# Train and evaluate one classifier per entry of timestep_datasets and
# collect the ROC AUC of each.
# NOTE(review): results is keyed by the list position `idx`, not by the
# timestep label itself; found_timesteps[idx] gives the matching label.
# Confirm that positional keys are what downstream consumers expect.
results = {}
for idx, timestep_ds in enumerate(timestep_datasets):
    X_train, X_test, y_train, y_test = get_train_test_split(timestep_ds)
    res = train_and_eval_regression(X_train, X_test, y_train, y_test)
    print(f"{idx}: {res}")
    results[idx] = res


0: 1.0


Further options are to use another solver or to avoid such situation in the first place. Possible remedies are removing collinear features of X or increasing the penalization strengths.
The original Linear Algebra message was:
Ill-conditioned matrix (rcond=2.6318e-14): result may not be accurate.


In [124]:
import json
# Persist the per-timestep scores to "temp.json" in the current working
# directory. json.dump writes the integer dict keys as JSON strings.
with open("temp.json", "w") as f:
    json.dump(results, f)