In [None]:
# Cell 1
# Mount Google Drive inside the Colab runtime at /content/drive; the later
# cells in this notebook read and write files below that mount point.
# force_remount=True is passed so the mount is redone even when a previous
# mount already exists (per the google.colab API; confirm if behaviour matters).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Cell 5
# === Detailed load-debug for KNN imputation (done to increase the sample count) ===
# For every subject that has a cognition embedding, an MRI embedding and a
# label row, load the per-modality vectors from disk, printing each vector's
# shape and load time. Subjects without a genetics embedding receive an
# all-NaN placeholder of length dim_g so the missing block can be imputed later.
#
# Names left in the kernel for later cells: rows, rids_int, dim_c, dim_m, dim_g.
import pandas as pd
import numpy as np
import os
import time

# --- Paths (adjust if needed) ---
base    = "/content/drive/Shareddrives/ECS289L"
cog_csv = os.path.join(base, "embeddings/cog/embeddings.csv")
mri_csv = os.path.join(base, "embeddings/mri/embeddings.csv")
gen_csv = os.path.join(base, "embeddings/genetics_mlp/embeddings.csv")
lbl_csv = os.path.join(base, "master_subjects.csv")

# --- Load metadata ---
df_cog = pd.read_csv(cog_csv)
df_mri = pd.read_csv(mri_csv)
df_gen = pd.read_csv(gen_csv)
df_lbl = pd.read_csv(lbl_csv)

# --- Find the RIDs to process ---
# Genetics is intentionally NOT part of the intersection: subjects lacking it
# are kept and given a NaN placeholder in the loop below.
rids_int = sorted(set(df_cog.RID) & set(df_mri.RID) & set(df_lbl.RID))
print(f"Total subjects: {len(rids_int)}")
if not rids_int:
    raise ValueError(
        "No RID is present in the cognition, MRI and label tables at once; "
        "nothing to load."
    )

# --- Build path lookup dicts ---
# If a RID appears more than once in a CSV, the last row wins (dict semantics).
cog_dict = dict(zip(df_cog.RID, df_cog.path))
mri_dict = dict(zip(df_mri.RID, df_mri.path))
gen_dict = dict(zip(df_gen.RID, df_gen.path))
if not gen_dict:
    raise ValueError(
        f"{gen_csv} lists no genetics embeddings, so the genetics dimension "
        "cannot be determined."
    )

# --- Determine embedding dims from a sample ---
first = rids_int[0]
dim_c = np.load(os.path.join(base, cog_dict[first])).shape[0]
dim_m = np.load(os.path.join(base, mri_dict[first])).shape[0]
# The first subject may have no genetics file, so sample any listed one instead.
dim_g = np.load(os.path.join(base, next(iter(gen_dict.values())))).shape[0]


def _load_embedding(label, path, expected_dim):
    """Load one embedding vector, report its shape and load time, and validate it.

    Args:
        label: Short modality tag used in the progress line ("cog", "mri", "gen").
        path: Path taken from the modality's CSV. It is joined onto ``base``;
            ``os.path.join`` returns ``path`` unchanged when it is absolute.
        expected_dim: Required length of the 1-D vector.

    Returns:
        The loaded array.

    Raises:
        ValueError: If the array is not 1-D with ``expected_dim`` elements.
            Failing here keeps a malformed file from surfacing later as a
            ragged ``np.stack`` error or a misaligned column slice.
    """
    full_path = os.path.join(base, path)
    # perf_counter is monotonic and high-resolution, unlike the wall clock.
    t0 = time.perf_counter()
    vec = np.load(full_path)
    print(f"  {label} → {vec.shape} loaded in {time.perf_counter() - t0:.2f}s")
    if vec.shape != (expected_dim,):
        raise ValueError(
            f"{label} embedding at {full_path} has shape {vec.shape}, "
            f"expected ({expected_dim},)"
        )
    return vec


# --- Detailed per-modality loading with timing ---
rows = []
for i, rid in enumerate(rids_int, 1):
    print(f"\n[{i}/{len(rids_int)}] RID = {rid}")
    # Cognition
    cog_vec = _load_embedding("cog", cog_dict[rid], dim_c)
    # MRI
    mri_vec = _load_embedding("mri", mri_dict[rid], dim_m)
    # Genetics (or NaN placeholder to be imputed later)
    if rid in gen_dict:
        gen_vec = _load_embedding("gen", gen_dict[rid], dim_g)
    else:
        print("  gen → missing, filling with NaNs")
        gen_vec = np.full(dim_g, np.nan, dtype=float)
    # collect: one row per subject, laid out as [cog | mri | gen]
    rows.append(np.concatenate([cog_vec, mri_vec, gen_vec]))

print("\n✅ All embeddings loaded (or NaN‐filled) successfully.")


Total subjects: 210

[1/210] RID = 4
  cog → (32,) loaded in 0.00s
  mri → (128,) loaded in 0.00s
  gen → missing, filling with NaNs

[2/210] RID = 30
  cog → (32,) loaded in 0.47s
  mri → (128,) loaded in 0.54s
  gen → missing, filling with NaNs

[3/210] RID = 33
  cog → (32,) loaded in 0.44s
  mri → (128,) loaded in 0.45s
  gen → missing, filling with NaNs

[4/210] RID = 38
  cog → (32,) loaded in 0.41s
  mri → (128,) loaded in 0.46s
  gen → missing, filling with NaNs

[5/210] RID = 42
  cog → (32,) loaded in 0.40s
  mri → (128,) loaded in 0.45s
  gen → (128,) loaded in 0.38s

[6/210] RID = 44
  cog → (32,) loaded in 0.37s
  mri → (128,) loaded in 0.51s
  gen → missing, filling with NaNs

[7/210] RID = 45
  cog → (32,) loaded in 0.28s
  mri → (128,) loaded in 0.53s
  gen → missing, filling with NaNs

[8/210] RID = 51
  cog → (32,) loaded in 0.47s
  mri → (128,) loaded in 0.55s
  gen → (128,) loaded in 0.42s

[9/210] RID = 54
  cog → (32,) loaded in 0.45s
  mri → (128,) loaded in 0.32

In [None]:
# Cell 6
# === Perform imputation & save new genetics CSV ===
# Fills the NaN genetics block of every subject that has no genetics embedding
# using KNN imputation over the stacked [cog | mri | gen] rows, saves each
# imputed vector as "<RID>_imputed.npy" next to the genetics CSV, and writes
# "embeddings_imputed.csv" containing the original rows plus the new ones.
# The original embeddings.csv is not modified.
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import os

# — same base paths as before —
base    = "/content/drive/Shareddrives/ECS289L"
gen_csv = os.path.join(base, "embeddings/genetics_mlp/embeddings.csv")

# Load the CSV you just verified
df_gen = pd.read_csv(gen_csv)

# 'rows', rids_int, dim_c, dim_m and dim_g must still be in memory from the
# load-debug cell. Check explicitly so a fresh kernel gets an actionable
# message instead of a bare NameError part-way through the cell.
_missing = [
    name for name in ("rows", "rids_int", "dim_c", "dim_m", "dim_g")
    if name not in globals()
]
if _missing:
    raise NameError(
        f"Missing kernel state {_missing}: re-run the load-debug cell that "
        "builds 'rows' and the embedding dims before running this cell."
    )
if len(rows) != len(rids_int):
    raise ValueError(
        f"'rows' has {len(rows)} entries but 'rids_int' has {len(rids_int)}; "
        "re-run the load-debug cell so they are in sync."
    )

# Stack into X (one row per subject, columns laid out as [cog | mri | gen])
X = np.stack(rows)
gen_start = dim_c + dim_m
gen_stop = gen_start + dim_g
if X.ndim != 2 or X.shape[1] != gen_stop:
    raise ValueError(
        f"Stacked matrix has shape {X.shape}; expected {gen_stop} columns "
        f"(dim_c={dim_c}, dim_m={dim_m}, dim_g={dim_g})."
    )

# Run KNN imputer
imputer = KNNImputer(n_neighbors=5)
X_imp = imputer.fit_transform(X)
# By default KNNImputer drops columns that are entirely NaN at fit time, which
# would shift every later column and make the genetics slice below wrong.
if X_imp.shape != X.shape:
    raise ValueError(
        f"Imputer returned shape {X_imp.shape} for input {X.shape}; at least "
        "one column had no observed value, so column offsets are unreliable."
    )

# Save the newly imputed genetics embeddings
output_dir = os.path.dirname(gen_csv)
# Set lookup: O(1) per subject, and the same membership test (RID listed in
# the genetics CSV) that decided which rows were NaN-filled when 'rows' was built.
known_gen_rids = set(df_gen.RID)
new_entries = []
for i, rid in enumerate(rids_int):
    if rid not in known_gen_rids:
        gen_imp = X_imp[i, gen_start:gen_stop]
        out_path = os.path.join(output_dir, f"{rid}_imputed.npy")
        np.save(out_path, gen_imp)
        # NOTE(review): out_path is absolute. os.path.join(base, path) in the
        # loading cell accepts absolute paths, but confirm whether the existing
        # CSV rows are base-relative and whether mixing the two is acceptable.
        new_entries.append({"RID": rid, "path": out_path})

# Merge and write updated CSV
if new_entries:
    df_imputed = pd.concat([df_gen, pd.DataFrame(new_entries)], ignore_index=True)
else:
    # Nothing was imputed; avoid concatenating an empty, column-less frame.
    df_imputed = df_gen.reset_index(drop=True)
out_csv = os.path.join(output_dir, "embeddings_imputed.csv")
df_imputed.to_csv(out_csv, index=False)

print(f"✔️ KNN imputation done. New genetics CSV at:\n  {out_csv}")
print(f"Added {len(new_entries)} imputed rows, total now {len(df_imputed)}.")


✔️ KNN imputation done. New genetics CSV at:
  /content/drive/Shareddrives/ECS289L/embeddings/genetics_mlp/embeddings_imputed.csv
Added 138 imputed rows, total now 620.
