In [20]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [21]:
data = np.load(f"./data/abide.npy",allow_pickle=True).item()
print("Number of subjects: ", len(data['label']))
data.keys()

Number of subjects:  1009


dict_keys(['timeseires', 'label', 'corr', 'pcorr', 'site', 'id'])

In [22]:
conn = data["corr"]
print(f"Connectome (X) shape: {conn.shape}") # n_ROIs, n_ROIs, n_subjects

Connectome (X) shape: (1009, 116, 116)


In [23]:
scores = data["label"]
print(f"ASD scores (y) shape: {scores.shape}") # n_subjects

ASD scores (y) shape: (1009,)


In [24]:
subject_ids = [int(i) for i in data["id"]]
print(f"Subject IDs shape: {len(subject_ids)}") # n_subjects

Subject IDs shape: 1009


In [25]:
class Autoencoder(nn.Module):
    def __init__(self, latent_dim=3):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(13456, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 12),
            nn.ReLU(),
            nn.Linear(12, latent_dim)  # Compressed representation
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 12),
            nn.ReLU(),
            nn.Linear(12, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 13456),
            nn.Sigmoid()  # Output values between 0 and 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


In [26]:
# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(conn, scores, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (807, 116, 116)
X_test shape: (202, 116, 116)
y_train shape: (807,)
y_test shape: (202,)


In [27]:
model = Autoencoder().float()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [28]:
batch_size = 8
# change the length of X_train to match batch_size
X_train = X_train[:len(X_train) - (len(X_train) % batch_size)]
print(f"X_train shape: {X_train.shape}")
train_loader = DataLoader(X_train, batch_size=1, shuffle=True)

X_train shape: (800, 116, 116)


In [29]:

num_epochs = 10
for epoch in range(num_epochs):
    for d in train_loader:
        matrix = d.float()
        # reshape
        matrix = matrix.view(matrix.size(0), -1)
        output = model(matrix)
        loss = criterion(output, matrix)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # Evaluate on test set
    test_matrix = torch.tensor(X_test).float()
    test_matrix = test_matrix.view(test_matrix.size(0), -1)
    test_output = model(test_matrix)
    test_loss = criterion(test_output, test_matrix)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Test loss: {test_loss.item():.4f}')



Epoch [1/10], Loss: 0.0336, Test loss: 0.0365
Epoch [2/10], Loss: 0.0485, Test loss: 0.0367
Epoch [3/10], Loss: 0.0329, Test loss: 0.0359
Epoch [4/10], Loss: 0.0374, Test loss: 0.0346
Epoch [5/10], Loss: 0.0291, Test loss: 0.0337
Epoch [6/10], Loss: 0.0312, Test loss: 0.0341
Epoch [7/10], Loss: 0.0471, Test loss: 0.0336
Epoch [8/10], Loss: 0.0326, Test loss: 0.0337
Epoch [9/10], Loss: 0.0330, Test loss: 0.0339
Epoch [10/10], Loss: 0.0347, Test loss: 0.0337


In [30]:
# Train on all data as it is an unsupervised task

# Initiliaze model
model = Autoencoder().float()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Determine batch size and adapt dataset
batch_size = 1
# change the length of X_train to match batch_size
conn = conn[:len(conn) - (len(conn) % batch_size)]
subject_ids = subject_ids[:len(subject_ids) - (len(subject_ids) % batch_size)]
print(f"Initial X shape: {conn.shape}")
print(f"Adapted X shape: {conn.shape}")
X_loader = DataLoader(conn, batch_size=batch_size, shuffle=True)

num_epochs = 10
for epoch in range(num_epochs):
    for d in X_loader:
        matrix = d.float()
        # reshape
        matrix = matrix.view(matrix.size(0), -1)
        output = model(matrix)
        loss = criterion(output, matrix)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')



Initial X shape: (1009, 116, 116)
Adapted X shape: (1009, 116, 116)
Epoch [1/10], Loss: 0.1061
Epoch [2/10], Loss: 0.0218
Epoch [3/10], Loss: 0.0385
Epoch [4/10], Loss: 0.0457
Epoch [5/10], Loss: 0.0311
Epoch [6/10], Loss: 0.0284
Epoch [7/10], Loss: 0.0404
Epoch [8/10], Loss: 0.0262
Epoch [9/10], Loss: 0.0274
Epoch [10/10], Loss: 0.0336


In [31]:
# Get predictions for all data (train and test combined)
all_data = torch.tensor(conn).float()
all_data = all_data.view(all_data.size(0), -1)
all_output = model(all_data)
all_loss = criterion(all_output, all_data)
print(f'Loss: {all_loss.item():.4f}')

Loss: 0.0341


In [32]:
# Get the compressed representation
compressed = model.encoder(all_data)
compressed = compressed.detach().numpy()
print(f"Compressed representation shape: {compressed.shape}")

Compressed representation shape: (1009, 3)


In [33]:
# Load original csv file
PATH_CSV = "./data/ABIDE_tab.csv"

df = pd.read_csv(PATH_CSV, index_col=0)
df.head()

Unnamed: 0.1,Unnamed: 0,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,...,qc_notes_rater_1,qc_anat_rater_2,qc_anat_notes_rater_2,qc_func_rater_2,qc_func_notes_rater_2,qc_anat_rater_3,qc_anat_notes_rater_3,qc_func_rater_3,qc_func_notes_rater_3,SUB_IN_SMP
0,1,50002,1,50002,PITT,no_filename,1,1,16.77,1,...,,OK,,fail,ic-parietal-cerebellum,OK,,fail,ERROR #24,1
1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,1,...,,OK,,OK,,OK,,OK,,1
2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,1,...,,OK,,OK,,OK,,OK,,1
3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,2,...,,OK,,maybe,ic-parietal-cerebellum,OK,,OK,,0
4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,1,...,,OK,,maybe,ic-parietal slight,OK,,OK,,1


In [34]:
# Create a dictionary to hold sub_id and corresponding compressed values
print(f"Number of subjects: {len(subject_ids)}")
print(f"Number of compressed values: {len(compressed)}")
compressed_dict = {sub_id: compressed[i] for i, sub_id in enumerate(subject_ids)}

# Function to return the compressed value for a given sub_id
def get_compressed_value(row, subject_ids, compressed):
    if row['SUB_ID'] in subject_ids:
        index = subject_ids.index(row['SUB_ID'])
        return compressed[index]
    return np.nan  # or return the default value you want in case of no match

# Apply this function to each row in the DataFrame
df['compressed'] = df.apply(get_compressed_value, axis=1, args=(subject_ids, compressed))

# Save df to csv
df.to_csv("./data/ABIDE_tab_compressed.csv")

Number of subjects: 1009
Number of compressed values: 1009


In [35]:
print(f"Number of subjects: {len(df)}")
print(f"Number of compressed values: {len(compressed)}")
print(f"Number of subjects with no MRI: {df['compressed'].isna().sum()}")

Number of subjects: 1112
Number of compressed values: 1009
Number of subjects with no MRI: 103


## Reproduce the same pipeline for different size of latent space

In [36]:
latent_sizes = [2, 3, 4, 6, 8, 10]
for latent_size in latent_sizes:
    print(f"Latent size: {latent_size}")
    # Initiliaze model
    model = Autoencoder(latent_dim=latent_size).float()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    
    X_loader = DataLoader(conn, batch_size=1, shuffle=True)

    num_epochs = 10
    for epoch in range(num_epochs):
        for d in X_loader:
            matrix = d.float()
            matrix = matrix.view(matrix.size(0), -1)
            output = model(matrix)
            loss = criterion(output, matrix)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Get predictions for all data (train and test combined)
    all_data = torch.tensor(conn).float()
    all_data = all_data.view(all_data.size(0), -1)
    all_output = model(all_data)
    all_loss = criterion(all_output, all_data)
    print(f'Loss: {all_loss.item():.4f}')


    # Get the compressed representation
    compressed = model.encoder(all_data)
    compressed = compressed.detach().numpy()
    print(f"Compressed representation shape: {compressed.shape}")

    # Create a dictionary to hold sub_id and corresponding compressed values
    compressed_dict = {sub_id: list(compressed[i]) for i, sub_id in enumerate(subject_ids)}

    # Apply this function to each row in the DataFrame
    df[f"compressed_{latent_size}"] = df.apply(get_compressed_value, axis=1, args=(subject_ids, compressed))


Latent size: 2
Epoch [1/10], Loss: 0.0285
Epoch [2/10], Loss: 0.0351
Epoch [3/10], Loss: 0.0491
Epoch [4/10], Loss: 0.0356
Epoch [5/10], Loss: 0.0309
Epoch [6/10], Loss: 0.0396
Epoch [7/10], Loss: 0.0276
Epoch [8/10], Loss: 0.0386
Epoch [9/10], Loss: 0.0307
Epoch [10/10], Loss: 0.0393
Loss: 0.0332
Compressed representation shape: (1009, 2)
Latent size: 3
Epoch [1/10], Loss: 0.0242
Epoch [2/10], Loss: 0.0266
Epoch [3/10], Loss: 0.0308
Epoch [4/10], Loss: 0.0254
Epoch [5/10], Loss: 0.0359
Epoch [6/10], Loss: 0.0276
Epoch [7/10], Loss: 0.0322
Epoch [8/10], Loss: 0.0357
Epoch [9/10], Loss: 0.0478
Epoch [10/10], Loss: 0.0263
Loss: 0.0325
Compressed representation shape: (1009, 3)
Latent size: 4
Epoch [1/10], Loss: 0.0717
Epoch [2/10], Loss: 0.0216
Epoch [3/10], Loss: 0.0407
Epoch [4/10], Loss: 0.0274
Epoch [5/10], Loss: 0.0439
Epoch [6/10], Loss: 0.0371
Epoch [7/10], Loss: 0.0345
Epoch [8/10], Loss: 0.0293
Epoch [9/10], Loss: 0.0262
Epoch [10/10], Loss: 0.0308
Loss: 0.0347
Compressed repres

In [52]:
df[f"compressed_{latent_size}"][0]

nan

In [54]:
# Decompose each element of compressed_2, compressed_3, compressed_4, compressed_6, compressed_8, compressed_10
# into separate columns named for example compressed_2_1, compressed_2_2, compressed_2_3 for 2
# and compressed_3_1, compressed_3_2, compressed_3_3 for 3

for latent_size in latent_sizes:
    for i in range(latent_size):
        print(f"Latent size: {latent_size}, component: {i+1}")
        # if nan skip 
        df[f"compressed_{latent_size}_{i+1}"] = df[f"compressed_{latent_size}"].apply(lambda x: x[i] if not np.isnan(x).any() else np.nan)

Latent size: 2, component: 1
Latent size: 2, component: 2
Latent size: 3, component: 1
Latent size: 3, component: 2
Latent size: 3, component: 3
Latent size: 4, component: 1
Latent size: 4, component: 2
Latent size: 4, component: 3
Latent size: 4, component: 4
Latent size: 6, component: 1
Latent size: 6, component: 2
Latent size: 6, component: 3
Latent size: 6, component: 4
Latent size: 6, component: 5
Latent size: 6, component: 6
Latent size: 8, component: 1
Latent size: 8, component: 2
Latent size: 8, component: 3
Latent size: 8, component: 4
Latent size: 8, component: 5
Latent size: 8, component: 6
Latent size: 8, component: 7
Latent size: 8, component: 8
Latent size: 10, component: 1
Latent size: 10, component: 2
Latent size: 10, component: 3
Latent size: 10, component: 4
Latent size: 10, component: 5
Latent size: 10, component: 6
Latent size: 10, component: 7
Latent size: 10, component: 8
Latent size: 10, component: 9
Latent size: 10, component: 10


In [55]:
df

Unnamed: 0.1,Unnamed: 0,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,...,compressed_10_1,compressed_10_2,compressed_10_3,compressed_10_4,compressed_10_5,compressed_10_6,compressed_10_7,compressed_10_8,compressed_10_9,compressed_10_10
0,1,50002,1,50002,PITT,no_filename,1,1,16.77,1,...,,,,,,,,,,
1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,1,...,0.909656,0.481762,0.884295,-0.988569,-0.472216,1.179701,-4.691446,1.285244,0.304127,-1.236132
2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,1,...,0.490374,0.429602,0.585015,-0.724735,-0.285081,0.603682,-3.106297,1.142545,0.212014,-0.834060
3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,2,...,-0.246707,0.242712,-0.032542,-2.295534,0.225910,0.307312,-2.314953,3.458993,-0.867373,-0.172947
4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,1,...,0.184293,0.258914,0.252984,-0.554248,-0.206663,0.108282,-1.661348,0.986131,0.053556,-0.439587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1107,1108,51583,1108,51583,SBL,SBL_0051583,1,2,35.00,1,...,0.980515,0.530008,0.981649,-1.146998,-0.465993,1.348539,-5.197294,1.499649,0.304935,-1.349291
1108,1109,51584,1109,51584,SBL,SBL_0051584,1,2,49.00,1,...,1.590604,0.400444,1.424278,-1.344318,-0.756654,2.030421,-7.098702,1.600666,0.510455,-1.873910
1109,1110,51585,1110,51585,SBL,SBL_0051585,1,1,27.00,1,...,0.702105,-0.133351,0.431572,0.204039,-0.664004,0.166147,-1.663145,-0.237840,0.470579,-0.636842
1110,1111,51606,1111,51606,MAX_MUN,MaxMun_a_0051606,1,2,29.00,2,...,0.087650,0.148457,0.017554,-0.129910,-0.277422,-0.260822,-0.504564,0.294545,0.094053,-0.194899


In [38]:
# Check the DataFrame
df.head()

Unnamed: 0.1,Unnamed: 0,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,...,qc_func_rater_3,qc_func_notes_rater_3,SUB_IN_SMP,compressed,compressed_2,compressed_3,compressed_4,compressed_6,compressed_8,compressed_10
0,1,50002,1,50002,PITT,no_filename,1,1,16.77,1,...,fail,ERROR #24,1,,,,,,,
1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,1,...,OK,,1,"[5.5358014, -1.9740214, 3.8024192]","[-4.1329317, 1.915973]","[2.3084972, -2.857194, -4.6895466]","[-0.33113953, 0.81164557, 0.37227333, 0.96400756]","[-2.2387748, -3.5640666, 0.09702098, -2.160834...","[-2.2480175, 1.6303325, 0.5056043, -0.6117266,...","[0.9096564, 0.48176152, 0.884295, -0.9885688, ..."
2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,1,...,OK,,1,"[6.946011, -3.0977376, 0.5026052]","[-2.3123832, 1.797447]","[1.9793308, -1.4719096, -2.9629197]","[-0.26281962, 3.867676, 1.8729587, 5.129569]","[-4.1193504, -1.12287, -1.0050129, -2.8992608,...","[-1.5968913, 0.7262498, 0.13404477, -0.2059115...","[0.49037445, 0.4296016, 0.5850151, -0.7247355,..."
3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,2,...,OK,,0,"[3.5172627, -2.2816546, -3.9345512]","[-0.65785336, -0.8069845]","[4.2407064, 0.033206224, -3.1740694]","[2.7871912, -2.1375813, 0.7781836, 2.761505]","[-3.411883, 3.6637545, 2.3138394, 1.0991508, -...","[-1.9579207, -2.4649396, -0.09315338, 0.743613...","[-0.24670707, 0.24271171, -0.03254228, -2.2955..."
4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,1,...,OK,,1,"[15.962503, -7.3864527, -3.2940526]","[0.6298959, 3.0843263]","[1.3287704, -0.59391, -1.7520998]","[1.8689116, 10.683505, 6.043604, 16.954182]","[-10.793749, 0.7768371, -1.8503393, -5.9783297...","[-1.0171833, -0.13308728, -0.34721193, 0.30007...","[0.18429323, 0.258914, 0.25298434, -0.55424833..."


In [56]:
# Save df to csv
df.to_csv("./data/ABIDE_tab_compressed.csv")