In [122]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [123]:
data = np.load(f"./data/abide.npy",allow_pickle=True).item()
print("Number of subjects: ", len(data['label']))
data.keys()

Number of subjects:  1009


dict_keys(['timeseires', 'label', 'corr', 'pcorr', 'site', 'id'])

In [124]:
conn = data["corr"]
print(f"Connectome (X) shape: {conn.shape}") # n_ROIs, n_ROIs, n_subjects

Connectome (X) shape: (1009, 116, 116)


In [125]:
scores = data["label"]
print(f"ASD scores (y) shape: {scores.shape}") # n_subjects

ASD scores (y) shape: (1009,)


In [126]:
subject_ids = [int(i) for i in data["id"]]
print(f"Subject IDs shape: {len(subject_ids)}") # n_subjects

Subject IDs shape: 1009


In [127]:
class Autoencoder(nn.Module):
    def __init__(self, latent_dim=3):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(13456, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 12),
            nn.ReLU(),
            nn.Linear(12, latent_dim)  # Compressed representation
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 12),
            nn.ReLU(),
            nn.Linear(12, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 13456),
            nn.Sigmoid()  # Output values between 0 and 1
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


In [93]:
# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(conn, scores, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (807, 116, 116)
X_test shape: (202, 116, 116)
y_train shape: (807,)
y_test shape: (202,)


In [94]:
model = Autoencoder().float()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [95]:
batch_size = 8
# change the length of X_train to match batch_size
X_train = X_train[:len(X_train) - (len(X_train) % batch_size)]
print(f"X_train shape: {X_train.shape}")
train_loader = DataLoader(X_train, batch_size=1, shuffle=True)

X_train shape: (800, 116, 116)


In [96]:

num_epochs = 10
for epoch in range(num_epochs):
    for d in train_loader:
        matrix = d.float()
        # reshape
        matrix = matrix.view(matrix.size(0), -1)
        output = model(matrix)
        loss = criterion(output, matrix)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # Evaluate on test set
    test_matrix = torch.tensor(X_test).float()
    test_matrix = test_matrix.view(test_matrix.size(0), -1)
    test_output = model(test_matrix)
    test_loss = criterion(test_output, test_matrix)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Test loss: {test_loss.item():.4f}')



Epoch [1/10], Loss: 0.0389, Test loss: 0.0368
Epoch [2/10], Loss: 0.0258, Test loss: 0.0360
Epoch [3/10], Loss: 0.0921, Test loss: 0.0354
Epoch [4/10], Loss: 0.0174, Test loss: 0.0348
Epoch [5/10], Loss: 0.0335, Test loss: 0.0339
Epoch [6/10], Loss: 0.0239, Test loss: 0.0339
Epoch [7/10], Loss: 0.0311, Test loss: 0.0342
Epoch [8/10], Loss: 0.0300, Test loss: 0.0338
Epoch [9/10], Loss: 0.0425, Test loss: 0.0337
Epoch [10/10], Loss: 0.0293, Test loss: 0.0337


In [97]:
# Train on all data as it is an unsupervised task

# Initiliaze model
model = Autoencoder().float()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Determine batch size and adapt dataset
batch_size = 1
# change the length of X_train to match batch_size
conn = conn[:len(conn) - (len(conn) % batch_size)]
subject_ids = subject_ids[:len(subject_ids) - (len(subject_ids) % batch_size)]
print(f"Initial X shape: {conn.shape}")
print(f"Adapted X shape: {conn.shape}")
X_loader = DataLoader(conn, batch_size=batch_size, shuffle=True)

num_epochs = 10
for epoch in range(num_epochs):
    for d in X_loader:
        matrix = d.float()
        # reshape
        matrix = matrix.view(matrix.size(0), -1)
        output = model(matrix)
        loss = criterion(output, matrix)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')



Initial X shape: (1009, 116, 116)
Adapted X shape: (1009, 116, 116)
Epoch [1/10], Loss: 0.0569
Epoch [2/10], Loss: 0.0383
Epoch [3/10], Loss: 0.0311
Epoch [4/10], Loss: 0.0594
Epoch [5/10], Loss: 0.0515
Epoch [6/10], Loss: 0.0371
Epoch [7/10], Loss: 0.0340
Epoch [8/10], Loss: 0.0293
Epoch [9/10], Loss: 0.0392
Epoch [10/10], Loss: 0.0422


In [101]:
# Get predictions for all data (train and test combined)
all_data = torch.tensor(conn).float()
all_data = all_data.view(all_data.size(0), -1)
all_output = model(all_data)
all_loss = criterion(all_output, all_data)
print(f'Loss: {all_loss.item():.4f}')

Loss: 0.0327


In [99]:
# Get the compressed representation
compressed = model.encoder(all_data)
compressed = compressed.detach().numpy()
print(f"Compressed representation shape: {compressed.shape}")

Compressed representation shape: (1009, 3)


array([[-2.259695  ,  2.2921166 ,  1.9402783 ],
       [-0.6182181 ,  2.4837327 ,  2.222906  ],
       [-0.5641706 ,  0.11749941,  0.26383722],
       ...,
       [-1.2959974 ,  2.0995135 ,  1.8607417 ],
       [-2.4122214 ,  2.0182457 ,  1.697311  ],
       [-1.9440824 ,  2.0219364 ,  1.7408917 ]], dtype=float32)

In [129]:
# Load original csv file
PATH_CSV = "./data/ABIDE_tab.csv"

df = pd.read_csv(PATH_CSV, index_col=0)
df.head()

Unnamed: 0.1,Unnamed: 0,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,...,qc_notes_rater_1,qc_anat_rater_2,qc_anat_notes_rater_2,qc_func_rater_2,qc_func_notes_rater_2,qc_anat_rater_3,qc_anat_notes_rater_3,qc_func_rater_3,qc_func_notes_rater_3,SUB_IN_SMP
0,1,50002,1,50002,PITT,no_filename,1,1,16.77,1,...,,OK,,fail,ic-parietal-cerebellum,OK,,fail,ERROR #24,1
1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,1,...,,OK,,OK,,OK,,OK,,1
2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,1,...,,OK,,OK,,OK,,OK,,1
3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,2,...,,OK,,maybe,ic-parietal-cerebellum,OK,,OK,,0
4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,1,...,,OK,,maybe,ic-parietal slight,OK,,OK,,1


In [121]:
# Create a dictionary to hold sub_id and corresponding compressed values
print(f"Number of subjects: {len(subject_ids)}")
print(f"Number of compressed values: {len(compressed)}")
compressed_dict = {sub_id: compressed[i] for i, sub_id in enumerate(subject_ids)}

# Function to return the compressed value for a given sub_id
def get_compressed_value(row, subject_ids, compressed):
    if row['SUB_ID'] in subject_ids:
        index = subject_ids.index(row['SUB_ID'])
        return compressed[index]
    return np.nan  # or return the default value you want in case of no match

# Apply this function to each row in the DataFrame
df['compressed'] = df.apply(get_compressed_value, axis=1, args=(subject_ids, compressed))

# Save df to csv
df.to_csv("./data/ABIDE_tab_compressed.csv")

Number of subjects: 1009
Number of compressed values: 1009


In [120]:
print(f"Number of subjects: {len(df)}")
print(f"Number of compressed values: {len(compressed)}")
print(f"Number of subjects with no MRI: {df['compressed'].isna().sum()}")

Number of subjects: 1112
Number of compressed values: 1009
Number of subjects with no MRI: 103


## Reproduce the same pipeline for different size of latent space

In [132]:
latent_sizes = [2, 3, 4, 6, 8, 10]
for latent_size in latent_sizes:
    print(f"Latent size: {latent_size}")
    # Initiliaze model
    model = Autoencoder(latent_dim=latent_size).float()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    
    X_loader = DataLoader(conn, batch_size=1, shuffle=True)

    num_epochs = 10
    for epoch in range(num_epochs):
        for d in X_loader:
            matrix = d.float()
            matrix = matrix.view(matrix.size(0), -1)
            output = model(matrix)
            loss = criterion(output, matrix)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Get predictions for all data (train and test combined)
    all_data = torch.tensor(conn).float()
    all_data = all_data.view(all_data.size(0), -1)
    all_output = model(all_data)
    all_loss = criterion(all_output, all_data)
    print(f'Loss: {all_loss.item():.4f}')


    # Get the compressed representation
    compressed = model.encoder(all_data)
    compressed = compressed.detach().numpy()
    print(f"Compressed representation shape: {compressed.shape}")

    # Create a dictionary to hold sub_id and corresponding compressed values
    compressed_dict = {sub_id: compressed[i] for i, sub_id in enumerate(subject_ids)}

    # Apply this function to each row in the DataFrame
    df[f"compressed_{latent_size}"] = df.apply(get_compressed_value, axis=1, args=(subject_ids, compressed))


Latent size: 2
Epoch [1/10], Loss: 0.0505
Epoch [2/10], Loss: 0.0501
Epoch [3/10], Loss: 0.0598
Epoch [4/10], Loss: 0.0273
Epoch [5/10], Loss: 0.0504
Epoch [6/10], Loss: 0.0227
Epoch [7/10], Loss: 0.0298
Epoch [8/10], Loss: 0.0281
Epoch [9/10], Loss: 0.0469
Epoch [10/10], Loss: 0.0395
Loss: 0.0326
Compressed representation shape: (1009, 2)
Latent size: 3
Epoch [1/10], Loss: 0.0436
Epoch [2/10], Loss: 0.0407
Epoch [3/10], Loss: 0.0305
Epoch [4/10], Loss: 0.0244
Epoch [5/10], Loss: 0.0262
Epoch [6/10], Loss: 0.0359
Epoch [7/10], Loss: 0.0272
Epoch [8/10], Loss: 0.0309
Epoch [9/10], Loss: 0.0493
Epoch [10/10], Loss: 0.0385
Loss: 0.0328
Compressed representation shape: (1009, 3)
Latent size: 4
Epoch [1/10], Loss: 0.0253
Epoch [2/10], Loss: 0.0272
Epoch [3/10], Loss: 0.0436
Epoch [4/10], Loss: 0.0306
Epoch [5/10], Loss: 0.0298
Epoch [6/10], Loss: 0.0498
Epoch [7/10], Loss: 0.0327
Epoch [8/10], Loss: 0.0322
Epoch [9/10], Loss: 0.0324
Epoch [10/10], Loss: 0.0299
Loss: 0.0326
Compressed repres

In [133]:
# Check the DataFrame
df.head()

Unnamed: 0.1,Unnamed: 0,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,DSM_IV_TR,AGE_AT_SCAN,SEX,...,qc_anat_notes_rater_3,qc_func_rater_3,qc_func_notes_rater_3,SUB_IN_SMP,compressed_2,compressed_3,compressed_4,compressed_6,compressed_8,compressed_10
0,1,50002,1,50002,PITT,no_filename,1,1,16.77,1,...,,fail,ERROR #24,1,,,,,,
1,2,50003,2,50003,PITT,Pitt_0050003,1,1,24.45,1,...,,OK,,1,"[0.44941542, -2.9575777]","[1.1815448, -4.634217, 3.114023]","[3.8482342, -0.3381914, 1.3678892, -2.8299541]","[0.47832513, -0.3344283, -2.775017, -0.6487376...","[-3.2448077, 0.20148331, 0.729448, 1.8987474, ...","[-2.5745575, -0.6420369, -1.4933676, 1.4467326..."
2,3,50004,3,50004,PITT,Pitt_0050004,1,1,19.09,1,...,,OK,,1,"[0.4390367, -1.8658609]","[0.39401212, -2.9582384, 2.079665]","[2.6737583, 0.08604087, 0.8983698, -1.8406199]","[-0.39788154, 0.9797256, -0.345452, -0.6684227...","[-1.6996136, 0.58251894, 1.1431954, 1.3389084,...","[-1.0550424, -0.4334481, -1.2319748, 0.5235287..."
3,4,50005,4,50005,PITT,Pitt_0050005,1,1,13.73,2,...,,OK,,0,"[1.5495021, -1.2372564]","[-2.4866755, -2.2959528, 1.7506592]","[3.2064295, 1.6908103, 1.285104, -1.3762128]","[-0.5110897, -1.0571824, -2.980431, -3.9370008...","[-2.2855935, 0.7342312, 3.1883984, 2.0930343, ...","[2.9089928, -1.321404, -3.329828, -1.644749, 0..."
4,5,50006,5,50006,PITT,Pitt_0050006,1,1,13.37,1,...,,OK,,1,"[0.50030065, -0.7813915]","[-0.46178344, -1.350864, 1.0396112]","[1.7077237, 0.42651868, 0.51136065, -1.0311688]","[-0.29313406, 0.8351699, -0.017895207, -0.4916...","[-0.57433647, 0.7514796, 1.2304022, 0.8044821,...","[0.34323144, -0.16988352, -0.8524246, -0.33802..."


In [134]:
# Save df to csv
df.to_csv("./data/ABIDE_tab_compressed.csv")