In [2]:
import os
import math

import umap
import matplotlib.pyplot as plt
import matplotlib.colors as mcol
import seaborn as sns
import numpy as np
import pandas as pd
from tqdm import tqdm
# from sklearn.neural_network import MLPClassifier

import torch
import torch.nn as nn

from datasets import load_from_disk, concatenate_datasets
from brainlm_mae.modeling_brainlm import BrainLMForPretraining

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
if not os.path.exists("inference_plots"):
    os.mkdir("inference_plots")

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## Load Entire Dataset

In [5]:
train_ds = load_from_disk("/home/sr2464/palmer_scratch/datasets/UKB_Large_rsfMRI_and_tffMRI_Arrow_WithRegression_v3_with_metadata/train_ukbiobank")
print(train_ds)
val_ds = load_from_disk("/home/sr2464/palmer_scratch/datasets/UKB_Large_rsfMRI_and_tffMRI_Arrow_WithRegression_v3_with_metadata/val_ukbiobank")
print(val_ds)
test_ds = load_from_disk("/home/sr2464/palmer_scratch/datasets/UKB_Large_rsfMRI_and_tffMRI_Arrow_WithRegression_v3_with_metadata/test_ukbiobank")
print(test_ds)
coords_ds = load_from_disk("/home/sr2464/palmer_scratch/datasets/UKBioBank1000_Arrow_v4/Brain_Region_Coordinates")
print(coords_ds)

Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity'],
    num_rows: 61038
})
Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity'],
    num_rows: 7629
})
Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity'],
    num_rows: 7628
})
Dataset({
    features: ['Index', 'X', 'Y', 'Z'],
    num_rows: 424
})


In [6]:
concat_ds = concatenate_datasets([train_ds, val_ds, test_ds])
concat_ds

Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity'],
    num_rows: 76295
})

In [7]:
example0 = concat_ds[10]
print(example0['Filename'])
print(example0['Patient ID'])
print(example0['Order'])
print(example0['eid'])
print(example0['Gender'])
print(example0['Age.At.MHQ'])
print(example0['Depressed.At.Baseline'])
print(example0['Neuroticism'])
print(example0['Self.Harm.Ever'])
print(example0['Not.Worth.Living'])
print(example0['PCL.Score'])
print(example0['GAD7.Severity'])

1191089.dat_tf
_tf
18416.0
1191089.0
nan
nan
nan
nan
nan
nan
nan
nan


## Define MLP with WeightedCrossEntropy

In [8]:
import random
from typing import List

import torch.nn.functional as F

In [9]:
class MLP(nn.Module):
    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        """
        Arguments:
            random_state: random seed
            max_iter: maximum number of epochs
            input_size: input size
            hidden_size: size for hidden layers in MLP
            output_size: output size
            
        Assumed parameters:
            activation: 'relu'
            solver: 'adam'
            alpha: weight decay of 1e-4
            batch_size: min(200, n_samples)
            learning_rate: 0.001
            tolerance: 1e-4, stops if loss doesn't improve by 1e-4 for n_iter_no_change iterations
            n_iter_no_change: 10
        """
        super().__init__()
        self.lin1 = nn.Linear(input_size, hidden_size)
        self.act1 = nn.ReLU()
        self.lin2 = nn.Linear(hidden_size, hidden_size)
        self.act2 = nn.ReLU()
        self.lin3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.lin1(x)
        x = self.act1(x)
        x = self.lin2(x)
        x = self.act2(x)
        x = self.lin3(x)
        return x

    
class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        self.X = X  # numpy array shape [num_examples, num_features]
        self.y = torch.tensor(y, dtype=torch.float32)  # list length [num_examples]
    
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, idx):
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label


class EarlyStopper:
    def __init__(self, patience=10, delta=1e-5):
        self.patience = patience
        self.delta = delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf

    def __call__(self, loss_value):
        score = -loss_value

        if self.best_score is None:  # no prior best loss, set
            self.best_score = score
        elif score < self.best_score + self.delta:  # hasn't improved enough yet
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:  # improved enough, reset counter and score
            self.best_score = score
            self.counter = 0
    
        
class MLPClassifier(nn.Module):
    def __init__(self, 
                 random_state: int, 
                 max_iter: int, 
                 input_size: int,
                 hidden_size: int, 
                 output_size: int,
                ):
        """
        Arguments:
            random_state: random seed
            max_iter: maximum number of epochs
            input_size: input size
            hidden_size: size for hidden layers in MLP
            output_size: output size
            
        Assumed parameters:
            activation: 'relu'
            solver: 'adam'
            alpha: weight decay of 1e-4
            batch_size: min(200, n_samples)
            learning_rate: 0.001
            tolerance: 1e-5, stops if loss doesn't improve by 1e-4 for n_iter_no_change iterations
            n_iter_no_change: 40
        """
        super().__init__()
        self.max_iter = max_iter
        
        # Set random seed
        torch.manual_seed(random_state)
        random.seed(random_state)
        np.random.seed(random_state)
        
        self.mlp = MLP(input_size, hidden_size, output_size)
    
    def fit(self, train_X, train_y, pos_weight, learning_rate=0.001):
        """
        Arguments:
            - train_X: numpy array of shape [num_examples, num_features]
            - train_y: numpy array of shape [num_examples]
            - pos_weight: numpy array of positive sample weights, shape [num_classes],
                weight should be = num_negative_instances / num_positive_instances of that class
        """
        batch_size = min(train_X.shape[0], 250)
        dataset = SimpleDataset(train_X, train_y)
        train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=1)
        
        optimizer = torch.optim.Adam(self.mlp.parameters(), lr=learning_rate, weight_decay=1e-4)
        criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
        early_stopper = EarlyStopper(patience=30, delta=1e-5)
        # criterion should be called like: criterion(output, target), 
        #  where both output and target are shape [num_samples, num_classes=1]
        
        for epoch_idx in range(self.max_iter):
            train_loss_total = 0.
            num_ex = 0
            for sample, labels in train_loader:
                #print("sample.shape:", sample.shape)  # [batch_size, num_features]
                #print("label.shape:", label.shape)  # [batch_size]
                self.mlp.train()
                optimizer.zero_grad()
                
                outputs = self.mlp(sample)
                labels = labels.unsqueeze(-1)
                #print("outputs.shape:", outputs.shape)
                #print("label.shape:", label.shape)
                #print("outputs[:5]:", outputs[:5])
                #print("label[:5]:", label[:5])
                
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                
                train_loss_total += loss.item()
                num_ex += sample.shape[0]
            
            train_loss_avg = train_loss_total / num_ex
            print(f"Iteration {epoch_idx + 1}, loss = {train_loss_avg:.8f}")
            early_stopper(train_loss_avg)
            if early_stopper.early_stop:
                print("Training loss did not improve more than tol=0.00001 for 30 consecutive epochs. Stopping.")
                break
    
    def score(self, test_X, test_y):
        self.mlp.eval()
        test_X = torch.tensor(test_X, dtype=torch.float32)  # shape [num_samples, num_features]
        test_y = torch.tensor(test_y, dtype=torch.int64).squeeze()  # shape [num_samples]
        #print("test_X.shape:", test_X.shape)
        #print("test_y.shape:", test_y.shape)
        
        preds = self.mlp(test_X)
        preds = preds.squeeze().detach()    # shape [num_samples]
        #print("preds.shape:", preds.shape)
        #print("preds[:5]:", preds[:5])
        #print("test_y[:5]:", test_y[:5])
        preds = F.sigmoid(preds)
        preds = preds > 0.5
        preds = preds.long()
        #print("preds[:5]:", preds[:5])
        accuracy = (preds == test_y).sum() / test_X.shape[0]
        return accuracy.item()

## Reload PCA components of CLS tokens and raw data

In [10]:
# Best BrainLM model so far: /home/mr2238/BrainLM/inference_plots/dataset_v3/2023-07-17-19_00_00_ckpt-500/
#  all_cls_200recordinglength.npy
#  pca_reduced_cls_tokens_200components.npy
# Raw recordings:
#  recordings normalized: in concat_ds, column 
#  PCA of raw recordings: /home/mr2238/BrainLM/inference_plots/dataset_v3/pca_reduced_raw_data_200length_200components.npy
all_cls_tokens = np.load("/home/mr2238/BrainLM/inference_plots/dataset_v3/2023-07-17-19_00_00_ckpt-500/all_cls_200recordinglength.npy")
all_cls_tokens.shape

(76295, 512)

In [12]:
cls_token_pca_components = np.load("/home/mr2238/BrainLM/inference_plots/dataset_v3/2023-07-17-19_00_00_ckpt-500/pca_reduced_cls_tokens_200components.npy")
cls_token_pca_components.shape

(76295, 200)

In [13]:
total_num_ex = cls_token_pca_components.shape[0]
cls_token_pca_components_list = [cls_token_pca_components[idx] for idx in range(total_num_ex)]
concat_ds = concat_ds.add_column(name="cls_token_pca_components", column=cls_token_pca_components_list)
concat_ds

Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity', 'cls_token_pca_components'],
    num_rows: 76295
})

In [14]:
# Add whole CLS token to ds as well
total_num_ex = all_cls_tokens.shape[0]
all_cls_tokens_list = [all_cls_tokens[idx] for idx in range(total_num_ex)]
concat_ds = concat_ds.add_column(name="whole_cls_token", column=all_cls_tokens_list)
concat_ds

Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity', 'cls_token_pca_components', 'whole_cls_token'],
    num_rows: 76295
})

In [16]:
# recording_col_name = "Subtract_Mean_Divide_Global_STD_Normalized_Recording"
# all_recordings = np.load("inference_plots/all_{}_490len.npy".format(recording_col_name))
# all_recordings.shape

In [17]:
recording_pca_components = np.load("/home/mr2238/BrainLM/inference_plots/dataset_v3/pca_reduced_raw_data_200length_200components.npy")
recording_pca_components.shape

(76295, 200)

In [18]:
recording_pca_components_list = [recording_pca_components[idx] for idx in range(total_num_ex)]
concat_ds = concat_ds.add_column(name="recording_pca_components", column=recording_pca_components_list)
concat_ds

Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity', 'cls_token_pca_components', 'whole_cls_token', 'recording_pca_components'],
    num_rows: 76295
})

In [19]:
np.array(concat_ds[0]["cls_token_pca_components"], dtype=np.float32).shape

(200,)

In [20]:
print(concat_ds[0]["cls_token_pca_components"][:5])
print(concat_ds[0]["recording_pca_components"][:5])

[-0.5865330696105957, -0.2947590947151184, 0.1324397772550583, -1.0456269979476929, 0.8647788166999817]
[-27.499038696289062, 10.373053550720215, -30.58742332458496, -25.046045303344727, 29.680246353149414]


In [21]:
print(concat_ds["Gender"][:10])
print(concat_ds["Age.At.MHQ"][:10])
print(concat_ds["PHQ9.Severity"][:10])
print(concat_ds["Depressed.At.Baseline"][:10])
print(concat_ds["Neuroticism"][:10])
print(concat_ds["Self.Harm.Ever"][:10])
print(concat_ds["Not.Worth.Living"][:10])
print(concat_ds["PCL.Score"][:10])
print(concat_ds["GAD7.Severity"][:10])

[nan, nan, nan, 0.0, 1.0, 0.0, nan, nan, 0.0, 0.0]
[nan, nan, nan, 52.0, 72.0, 75.0, nan, nan, 72.0, 63.0]
[nan, nan, nan, 3.0, 1.0, 0.0, nan, nan, 0.0, 10.0]
[nan, nan, nan, 0.0, 0.0, 1.0, nan, nan, 0.0, 1.0]
[nan, nan, nan, 5.0, nan, 1.0, nan, nan, 4.0, nan]
[nan, nan, nan, 0.0, 0.0, 0.0, nan, nan, 0.0, 0.0]
[nan, nan, nan, 0.0, 1.0, 0.0, nan, nan, 0.0, 0.0]
[nan, nan, nan, 7.0, 11.0, 7.0, nan, nan, 4.0, 14.0]
[nan, nan, nan, 2.0, 3.0, 4.0, nan, nan, 0.0, 10.0]


In [22]:
print("Gender:", np.unique(np.array(concat_ds["Gender"])))
print("Age.At.MHQ:", np.unique(np.array(concat_ds["Age.At.MHQ"])))
print("PHQ9.Severity:", np.unique(np.array(concat_ds["PHQ9.Severity"])))
print("Depressed.At.Baseline:", np.unique(np.array(concat_ds["Depressed.At.Baseline"])))
print("Neuroticism:", np.unique(np.array(concat_ds["Neuroticism"])))
print("Self.Harm.Ever:", np.unique(np.array(concat_ds["Self.Harm.Ever"])))
print("Not.Worth.Living:", np.unique(np.array(concat_ds["Not.Worth.Living"])))
print("PCL.Score:", np.unique(np.array(concat_ds["PCL.Score"])))
print("GAD7.Severity:", np.unique(np.array(concat_ds["GAD7.Severity"])))

Gender: [ 0.  1. nan]
Age.At.MHQ: [47. 48. 49. 50. 51. 52. 53. 54. 55. 56. 57. 58. 59. 60. 61. 62. 63. 64.
 65. 66. 67. 68. 69. 70. 71. 72. 73. 74. 75. 76. 77. 78. 79. 80. nan]
PHQ9.Severity: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. nan]
Depressed.At.Baseline: [ 0.  1. nan]
Neuroticism: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. nan]
Self.Harm.Ever: [ 0.  1. nan]
Not.Worth.Living: [ 0.  1. nan]
PCL.Score: [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. nan]
GAD7.Severity: [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20. 21. nan]


In [23]:
np.unique(np.array(concat_ds["Not.Worth.Living"]), return_counts=True)

(array([ 0.,  1., nan]), array([25965, 11191, 39139]))

## Run MLP Classifier on Gender

In [24]:
variable_of_interest = "Gender"

In [25]:
# Select rows of concat_dataset where label is not nan
full_label_list = concat_ds[variable_of_interest]
non_nan_indices = [idx for idx in range(len(full_label_list)) if not math.isnan(full_label_list[idx])]
# print(non_nan_indices[:10])
# print(full_label_list[:10])
non_nan_ds = concat_ds.select(non_nan_indices)
non_nan_ds = non_nan_ds.shuffle(seed=42)
non_nan_ds

Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity', 'cls_token_pca_components', 'whole_cls_token', 'recording_pca_components'],
    num_rows: 37156
})

In [26]:
raw_data_pca_nonnan = np.array(non_nan_ds["recording_pca_components"], dtype=np.float32)
print(raw_data_pca_nonnan.shape)
cls_token_pca_nonnan = np.array(non_nan_ds["cls_token_pca_components"], dtype=np.float32)
print(cls_token_pca_nonnan.shape)
whole_cls_token = np.array(non_nan_ds["whole_cls_token"], dtype=np.float32)
print(whole_cls_token.shape)

labels = non_nan_ds[variable_of_interest]
labels = [int(num) for num in labels]
labels[:10]

(37156, 200)
(37156, 200)
(37156, 512)


[0, 0, 1, 0, 1, 0, 0, 0, 1, 1]

In [27]:
# Run on raw data
split_idx = int(raw_data_pca_nonnan.shape[0] * 0.8)
train_X_raw_data = raw_data_pca_nonnan[:split_idx]
test_X_raw_data = raw_data_pca_nonnan[split_idx:]
print(train_X_raw_data.shape)
print(test_X_raw_data.shape)
train_y_raw_data = labels[:split_idx]
test_y_raw_data = labels[split_idx:]

(29724, 200)
(7432, 200)


In [28]:
num_positive_examples = sum(train_y_raw_data)
num_neg_examples = len(train_y_raw_data) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

13608 positives
16116 negatives
29724 total
pos_weight: tensor(1.1843)


In [29]:
train_X_raw_data[0, :10]

array([-46.49019   ,  -6.9693494 , -44.529865  , -28.447311  ,
       -17.217026  ,   0.47021937,  14.024554  , -16.150738  ,
         8.224347  ,   0.1882209 ], dtype=float32)

In [30]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=200, hidden_size=100, output_size=1)
clf.fit(train_X_raw_data, train_y_raw_data, pos_weight, learning_rate=0.001)
round(clf.score(test_X_raw_data, test_y_raw_data) * 100, 3)

Iteration 1, loss = 0.00281133
Iteration 2, loss = 0.00252307
Iteration 3, loss = 0.00235583
Iteration 4, loss = 0.00218348
Iteration 5, loss = 0.00200794
Iteration 6, loss = 0.00182901
Iteration 7, loss = 0.00165237
Iteration 8, loss = 0.00147952
Iteration 9, loss = 0.00133505
Iteration 10, loss = 0.00116827
Iteration 11, loss = 0.00104912
Iteration 12, loss = 0.00091731
Iteration 13, loss = 0.00082304
Iteration 14, loss = 0.00071563
Iteration 15, loss = 0.00061727
Iteration 16, loss = 0.00054355
Iteration 17, loss = 0.00046012
Iteration 18, loss = 0.00041369
Iteration 19, loss = 0.00035027
Iteration 20, loss = 0.00031400
Iteration 21, loss = 0.00026681
Iteration 22, loss = 0.00022584
Iteration 23, loss = 0.00020806
Iteration 24, loss = 0.00017900
Iteration 25, loss = 0.00020111
Iteration 26, loss = 0.00021100
Iteration 27, loss = 0.00020453
Iteration 28, loss = 0.00019026
Iteration 29, loss = 0.00021633
Iteration 30, loss = 0.00026378
Iteration 31, loss = 0.00022573
Iteration 32, los



61.437

In [31]:
# Run on CLS tokens
split_idx = int(cls_token_pca_nonnan.shape[0] * 0.8)
train_X_cls_token = cls_token_pca_nonnan[:split_idx]
test_X_cls_token = cls_token_pca_nonnan[split_idx:]
print(train_X_cls_token.shape)
print(test_X_cls_token.shape)
train_y_cls_token = labels[:split_idx]
test_y_cls_token = labels[split_idx:]

(29724, 200)
(7432, 200)


In [32]:
num_positive_examples = sum(train_y_raw_data)
num_neg_examples = len(train_y_raw_data) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

13608 positives
16116 negatives
29724 total
pos_weight: tensor(1.1843)


In [33]:
train_X_cls_token[0, :10]

array([-4.44571   , -0.86119646, -0.49196237,  1.001361  , -0.5260579 ,
       -0.1556529 ,  0.7409144 , -0.53513235,  0.35432345,  0.27557766],
      dtype=float32)

In [34]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=200, hidden_size=100, output_size=1)
clf.fit(train_X_cls_token, train_y_cls_token, pos_weight, learning_rate=0.001)
round(clf.score(test_X_cls_token, test_y_cls_token) * 100, 3)

Iteration 1, loss = 0.00276116
Iteration 2, loss = 0.00231719
Iteration 3, loss = 0.00222312
Iteration 4, loss = 0.00217633
Iteration 5, loss = 0.00214246
Iteration 6, loss = 0.00211137
Iteration 7, loss = 0.00209136
Iteration 8, loss = 0.00205818
Iteration 9, loss = 0.00202903
Iteration 10, loss = 0.00200176
Iteration 11, loss = 0.00198166
Iteration 12, loss = 0.00193914
Iteration 13, loss = 0.00190948
Iteration 14, loss = 0.00188771
Iteration 15, loss = 0.00185817
Iteration 16, loss = 0.00182981
Iteration 17, loss = 0.00179558
Iteration 18, loss = 0.00176865
Iteration 19, loss = 0.00172774
Iteration 20, loss = 0.00169150
Iteration 21, loss = 0.00166674
Iteration 22, loss = 0.00162546
Iteration 23, loss = 0.00159935
Iteration 24, loss = 0.00156839
Iteration 25, loss = 0.00155482
Iteration 26, loss = 0.00150691
Iteration 27, loss = 0.00146857
Iteration 28, loss = 0.00143095
Iteration 29, loss = 0.00141270
Iteration 30, loss = 0.00138078
Iteration 31, loss = 0.00136054
Iteration 32, los

68.878

In [35]:
# Run on whole CLS tokens
split_idx = int(whole_cls_token.shape[0] * 0.8)
train_X_cls_token = whole_cls_token[:split_idx]
test_X_cls_token = whole_cls_token[split_idx:]
print(train_X_cls_token.shape)
print(test_X_cls_token.shape)
train_y_cls_token = labels[:split_idx]
test_y_cls_token = labels[split_idx:]

(29724, 512)
(7432, 512)


In [36]:
num_positive_examples = sum(train_y_cls_token)
num_neg_examples = len(train_y_cls_token) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

13608 positives
16116 negatives
29724 total
pos_weight: tensor(1.1843)


In [37]:
train_X_cls_token[0, :10]

array([-0.55948734, -0.8076543 ,  0.8432963 , -0.38548863,  1.3483922 ,
       -0.6400193 ,  1.1421213 ,  0.9573835 ,  1.5216708 ,  0.44831607],
      dtype=float32)

In [39]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=512, hidden_size=100, output_size=1)
clf.fit(train_X_cls_token, train_y_cls_token, pos_weight, learning_rate=0.001)
round(clf.score(test_X_cls_token, test_y_cls_token) * 100, 3)

Iteration 1, loss = 0.00300766
Iteration 2, loss = 0.00296709
Iteration 3, loss = 0.00288212
Iteration 4, loss = 0.00282354
Iteration 5, loss = 0.00276916
Iteration 6, loss = 0.00274751
Iteration 7, loss = 0.00271770
Iteration 8, loss = 0.00270260
Iteration 9, loss = 0.00266314
Iteration 10, loss = 0.00265825
Iteration 11, loss = 0.00267982
Iteration 12, loss = 0.00262590
Iteration 13, loss = 0.00263042
Iteration 14, loss = 0.00258108
Iteration 15, loss = 0.00255700
Iteration 16, loss = 0.00260076
Iteration 17, loss = 0.00254704
Iteration 18, loss = 0.00251148
Iteration 19, loss = 0.00255310
Iteration 20, loss = 0.00253636
Iteration 21, loss = 0.00251740
Iteration 22, loss = 0.00251200
Iteration 23, loss = 0.00247540
Iteration 24, loss = 0.00247821
Iteration 25, loss = 0.00248407
Iteration 26, loss = 0.00248504
Iteration 27, loss = 0.00245899
Iteration 28, loss = 0.00248206
Iteration 29, loss = 0.00245513
Iteration 30, loss = 0.00244979
Iteration 31, loss = 0.00244079
Iteration 32, los

74.919

## Run KNN Classifier on Depression_at_Baseline

In [40]:
variable_of_interest = "Depressed.At.Baseline"

In [41]:
# Select rows of concat_dataset where label is not nan
full_label_list = concat_ds[variable_of_interest]
non_nan_indices = [idx for idx in range(len(full_label_list)) if not math.isnan(full_label_list[idx])]
# print(non_nan_indices[:10])
# print(full_label_list[:10])
non_nan_ds = concat_ds.select(non_nan_indices)
non_nan_ds = non_nan_ds.shuffle(seed=42)
non_nan_ds

Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity', 'cls_token_pca_components', 'whole_cls_token', 'recording_pca_components'],
    num_rows: 36134
})

In [42]:
raw_data_pca_nonnan = np.array(non_nan_ds["recording_pca_components"], dtype=np.float32)
print(raw_data_pca_nonnan.shape)
cls_token_pca_nonnan = np.array(non_nan_ds["cls_token_pca_components"], dtype=np.float32)
print(cls_token_pca_nonnan.shape)
whole_cls_token = np.array(non_nan_ds["whole_cls_token"], dtype=np.float32)
print(whole_cls_token.shape)

labels = non_nan_ds[variable_of_interest]
labels = [int(num) for num in labels]
labels[:10]

(36134, 200)
(36134, 200)
(36134, 512)


[0, 0, 0, 0, 0, 1, 0, 0, 0, 0]

In [43]:
# Run on raw data
split_idx = int(raw_data_pca_nonnan.shape[0] * 0.8)
train_X_raw_data = raw_data_pca_nonnan[:split_idx]
test_X_raw_data = raw_data_pca_nonnan[split_idx:]
print(train_X_raw_data.shape)
print(test_X_raw_data.shape)
train_y_raw_data = labels[:split_idx]
test_y_raw_data = labels[split_idx:]

(28907, 200)
(7227, 200)


In [44]:
num_positive_examples = sum(train_y_raw_data)
num_neg_examples = len(train_y_raw_data) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

5609 positives
23298 negatives
28907 total
pos_weight: tensor(4.1537)


In [45]:
train_X_raw_data[0, :10]

array([-27.752443 ,  -1.9527817,  15.93868  , -16.917395 , -27.497398 ,
        27.616974 ,  -3.8510146,   6.2229595,   2.5564923,  -7.5470734],
      dtype=float32)

In [46]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=200, hidden_size=100, output_size=1)
clf.fit(train_X_raw_data, train_y_raw_data, pos_weight, learning_rate=0.001)
round(clf.score(test_X_raw_data, test_y_raw_data) * 100, 3)

Iteration 1, loss = 0.00457400
Iteration 2, loss = 0.00430167
Iteration 3, loss = 0.00406359
Iteration 4, loss = 0.00373123
Iteration 5, loss = 0.00336288
Iteration 6, loss = 0.00296351
Iteration 7, loss = 0.00255734
Iteration 8, loss = 0.00223335
Iteration 9, loss = 0.00189104
Iteration 10, loss = 0.00159181
Iteration 11, loss = 0.00134139
Iteration 12, loss = 0.00118634
Iteration 13, loss = 0.00102404
Iteration 14, loss = 0.00082248
Iteration 15, loss = 0.00070300
Iteration 16, loss = 0.00057093
Iteration 17, loss = 0.00047535
Iteration 18, loss = 0.00040806
Iteration 19, loss = 0.00033782
Iteration 20, loss = 0.00028395
Iteration 21, loss = 0.00022448
Iteration 22, loss = 0.00016848
Iteration 23, loss = 0.00012167
Iteration 24, loss = 0.00008389
Iteration 25, loss = 0.00006476
Iteration 26, loss = 0.00005377
Iteration 27, loss = 0.00004619
Iteration 28, loss = 0.00004036
Iteration 29, loss = 0.00003603
Iteration 30, loss = 0.00003228
Iteration 31, loss = 0.00002892
Iteration 32, los

69.033

In [47]:
# Run on CLS tokens
split_idx = int(cls_token_pca_nonnan.shape[0] * 0.8)
train_X_cls_token = cls_token_pca_nonnan[:split_idx]
test_X_cls_token = cls_token_pca_nonnan[split_idx:]
print(train_X_cls_token.shape)
print(test_X_cls_token.shape)
train_y_cls_token = labels[:split_idx]
test_y_cls_token = labels[split_idx:]

(28907, 200)
(7227, 200)


In [48]:
num_positive_examples = sum(train_y_cls_token)
num_neg_examples = len(train_y_cls_token) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

5609 positives
23298 negatives
28907 total
pos_weight: tensor(4.1537)


In [49]:
train_X_cls_token[0, :10]

array([-4.0550914 ,  0.10491114, -0.07129294,  0.28047138, -0.3171592 ,
        0.27682567, -0.6286486 ,  0.13380419, -0.3632379 ,  0.51952714],
      dtype=float32)

In [50]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=200, hidden_size=100, output_size=1)
clf.fit(train_X_cls_token, train_y_cls_token, pos_weight, learning_rate=0.001)
round(clf.score(test_X_cls_token, test_y_cls_token) * 100, 3)

Iteration 1, loss = 0.00448286
Iteration 2, loss = 0.00446480
Iteration 3, loss = 0.00444316
Iteration 4, loss = 0.00440066
Iteration 5, loss = 0.00436944
Iteration 6, loss = 0.00431785
Iteration 7, loss = 0.00424877
Iteration 8, loss = 0.00418452
Iteration 9, loss = 0.00408701
Iteration 10, loss = 0.00399363
Iteration 11, loss = 0.00387466
Iteration 12, loss = 0.00375999
Iteration 13, loss = 0.00364744
Iteration 14, loss = 0.00350559
Iteration 15, loss = 0.00340155
Iteration 16, loss = 0.00327563
Iteration 17, loss = 0.00313407
Iteration 18, loss = 0.00301977
Iteration 19, loss = 0.00289122
Iteration 20, loss = 0.00277593
Iteration 21, loss = 0.00265308
Iteration 22, loss = 0.00255368
Iteration 23, loss = 0.00242223
Iteration 24, loss = 0.00234402
Iteration 25, loss = 0.00221459
Iteration 26, loss = 0.00211026
Iteration 27, loss = 0.00204011
Iteration 28, loss = 0.00195577
Iteration 29, loss = 0.00186469
Iteration 30, loss = 0.00173524
Iteration 31, loss = 0.00168594
Iteration 32, los

66.791

In [51]:
# Run on whole CLS tokens
split_idx = int(whole_cls_token.shape[0] * 0.8)
train_X_cls_token = whole_cls_token[:split_idx]
test_X_cls_token = whole_cls_token[split_idx:]
print(train_X_cls_token.shape)
print(test_X_cls_token.shape)
train_y_cls_token = labels[:split_idx]
test_y_cls_token = labels[split_idx:]

(28907, 512)
(7227, 512)


In [52]:
num_positive_examples = sum(train_y_cls_token)
num_neg_examples = len(train_y_cls_token) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

5609 positives
23298 negatives
28907 total
pos_weight: tensor(4.1537)


In [53]:
train_X_cls_token[0, :10]

array([-0.50639653, -0.7314293 ,  0.93138945, -0.4126634 ,  1.5093479 ,
       -0.5733331 ,  1.307188  ,  0.87171364,  1.5900366 ,  0.3580955 ],
      dtype=float32)

In [54]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=512, hidden_size=100, output_size=1)
clf.fit(train_X_cls_token, train_y_cls_token, pos_weight, learning_rate=0.001)
round(clf.score(test_X_cls_token, test_y_cls_token) * 100, 3)

Iteration 1, loss = 0.00448808
Iteration 2, loss = 0.00448696
Iteration 3, loss = 0.00448430
Iteration 4, loss = 0.00448430
Iteration 5, loss = 0.00448512
Iteration 6, loss = 0.00448423
Iteration 7, loss = 0.00448334
Iteration 8, loss = 0.00448415
Iteration 9, loss = 0.00448302
Iteration 10, loss = 0.00448356
Iteration 11, loss = 0.00448348
Iteration 12, loss = 0.00448447
Iteration 13, loss = 0.00448331
Iteration 14, loss = 0.00448325
Iteration 15, loss = 0.00448424
Iteration 16, loss = 0.00448398
Iteration 17, loss = 0.00448394
Iteration 18, loss = 0.00448527
Iteration 19, loss = 0.00448394
Iteration 20, loss = 0.00448504
Iteration 21, loss = 0.00448280
Iteration 22, loss = 0.00448259
Iteration 23, loss = 0.00448487
Iteration 24, loss = 0.00448423
Iteration 25, loss = 0.00448536
Iteration 26, loss = 0.00448452
Iteration 27, loss = 0.00448417
Iteration 28, loss = 0.00448269
Iteration 29, loss = 0.00448469
Iteration 30, loss = 0.00448426
Iteration 31, loss = 0.00448261
Training loss did

79.217

## Run KNN on Self.Harm.Ever

In [55]:
variable_of_interest = "Self.Harm.Ever"

In [56]:
# Select rows of concat_dataset where label is not nan
full_label_list = concat_ds[variable_of_interest]
non_nan_indices = [idx for idx in range(len(full_label_list)) if not math.isnan(full_label_list[idx])]
# print(non_nan_indices[:10])
# print(full_label_list[:10])
non_nan_ds = concat_ds.select(non_nan_indices)
non_nan_ds = non_nan_ds.shuffle(seed=42)
non_nan_ds

Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity', 'cls_token_pca_components', 'whole_cls_token', 'recording_pca_components'],
    num_rows: 37048
})

In [57]:
raw_data_pca_nonnan = np.array(non_nan_ds["recording_pca_components"], dtype=np.float32)
print(raw_data_pca_nonnan.shape)
cls_token_pca_nonnan = np.array(non_nan_ds["cls_token_pca_components"], dtype=np.float32)
print(cls_token_pca_nonnan.shape)
whole_cls_token = np.array(non_nan_ds["whole_cls_token"], dtype=np.float32)
print(whole_cls_token.shape)

labels = non_nan_ds[variable_of_interest]
labels = [int(num) for num in labels]
labels[:10]

(37048, 200)
(37048, 200)
(37048, 512)


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [58]:
# Run on CLS tokens
split_idx = int(cls_token_pca_nonnan.shape[0] * 0.8)
train_X_cls_token = cls_token_pca_nonnan[:split_idx]
test_X_cls_token = cls_token_pca_nonnan[split_idx:]
print(train_X_cls_token.shape)
print(test_X_cls_token.shape)
train_y_cls_token = labels[:split_idx]
test_y_cls_token = labels[split_idx:]

(29638, 200)
(7410, 200)


In [59]:
num_positive_examples = sum(train_y_cls_token)
num_neg_examples = len(train_y_cls_token) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

1266 positives
28372 negatives
28907 total
pos_weight: tensor(22.4107)


In [60]:
train_X_cls_token[0, :10]

array([-2.729769  ,  0.98524106, -0.39734557, -0.6080432 ,  0.18143505,
       -0.36860886,  0.12589368, -0.11459521, -0.63993025, -0.58845013],
      dtype=float32)

In [61]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=200, hidden_size=100, output_size=1)
clf.fit(train_X_cls_token, train_y_cls_token, pos_weight, learning_rate=0.001)
round(clf.score(test_X_cls_token, test_y_cls_token) * 100, 3)

Iteration 1, loss = 0.00531949
Iteration 2, loss = 0.00525923
Iteration 3, loss = 0.00517280
Iteration 4, loss = 0.00508361
Iteration 5, loss = 0.00495632
Iteration 6, loss = 0.00479038
Iteration 7, loss = 0.00458784
Iteration 8, loss = 0.00434005
Iteration 9, loss = 0.00404872
Iteration 10, loss = 0.00372824
Iteration 11, loss = 0.00330336
Iteration 12, loss = 0.00295873
Iteration 13, loss = 0.00259296
Iteration 14, loss = 0.00228787
Iteration 15, loss = 0.00202829
Iteration 16, loss = 0.00172923
Iteration 17, loss = 0.00153153
Iteration 18, loss = 0.00136040
Iteration 19, loss = 0.00120677
Iteration 20, loss = 0.00118128
Iteration 21, loss = 0.00097176
Iteration 22, loss = 0.00092645
Iteration 23, loss = 0.00076914
Iteration 24, loss = 0.00065086
Iteration 25, loss = 0.00081603
Iteration 26, loss = 0.00068835
Iteration 27, loss = 0.00061990
Iteration 28, loss = 0.00052585
Iteration 29, loss = 0.00045698
Iteration 30, loss = 0.00044885
Iteration 31, loss = 0.00037853
Iteration 32, los

91.7

In [83]:
# Run on whole CLS tokens
split_idx = int(whole_cls_token.shape[0] * 0.8)
train_X_cls_token = whole_cls_token[:split_idx]
test_X_cls_token = whole_cls_token[split_idx:]
print(train_X_cls_token.shape)
print(test_X_cls_token.shape)
train_y_cls_token = labels[:split_idx]
test_y_cls_token = labels[split_idx:]

(29724, 512)
(7432, 512)


In [84]:
num_positive_examples = sum(train_y_cls_token)
num_neg_examples = len(train_y_cls_token) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

9014 positives
20710 negatives
29724 total
pos_weight: tensor(2.2975)


In [64]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=512, hidden_size=100, output_size=1)
clf.fit(train_X_cls_token, train_y_cls_token, pos_weight, learning_rate=0.001)
round(clf.score(test_X_cls_token, test_y_cls_token) * 100, 3)

Iteration 1, loss = 0.00534777
Iteration 2, loss = 0.00533292
Iteration 3, loss = 0.00533061
Iteration 4, loss = 0.00533008
Iteration 5, loss = 0.00532232
Iteration 6, loss = 0.00533563
Iteration 7, loss = 0.00532436
Iteration 8, loss = 0.00533551
Iteration 9, loss = 0.00533570
Iteration 10, loss = 0.00532439
Iteration 11, loss = 0.00533716
Iteration 12, loss = 0.00533097
Iteration 13, loss = 0.00532595
Iteration 14, loss = 0.00532846
Iteration 15, loss = 0.00532892
Iteration 16, loss = 0.00532937
Iteration 17, loss = 0.00533445
Iteration 18, loss = 0.00533394
Iteration 19, loss = 0.00533381
Iteration 20, loss = 0.00532936
Iteration 21, loss = 0.00533154
Iteration 22, loss = 0.00533331
Iteration 23, loss = 0.00532556
Iteration 24, loss = 0.00533534
Iteration 25, loss = 0.00533200
Iteration 26, loss = 0.00532387
Iteration 27, loss = 0.00533199
Iteration 28, loss = 0.00533209
Iteration 29, loss = 0.00532877
Iteration 30, loss = 0.00532551
Iteration 31, loss = 0.00532471
Iteration 32, los

4.265

In [65]:
# Run on raw data
split_idx = int(raw_data_pca_nonnan.shape[0] * 0.8)
train_X_raw_data = raw_data_pca_nonnan[:split_idx]
test_X_raw_data = raw_data_pca_nonnan[split_idx:]
print(train_X_raw_data.shape)
print(test_X_raw_data.shape)
train_y_raw_data = labels[:split_idx]
test_y_raw_data = labels[split_idx:]

(29638, 200)
(7410, 200)


In [66]:
num_positive_examples = sum(train_y_raw_data)
num_neg_examples = len(train_y_raw_data) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

1266 positives
28372 negatives
29638 total
pos_weight: tensor(22.4107)


In [67]:
train_X_raw_data[0, :10]

array([-49.608303 ,  13.449668 ,   3.423456 , -10.528745 , -14.761459 ,
        24.146687 ,   3.298543 ,   6.3007097,  12.420247 ,   2.5742261],
      dtype=float32)

In [68]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=200, hidden_size=100, output_size=1)
clf.fit(train_X_raw_data, train_y_raw_data, pos_weight, learning_rate=0.001)
round(clf.score(test_X_raw_data, test_y_raw_data) * 100, 3)

Iteration 1, loss = 0.00547587
Iteration 2, loss = 0.00473934
Iteration 3, loss = 0.00408906
Iteration 4, loss = 0.00330417
Iteration 5, loss = 0.00247387
Iteration 6, loss = 0.00177360
Iteration 7, loss = 0.00121447
Iteration 8, loss = 0.00088921
Iteration 9, loss = 0.00063170
Iteration 10, loss = 0.00046634
Iteration 11, loss = 0.00035466
Iteration 12, loss = 0.00026998
Iteration 13, loss = 0.00015383
Iteration 14, loss = 0.00008462
Iteration 15, loss = 0.00005422
Iteration 16, loss = 0.00003923
Iteration 17, loss = 0.00003077
Iteration 18, loss = 0.00002514
Iteration 19, loss = 0.00002121
Iteration 20, loss = 0.00001807
Iteration 21, loss = 0.00001558
Iteration 22, loss = 0.00001352
Iteration 23, loss = 0.00001200
Iteration 24, loss = 0.00001064
Iteration 25, loss = 0.00000943
Iteration 26, loss = 0.00000850
Iteration 27, loss = 0.00000765
Iteration 28, loss = 0.00000697
Iteration 29, loss = 0.00000631
Iteration 30, loss = 0.00000574
Iteration 31, loss = 0.00000529
Iteration 32, los

92.659

## Run KNN on Not.Worth.Living

In [69]:
variable_of_interest = "Not.Worth.Living"

In [70]:
# Select rows of concat_dataset where label is not nan
full_label_list = concat_ds[variable_of_interest]
non_nan_indices = [idx for idx in range(len(full_label_list)) if not math.isnan(full_label_list[idx])]
# print(non_nan_indices[:10])
# print(full_label_list[:10])
non_nan_ds = concat_ds.select(non_nan_indices)
non_nan_ds = non_nan_ds.shuffle(seed=42)
non_nan_ds

Loading cached shuffled indices for dataset at /home/sr2464/palmer_scratch/datasets/UKB_Large_rsfMRI_and_tffMRI_Arrow_WithRegression_v3_with_metadata/train_ukbiobank/cache-d8ca8d5320239fd9.arrow


Dataset({
    features: ['Raw_Recording', 'Voxelwise_RobustScaler_Normalized_Recording', 'Filename', 'Patient ID', 'Order', 'eid', 'Gender', 'Age.At.MHQ', 'PHQ9.Severity', 'Depressed.At.Baseline', 'Neuroticism', 'Self.Harm.Ever', 'Not.Worth.Living', 'PCL.Score', 'GAD7.Severity', 'cls_token_pca_components', 'whole_cls_token', 'recording_pca_components'],
    num_rows: 37156
})

In [71]:
raw_data_pca_nonnan = np.array(non_nan_ds["recording_pca_components"], dtype=np.float32)
print(raw_data_pca_nonnan.shape)
cls_token_pca_nonnan = np.array(non_nan_ds["cls_token_pca_components"], dtype=np.float32)
print(cls_token_pca_nonnan.shape)
whole_cls_token = np.array(non_nan_ds["whole_cls_token"], dtype=np.float32)
print(whole_cls_token.shape)

labels = non_nan_ds[variable_of_interest]
labels = [int(num) for num in labels]
labels[:10]

(37156, 200)
(37156, 200)
(37156, 512)


[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [72]:
# Run on CLS tokens
split_idx = int(cls_token_pca_nonnan.shape[0] * 0.8)
train_X_cls_token = cls_token_pca_nonnan[:split_idx]
test_X_cls_token = cls_token_pca_nonnan[split_idx:]
print(train_X_cls_token.shape)
print(test_X_cls_token.shape)
train_y_cls_token = labels[:split_idx]
test_y_cls_token = labels[split_idx:]

(29724, 200)
(7432, 200)


In [73]:
num_positive_examples = sum(train_y_cls_token)
num_neg_examples = len(train_y_cls_token) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

9014 positives
20710 negatives
29638 total
pos_weight: tensor(2.2975)


In [74]:
train_X_raw_data[0, :10]

array([-49.608303 ,  13.449668 ,   3.423456 , -10.528745 , -14.761459 ,
        24.146687 ,   3.298543 ,   6.3007097,  12.420247 ,   2.5742261],
      dtype=float32)

In [75]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=200, hidden_size=100, output_size=1)
clf.fit(train_X_cls_token, train_y_cls_token, pos_weight, learning_rate=0.001)
round(clf.score(test_X_cls_token, test_y_cls_token) * 100, 3)

Iteration 1, loss = 0.00386414
Iteration 2, loss = 0.00385036
Iteration 3, loss = 0.00383675
Iteration 4, loss = 0.00381619
Iteration 5, loss = 0.00379961
Iteration 6, loss = 0.00377229
Iteration 7, loss = 0.00374215
Iteration 8, loss = 0.00370291
Iteration 9, loss = 0.00365133
Iteration 10, loss = 0.00358683
Iteration 11, loss = 0.00351842
Iteration 12, loss = 0.00344800
Iteration 13, loss = 0.00337361
Iteration 14, loss = 0.00328715
Iteration 15, loss = 0.00321726
Iteration 16, loss = 0.00314335
Iteration 17, loss = 0.00306498
Iteration 18, loss = 0.00297001
Iteration 19, loss = 0.00288122
Iteration 20, loss = 0.00280603
Iteration 21, loss = 0.00272209
Iteration 22, loss = 0.00264820
Iteration 23, loss = 0.00255514
Iteration 24, loss = 0.00248840
Iteration 25, loss = 0.00241085
Iteration 26, loss = 0.00234415
Iteration 27, loss = 0.00227581
Iteration 28, loss = 0.00218893
Iteration 29, loss = 0.00215187
Iteration 30, loss = 0.00206139
Iteration 31, loss = 0.00199988
Iteration 32, los

57.683

In [76]:
# Run on whole CLS tokens
split_idx = int(whole_cls_token.shape[0] * 0.8)
train_X_cls_token = whole_cls_token[:split_idx]
test_X_cls_token = whole_cls_token[split_idx:]
print(train_X_cls_token.shape)
print(test_X_cls_token.shape)
train_y_cls_token = labels[:split_idx]
test_y_cls_token = labels[split_idx:]

(29724, 512)
(7432, 512)


In [77]:
num_positive_examples = sum(train_y_cls_token)
num_neg_examples = len(train_y_cls_token) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

9014 positives
20710 negatives
29638 total
pos_weight: tensor(2.2975)


In [78]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=512, hidden_size=100, output_size=1)
clf.fit(train_X_cls_token, train_y_cls_token, pos_weight, learning_rate=0.001)
round(clf.score(test_X_cls_token, test_y_cls_token) * 100, 3)

Iteration 1, loss = 0.00386912
Iteration 2, loss = 0.00386874
Iteration 3, loss = 0.00386639
Iteration 4, loss = 0.00386762
Iteration 5, loss = 0.00386635
Iteration 6, loss = 0.00386522
Iteration 7, loss = 0.00386724
Iteration 8, loss = 0.00386505
Iteration 9, loss = 0.00386603
Iteration 10, loss = 0.00386607
Iteration 11, loss = 0.00386442
Iteration 12, loss = 0.00386765
Iteration 13, loss = 0.00386518
Iteration 14, loss = 0.00386355
Iteration 15, loss = 0.00386188
Iteration 16, loss = 0.00386242
Iteration 17, loss = 0.00386526
Iteration 18, loss = 0.00386200
Iteration 19, loss = 0.00386089
Iteration 20, loss = 0.00386156
Iteration 21, loss = 0.00386061
Iteration 22, loss = 0.00386017
Iteration 23, loss = 0.00385923
Iteration 24, loss = 0.00386097
Iteration 25, loss = 0.00386205
Iteration 26, loss = 0.00386023
Iteration 27, loss = 0.00386086
Iteration 28, loss = 0.00385938
Iteration 29, loss = 0.00385941
Iteration 30, loss = 0.00385785
Iteration 31, loss = 0.00385824
Iteration 32, los

63.011

In [79]:
# Run on raw data
split_idx = int(raw_data_pca_nonnan.shape[0] * 0.8)
train_X_raw_data = raw_data_pca_nonnan[:split_idx]
test_X_raw_data = raw_data_pca_nonnan[split_idx:]
print(train_X_raw_data.shape)
print(test_X_raw_data.shape)
train_y_raw_data = labels[:split_idx]
test_y_raw_data = labels[split_idx:]

(29724, 200)
(7432, 200)


In [80]:
num_positive_examples = sum(train_y_raw_data)
num_neg_examples = len(train_y_raw_data) - num_positive_examples
pos_weight = num_neg_examples / num_positive_examples
pos_weight = torch.tensor(pos_weight, dtype=torch.float32)
print(num_positive_examples, "positives")
print(num_neg_examples, "negatives")
print(len(train_y_raw_data), "total")
print("pos_weight:", pos_weight)

9014 positives
20710 negatives
29724 total
pos_weight: tensor(2.2975)


In [81]:
train_X_raw_data[0, :10]

array([-46.49019   ,  -6.9693494 , -44.529865  , -28.447311  ,
       -17.217026  ,   0.47021937,  14.024554  , -16.150738  ,
         8.224347  ,   0.1882209 ], dtype=float32)

In [82]:
clf = MLPClassifier(random_state=1234, max_iter=200, input_size=200, hidden_size=100, output_size=1)
clf.fit(train_X_raw_data, train_y_raw_data, pos_weight, learning_rate=0.001)
round(clf.score(test_X_raw_data, test_y_raw_data) * 100, 3)

Iteration 1, loss = 0.00392885
Iteration 2, loss = 0.00373737
Iteration 3, loss = 0.00357905
Iteration 4, loss = 0.00336706
Iteration 5, loss = 0.00312664
Iteration 6, loss = 0.00284627
Iteration 7, loss = 0.00255931
Iteration 8, loss = 0.00231338
Iteration 9, loss = 0.00207599
Iteration 10, loss = 0.00183750
Iteration 11, loss = 0.00160573
Iteration 12, loss = 0.00142137
Iteration 13, loss = 0.00125967
Iteration 14, loss = 0.00111212
Iteration 15, loss = 0.00096995
Iteration 16, loss = 0.00081980
Iteration 17, loss = 0.00071726
Iteration 18, loss = 0.00061473
Iteration 19, loss = 0.00055633
Iteration 20, loss = 0.00049653
Iteration 21, loss = 0.00041248
Iteration 22, loss = 0.00033714
Iteration 23, loss = 0.00027824
Iteration 24, loss = 0.00022153
Iteration 25, loss = 0.00017779
Iteration 26, loss = 0.00013486
Iteration 27, loss = 0.00010689
Iteration 28, loss = 0.00008044
Iteration 29, loss = 0.00006571
Iteration 30, loss = 0.00005707
Iteration 31, loss = 0.00004927
Iteration 32, los

59.338