In [None]:
pip install momentfm

In [None]:
# alternative
!pip install git+https://github.com/moment-timeseries-foundation-model/moment.git

In [None]:
!pip install numpy pandas scikit-learn matplotlib tqdm

In [4]:
# getting the MOMENT model 
from momentfm import MOMENTPipeline

# Load the pretrained "AutonLab/MOMENT-1-large" checkpoint as a pipeline object.
model = MOMENTPipeline.from_pretrained(
    "AutonLab/MOMENT-1-large", 
    model_kwargs={'task_name': 'embedding'},  # `embedding` mode: the model is used to learn/produce representations
    local_files_only=True,  # only look at local files, i.e. never try to download the model
)

  torch.utils._pytree._register_pytree_node(


In [6]:
# Initialise the pipeline that was loaded above.
model.init()
# The architecture printed below contains Dropout(p=0.1) layers. Switch to
# evaluation mode so dropout is disabled and the embeddings extracted later
# are deterministic.
model.eval()
print(model)

MOMENTPipeline(
  (normalizer): RevIN()
  (tokenizer): Patching()
  (patch_embedding): PatchEmbedding(
    (value_embedding): Linear(in_features=8, out_features=1024, bias=False)
    (position_embedding): PositionalEmbedding()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
  



In [8]:
# Total number of scalar weights held by the encoder
num_params = sum(map(lambda weight: weight.numel(), model.encoder.parameters()))
print(f"Number of parameters: {num_params}")

Number of parameters: 341231104


In [10]:
# import NHANES data 
import torch
import pandas as pd

# Load the wide table; the first CSV column becomes the row index.
# Per the preview below it holds seqn, the demographic columns, then time1..time2016.
# NOTE(review): df_min5 is not referenced again in the visible cells — confirm it is still needed.
df_min5 = pd.read_csv("data/data_wide.csv", index_col=0)
df_min5.head(3)

Unnamed: 0,seqn,gender,age,race,education,married,pir,bmi,time1,time2,...,time2007,time2008,time2009,time2010,time2011,time2012,time2013,time2014,time2015,time2016
1,21009,1,55,3,3,1,3.79,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,21010,2,52,3,4,6,1.24,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21012,1,63,4,3,6,0.89,0,1,0,...,1,1,0,0,0,1,1,0,0,0


In [12]:
# Load the full raw table; the first CSV column becomes the row index.
# Per the preview below the column order is: seqn, x1..x10080, then the
# demographic columns gender..bmi.
df_raw = pd.read_csv("data/raw_data_full.csv", index_col=0)
df_raw.head(3)

Unnamed: 0,seqn,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x10078,x10079,x10080,gender,age,race,education,married,pir,bmi
1,21009,0,0,0,0,0,0,0,0,0,...,0,0,0,1,55,3,3,1,3.79,1
2,21010,1,0,0,0,0,0,0,0,0,...,0,0,0,2,52,3,4,6,1.24,1
3,21012,86,272,1204,1646,0,0,0,0,0,...,37,10,45,1,63,4,3,6,0.89,0


In [14]:
# Load the recoded variant of the raw table; the first CSV column becomes the row index.
# Per the preview below it has the same column layout as df_raw
# (seqn, x1..x10080, gender..bmi).
df_raw_recoded = pd.read_csv("data/raw_data_recoded.csv", index_col=0)
df_raw_recoded.head(3)

Unnamed: 0,seqn,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x10078,x10079,x10080,gender,age,race,education,married,pir,bmi
1,21009,0,0,0,0,0,0,0,0,0,...,0,0,0,1,55,3,3,1,3.79,1
2,21010,0,0,0,0,0,0,0,0,0,...,0,0,0,2,52,3,4,6,1.24,1
3,21012,0,1,2,2,0,0,0,0,0,...,0,0,0,1,63,4,3,6,0.89,0


In [16]:
# reshaping the data with L2 normalization 
import numpy as np

def normalize_l2(x):
    """Scale a vector, or every row of an array, to unit L2 norm.

    Args:
        x: Array-like input. A 1-D input is treated as one vector. For 2-D
            input each row is normalised independently (norm taken along
            axis 1); arrays of higher rank are likewise normalised along
            axis 1.

    Returns:
        np.ndarray:
            * 1-D input: ``x / ||x||``, or the input array unchanged when its
              norm is below ``1e-10``.
            * otherwise: a float64 array with the same shape as ``x``. Slices
              whose norm is below ``1e-10`` are passed through unscaled, so
              no division by zero occurs.
    """
    eps = 1e-10  # norms below this are treated as zero
    x = np.array(x)

    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm < eps:
            return x
        return x / norm

    norms = np.linalg.norm(x, 2, axis=1, keepdims=True)

    # Vectorised replacement for the per-row loop: wherever the norm is
    # effectively zero (or not comparable, e.g. NaN) divide by 1 instead, which
    # leaves those values exactly as they were.
    divisors = norms.copy()
    divisors[~(norms >= eps)] = 1

    return (x / divisors).astype(np.float64, copy=False)

def prepare_data_from_df(df, value_columns, n_channels=1):
    """Turn the selected dataframe columns into a model-ready float tensor.

    The selected columns are read as one flat row per sample, split into
    ``n_channels`` equally long channels, truncated to at most 512 time steps
    per channel, L2-normalised per sample and returned as a tensor.

    The flat row is assumed to be laid out channel by channel (all time steps
    of channel 0, then channel 1, ...), which is what the reshape to
    ``[batch, channel, context]`` implies.

    Args:
        df: Source dataframe.
        value_columns: Labels of the columns holding the time-series values.
        n_channels: Number of channels the flat row is split into.

    Returns:
        torch.FloatTensor of shape ``[batchsize, n_channels, context_length]``
        with ``context_length <= 512``.

    Raises:
        ValueError: If the number of selected columns is not divisible by
            ``n_channels``.
    """
    MAX_SEQ_LEN = 512

    # convert time series columns to numpy array
    data = df[value_columns].values
    n_batchsize, n_context = data.shape
    print(f"Original data shape: {data.shape}")

    # confirm the reshaping is possible before doing any further work
    if n_context % n_channels != 0:
        raise ValueError(f"Number of features ({n_context}) must be divisible by number of channels ({n_channels})")
    context_per_channel = n_context // n_channels

    # Split into channels BEFORE truncating. Cutting the flat row to its first
    # n_channels * MAX_SEQ_LEN values and reshaping afterwards would move
    # values of one channel into another whenever n_channels > 1.
    data = data.reshape(n_batchsize, n_channels, context_per_channel)

    # check if sequence length is greater than max=512 and truncate if needed.
    if context_per_channel > MAX_SEQ_LEN:
        print(f"Warning: Context length per channel ({context_per_channel}) exceeds maximum of {MAX_SEQ_LEN}. "
              f"Truncating to {MAX_SEQ_LEN}.")
        data = data[:, :, :MAX_SEQ_LEN]
        context_per_channel = MAX_SEQ_LEN

    # Apply L2 normalization per sample over all retained values (all channels
    # together), i.e. on the flattened [batchsize, n_channels * context] view.
    data = normalize_l2(data.reshape(n_batchsize, n_channels * context_per_channel))

    # reshape the data into [batchsize, channel, context]
    data_reshaped = data.reshape(n_batchsize, n_channels, context_per_channel)
    print(f"Reshaped data shape: {data_reshaped.shape}")

    # Convert to torch tensor
    data_tensor = torch.FloatTensor(data_reshaped)
    print(f"Tensor shape: {data_tensor.shape}")

    return data_tensor  # [batchsize, channel, context_length]

In [18]:
# Labels of every df_raw column whose name begins with "x"
val_col_raw = list(filter(lambda name: name.startswith('x'), df_raw.columns))

# Build one single-channel tensor per table, using the same column selection for both
raw_data_tensor, raw_recoded_tensor = (
    prepare_data_from_df(frame, val_col_raw, n_channels=1)
    for frame in (df_raw, df_raw_recoded)
)

Original data shape: (6943, 10080)
Reshaped data shape: (6943, 1, 512)
Tensor shape: torch.Size([6943, 1, 512])
Original data shape: (6943, 10080)
Reshaped data shape: (6943, 1, 512)
Tensor shape: torch.Size([6943, 1, 512])


In [None]:
###### DONT RUN. embedding of the data
# Embeds the complete raw-data tensor in a single forward pass.
# NOTE(review): the original argument was the broken expression `data_tens or`
# (a syntax error) and no `data_tensor` exists at notebook scope;
# raw_data_tensor is assumed to be the intended input — confirm.
from pprint import pprint

with torch.no_grad():  # inference only, no autograd graph needed
    output = model(x_enc=raw_data_tensor)
pprint(output)

In [None]:
##### DONT RUN: get 3500 random subset from raw_data_tensor
import random

# random.sample draws from the stdlib `random` generator, so that is the one
# that has to be seeded for a reproducible subset; np.random.seed alone has no
# effect on it. The NumPy seed is kept so NumPy's global state is as before.
random.seed(1)
np.random.seed(1)

# NOTE(review): no `data_tensor` exists at notebook scope; raw_data_tensor is
# assumed to be the intended source — confirm.
random_indices = random.sample(range(raw_data_tensor.shape[0]), 3500)
subset_data = raw_data_tensor[random_indices]

print(f"Subset shape: {subset_data.shape}")

In [20]:
# Split each tensor at row 3500: rows [0, 3500) form chunk 1, the rest chunk 2.
subset_raw_data1, subset_raw_data2 = raw_data_tensor[:3500], raw_data_tensor[3500:]
subset_raw_recoded1, subset_raw_recoded2 = raw_recoded_tensor[:3500], raw_recoded_tensor[3500:]

# Report the four chunk shapes, one per line
print(
    *(
        f"Subset shape: {chunk.shape}"
        for chunk in (subset_raw_data1, subset_raw_data2, subset_raw_recoded1, subset_raw_recoded2)
    ),
    sep="\n",
)

Subset shape: torch.Size([3500, 1, 512])
Subset shape: torch.Size([3443, 1, 512])
Subset shape: torch.Size([3500, 1, 512])
Subset shape: torch.Size([3443, 1, 512])


In [21]:
# raw data embedding of the subset1 
from pprint import pprint

# Inference only: disable autograd so no computation graph is recorded while
# the 3500-sample chunk is embedded.
with torch.no_grad():
    output_raw_data1 = model(x_enc=subset_raw_data1)
pprint(output_raw_data1)

  return fn(*args, **kwargs)


TimeseriesOutputs(forecast=None,
                  anomaly_scores=None,
                  logits=None,
                  labels=None,
                  input_mask=tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]),
                  pretrain_mask=None,
                  reconstruction=None,
                  embeddings=tensor([[-0.0473,  0.0136, -0.0410,  ..., -0.0213,  0.0609,  0.0162],
        [-0.0275,  0.0420, -0.0908,  ..., -0.0521,  0.0782,  0.0214],
        [-0.0185,  0.0294, -0.0654,  ..., -0.0367,  0.0742,  0.0219],
        ...,
        [-0.0356,  0.0182, -0.0384,  ..., -0.0519,  0.0499,  0.0014],
        [-0.0172,  0.0361, -0.0469,  ..., -0.0571,  0.0201, -0.0457],
        [-0.0258,  0.0236, -0.0491,  ..., -0.0192,  0.0838,  0.0407]]),
                  metadata='mean',
      

In [23]:
# raw recoded embedding of the subset1  
from pprint import pprint

# Inference only: disable autograd so no computation graph is recorded while
# the 3500-sample chunk is embedded.
with torch.no_grad():
    output_raw_recoded1 = model(x_enc=subset_raw_recoded1)
pprint(output_raw_recoded1)

  return fn(*args, **kwargs)


TimeseriesOutputs(forecast=None,
                  anomaly_scores=None,
                  logits=None,
                  labels=None,
                  input_mask=tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]),
                  pretrain_mask=None,
                  reconstruction=None,
                  embeddings=tensor([[-0.0511,  0.0006, -0.0611,  ..., -0.0590,  0.0795, -0.0204],
        [ 0.0258,  0.0726, -0.0624,  ..., -0.0107,  0.1001,  0.0757],
        [-0.0292,  0.0068, -0.0555,  ..., -0.0334,  0.0631,  0.0157],
        ...,
        [-0.0531,  0.0089, -0.0191,  ..., -0.0462,  0.0588, -0.0121],
        [-0.0421,  0.0116, -0.0417,  ..., -0.0265,  0.0450,  0.0009],
        [ 0.0055, -0.0005, -0.0563,  ..., -0.0362,  0.1011,  0.0285]]),
                  metadata='mean',
      

In [35]:
# extract raw data embedding results of the subset1 
# Pull the embedding matrix out of the model output as a NumPy array.
embeddings1 = output_raw_data1.embeddings
embeddings1_np = embeddings1.detach().cpu().numpy()

# Demographic columns to carry over, taken from the same rows that were
# embedded (first 3500). The index is reset so rows line up positionally with
# the embedding matrix.
demographic_cols = ['seqn', 'gender', 'age', 'race', 'education', 'married', 'pir', 'bmi']
df_subset1 = df_raw.iloc[:3500].reset_index(drop=True)

# pd.concat(axis=1) pads mismatched rows with NaN instead of failing, so check
# the row counts explicitly.
assert len(df_subset1) == len(embeddings1_np), "demographics and embeddings must have the same number of rows"

# Final column order: demographics first, then embedding dimensions 0..D-1.
original_cols1 = df_subset1[demographic_cols].columns.tolist()
embeddings1_cols = list(range(embeddings1_np.shape[1]))

# Build the frame directly in its final column order (no intermediate frame
# and no separate reorder step).
embeddings1_df = pd.concat(
    [df_subset1[original_cols1], pd.DataFrame(embeddings1_np, columns=embeddings1_cols)],
    axis=1,
)

# 1-based row labels
embeddings1_df.index = range(1, len(embeddings1_df) + 1)
embeddings1_df.head()

Unnamed: 0,seqn,gender,age,race,education,married,pir,bmi,0,1,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
1,21009,1,55,3,3,1,3.79,1,-0.047266,0.013634,...,-0.025214,0.007113,-0.00604,0.042259,-0.065077,-0.043545,-0.015492,-0.021318,0.060858,0.016189
2,21010,2,52,3,4,6,1.24,1,-0.02745,0.042044,...,-0.025256,0.043053,-0.00315,0.040694,-0.025756,-0.039395,-0.029901,-0.052143,0.078164,0.021447
3,21012,1,63,4,3,6,0.89,0,-0.018542,0.029361,...,-0.024592,0.015016,0.012961,0.01789,-0.030306,-0.033326,-0.008552,-0.036696,0.074204,0.021912
4,21015,1,83,3,4,1,1.2,1,-0.031758,0.029571,...,-0.02167,0.05137,-0.003156,0.034144,-0.092206,-0.017973,-0.016825,-0.053386,0.01683,0.02252
5,21017,2,37,1,2,6,0.21,0,-0.034428,0.024669,...,-0.015435,0.002695,-0.073576,0.035567,-0.042098,-0.081595,0.011101,-0.027794,0.098153,0.013926


In [39]:
# extract raw recoded embedding results of the subset1 
# Pull the embedding matrix out of the model output as a NumPy array.
embeddings_re1 = output_raw_recoded1.embeddings
embeddings_re1_np = embeddings_re1.detach().cpu().numpy()

# Demographic columns from the same rows of the recoded table (first 3500).
# The index is reset so rows line up positionally with the embedding matrix.
df_re_subset1 = df_raw_recoded.iloc[:3500].reset_index(drop=True)

# pd.concat(axis=1) pads mismatched rows with NaN instead of failing, so check
# the row counts explicitly.
assert len(df_re_subset1) == len(embeddings_re1_np), "demographics and embeddings must have the same number of rows"

# Final column order: demographics first, then embedding dimensions 0..D-1.
# This uses the column list derived from the recoded table itself; previously
# the reorder step relied on `original_cols1` from the raw-data cell.
original_re_cols1 = df_re_subset1[demographic_cols].columns.tolist()
embeddings_re1_cols = list(range(embeddings_re1_np.shape[1]))

# Build the frame directly in its final column order.
embeddings_re1_df = pd.concat(
    [df_re_subset1[original_re_cols1], pd.DataFrame(embeddings_re1_np, columns=embeddings_re1_cols)],
    axis=1,
)

# 1-based row labels
embeddings_re1_df.index = range(1, len(embeddings_re1_df) + 1)
embeddings_re1_df.head()

Unnamed: 0,seqn,gender,age,race,education,married,pir,bmi,0,1,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
1,21009,1,55,3,3,1,3.79,1,-0.051112,0.000583,...,-0.033599,-0.001142,-0.036977,0.045779,-0.044817,-0.04378,0.017271,-0.059013,0.079525,-0.020382
2,21010,2,52,3,4,6,1.24,1,0.025789,0.072576,...,-0.024518,0.038166,-0.006393,0.081935,-0.061219,-0.066133,-0.013672,-0.010671,0.100083,0.075739
3,21012,1,63,4,3,6,0.89,0,-0.029184,0.00685,...,-0.020409,0.023395,0.010668,0.01965,-0.036732,-0.034918,0.003772,-0.033445,0.063125,0.01567
4,21015,1,83,3,4,1,1.2,1,-0.004394,0.023608,...,-0.06517,0.028681,0.013572,0.049638,-0.00748,-0.028836,-0.026296,-0.024483,0.084238,0.035587
5,21017,2,37,1,2,6,0.21,0,-0.029962,0.019147,...,-0.019211,0.008637,-0.080567,0.046919,-0.084663,-0.026244,-0.007111,-0.028101,0.072586,-0.000554


In [61]:
# Persist the subset1 raw-data embeddings (demographic columns followed by
# embedding dimensions 0..1023) as CSV.
embeddings1_df.to_csv("./data/embeddings_moment_raw_subset1_1024.csv")

# Persist the subset1 raw-recoded embeddings, same layout.
embeddings_re1_df.to_csv("./data/embeddings_moment_recoded_subset1_1024.csv")

In [None]:
# reducing dimension of subset1 to 50 

def reduce_dimension(embedding, dim=50):
    """Keep only the first ``dim`` entries along the last axis.

    This is plain truncation; no projection (e.g. PCA) is computed.

    Args:
        embedding: Array (or tensor) whose last axis is the embedding
            dimension, e.g. shape ``[n_samples, n_features]``.
        dim: Number of leading features to keep. If the last axis is shorter
            than ``dim`` everything is kept.

    Returns:
        A slice of ``embedding`` with at most ``dim`` entries on the last axis.
    """
    # Ellipsis indexing is identical to `[:, :dim]` for 2-D input and also
    # handles 1-D or higher-rank input.
    return embedding[..., :dim]
reduced_embeddings1_np = reduce_dimension(embeddings1_np)

# Attach only the demographic columns. In df_raw the x1..x10080 series columns
# sit between 'seqn' and 'gender', so the former label slice `.loc[:, :'bmi']`
# pulled every raw series column into the output as well.
original1_cols = df_subset1[demographic_cols].columns.tolist()
embedding1_cols = list(range(reduced_embeddings1_np.shape[1]))

# Build the frame directly in its final order: demographics, then the
# retained embedding dimensions.
red_embeddings1_df = pd.concat(
    [df_subset1[original1_cols], pd.DataFrame(reduced_embeddings1_np, columns=embedding1_cols)],
    axis=1,
)
# 1-based row labels
red_embeddings1_df.index = range(1, len(red_embeddings1_df) + 1)

red_embeddings1_df.head()

In [None]:
# Persist the reduced subset1 embeddings as CSV. The retained columns are the
# leading embedding dimensions kept by reduce_dimension (plain truncation with
# its default dim=50).
red_embeddings1_df.to_csv("./data/embeddings_moment_subset1_50.csv")

In [22]:
# raw data embedding of the subset2
from pprint import pprint

# Inference only: disable autograd so no computation graph is recorded while
# the second chunk is embedded.
with torch.no_grad():
    output_raw_data2 = model(x_enc=subset_raw_data2)
pprint(output_raw_data2)

  return fn(*args, **kwargs)


TimeseriesOutputs(forecast=None,
                  anomaly_scores=None,
                  logits=None,
                  labels=None,
                  input_mask=tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]),
                  pretrain_mask=None,
                  reconstruction=None,
                  embeddings=tensor([[ 0.0056,  0.0720, -0.0840,  ..., -0.0391,  0.0646,  0.0334],
        [-0.0201,  0.0343, -0.0359,  ..., -0.0435,  0.0362,  0.0151],
        [ 0.0155,  0.0674, -0.0613,  ..., -0.0411,  0.0832,  0.0300],
        ...,
        [ 0.0157,  0.0589, -0.0608,  ..., -0.0037,  0.0763,  0.0768],
        [-0.0628,  0.0219, -0.0226,  ..., -0.0174,  0.0440, -0.0131],
        [ 0.0007,  0.0392, -0.0483,  ..., -0.0391,  0.0222,  0.0107]]),
                  metadata='mean',
      

In [24]:
# raw recoded embedding of the subset2
from pprint import pprint

# Inference only: disable autograd so no computation graph is recorded while
# the second chunk is embedded.
with torch.no_grad():
    output_raw_recoded2 = model(x_enc=subset_raw_recoded2)
pprint(output_raw_recoded2)

  return fn(*args, **kwargs)


TimeseriesOutputs(forecast=None,
                  anomaly_scores=None,
                  logits=None,
                  labels=None,
                  input_mask=tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]),
                  pretrain_mask=None,
                  reconstruction=None,
                  embeddings=tensor([[ 0.0249,  0.0778, -0.0382,  ..., -0.0316,  0.0723,  0.0573],
        [-0.0357, -0.0006, -0.0092,  ..., -0.0455,  0.0743,  0.0346],
        [ 0.0411,  0.0659, -0.0626,  ..., -0.0255,  0.0685,  0.0663],
        ...,
        [ 0.0290,  0.0650, -0.0598,  ..., -0.0446,  0.0708,  0.0667],
        [-0.0237,  0.0191, -0.0397,  ..., -0.0456,  0.0475, -0.0037],
        [-0.0372,  0.0094, -0.0615,  ..., -0.0248,  0.0371,  0.0028]]),
                  metadata='mean',
      

In [28]:
# extract raw data embedding results of the subset2
# Pull the embedding matrix out of the model output as a NumPy array.
embeddings2 = output_raw_data2.embeddings
embeddings2_np = embeddings2.detach().cpu().numpy()

# Demographic columns to carry over, taken from the same rows that were
# embedded (row 3500 onwards). The index is reset so rows line up positionally
# with the embedding matrix.
demographic_cols = ['seqn', 'gender', 'age', 'race', 'education', 'married', 'pir', 'bmi']
df_subset2 = df_raw.iloc[3500:].reset_index(drop=True)

# pd.concat(axis=1) pads mismatched rows with NaN instead of failing, so check
# the row counts explicitly.
assert len(df_subset2) == len(embeddings2_np), "demographics and embeddings must have the same number of rows"

# Final column order: demographics first, then embedding dimensions 0..D-1.
original_cols2 = df_subset2[demographic_cols].columns.tolist()
embeddings2_cols = list(range(embeddings2_np.shape[1]))

# Build the frame directly in its final column order (no intermediate frame
# and no separate reorder step).
embeddings2_df = pd.concat(
    [df_subset2[original_cols2], pd.DataFrame(embeddings2_np, columns=embeddings2_cols)],
    axis=1,
)

# 1-based row labels
embeddings2_df.index = range(1, len(embeddings2_df) + 1)
embeddings2_df.head()

Unnamed: 0,seqn,gender,age,race,education,married,pir,bmi,0,1,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
1,31183,2,33,3,5,1,3.9,1,0.005632,0.072009,...,-0.009222,0.028637,0.010225,0.06847,-0.041944,-0.057954,-0.008119,-0.039079,0.064605,0.033372
2,31186,2,46,1,1,3,1.68,1,-0.020101,0.034255,...,0.000466,-0.001557,-0.049511,0.017965,-0.038907,0.000107,-0.019617,-0.043522,0.036228,0.015108
3,31187,2,22,3,4,6,4.29,1,0.015528,0.067407,...,-0.008971,0.037552,0.016371,0.050205,-0.055978,-0.055804,-0.023536,-0.041118,0.083221,0.029989
4,31194,2,47,1,1,3,1.02,1,-0.011209,-0.005836,...,-0.040793,0.001773,-0.029853,0.055659,-0.034658,-0.02929,-0.020252,-0.045634,0.098191,0.011912
5,31195,1,73,3,4,1,5.0,0,-0.03253,-0.007258,...,-0.015335,0.029405,-0.045255,0.041964,-0.056479,-0.051723,-0.009947,-0.018775,0.073497,0.011463


In [34]:
# extract raw recoded embedding results of the subset2
# Pull the embedding matrix out of the model output as a NumPy array.
embeddings_re2 = output_raw_recoded2.embeddings
embeddings_re2_np = embeddings_re2.detach().cpu().numpy()

# Demographic columns from the same rows of the recoded table (row 3500
# onwards). The index is reset so rows line up positionally with the
# embedding matrix.
df_re_subset2 = df_raw_recoded.iloc[3500:].reset_index(drop=True)

# pd.concat(axis=1) pads mismatched rows with NaN instead of failing, so check
# the row counts explicitly.
assert len(df_re_subset2) == len(embeddings_re2_np), "demographics and embeddings must have the same number of rows"

# Final column order: demographics first, then embedding dimensions 0..D-1.
original_re2_cols = df_re_subset2[demographic_cols].columns.tolist()
embeddings_re2_cols = list(range(embeddings_re2_np.shape[1]))

# Build the frame directly in its final column order (no intermediate frame
# and no separate reorder step).
embeddings_re2_df = pd.concat(
    [df_re_subset2[original_re2_cols], pd.DataFrame(embeddings_re2_np, columns=embeddings_re2_cols)],
    axis=1,
)

# 1-based row labels
embeddings_re2_df.index = range(1, len(embeddings_re2_df) + 1)
embeddings_re2_df.head()

Unnamed: 0,seqn,gender,age,race,education,married,pir,bmi,0,1,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
1,31183,2,33,3,5,1,3.9,1,0.024941,0.07782,...,-0.019332,0.028063,-0.00706,0.057758,-0.036486,-0.051136,-0.023133,-0.031572,0.072255,0.057259
2,31186,2,46,1,1,3,1.68,1,-0.035718,-0.000637,...,-0.023972,-0.001264,-0.072019,0.037286,-0.029342,-0.00597,-0.017131,-0.045468,0.074284,0.034569
3,31187,2,22,3,4,6,4.29,1,0.041061,0.065904,...,-0.019696,0.025494,0.017388,0.063759,-0.051173,-0.042629,-0.02184,-0.025535,0.068539,0.06632
4,31194,2,47,1,1,3,1.02,1,-0.038439,0.011988,...,-0.009369,-0.014868,-0.041542,0.059325,-0.055968,-0.041293,-0.013903,-0.044787,0.070615,0.004772
5,31195,1,73,3,4,1,5.0,0,-0.026982,0.012981,...,-0.019387,0.013981,-0.058329,0.064103,-0.052476,-0.023171,-0.026183,-0.048432,0.096744,0.054973


In [36]:
# Persist the subset2 raw-data embeddings (demographic columns followed by
# embedding dimensions 0..1023) as CSV.
embeddings2_df.to_csv("./data/embeddings_moment_raw_subset2_1024.csv")

# Persist the subset2 raw-recoded embeddings, same layout.
embeddings_re2_df.to_csv("./data/embeddings_moment_recoded_subset2_1024.csv")

In [None]:
# reducing dimension for subset2
def reduce_dimension(embedding, dim=50):
    """Keep only the first ``dim`` entries along the last axis.

    This is plain truncation; no projection (e.g. PCA) is computed.
    NOTE(review): the same helper is also defined in the subset1 cell earlier
    in this notebook; it is repeated here so this cell runs on its own.

    Args:
        embedding: Array (or tensor) whose last axis is the embedding
            dimension, e.g. shape ``[n_samples, n_features]``.
        dim: Number of leading features to keep. If the last axis is shorter
            than ``dim`` everything is kept.

    Returns:
        A slice of ``embedding`` with at most ``dim`` entries on the last axis.
    """
    # Ellipsis indexing is identical to `[:, :dim]` for 2-D input and also
    # handles 1-D or higher-rank input.
    return embedding[..., :dim]
reduced_embeddings2_np = reduce_dimension(embeddings2_np)

# Attach only the demographic columns. In df_raw the x1..x10080 series columns
# sit between 'seqn' and 'gender', so the former label slice `.loc[:, :'bmi']`
# pulled every raw series column into the output as well.
original2_cols = df_subset2[demographic_cols].columns.tolist()
embedding2_cols = list(range(reduced_embeddings2_np.shape[1]))

# Build the frame directly in its final order: demographics, then the
# retained embedding dimensions.
red_embeddings2_df = pd.concat(
    [df_subset2[original2_cols], pd.DataFrame(reduced_embeddings2_np, columns=embedding2_cols)],
    axis=1,
)
# 1-based row labels
red_embeddings2_df.index = range(1, len(red_embeddings2_df) + 1)

red_embeddings2_df.head()

In [None]:
# Persist the reduced subset2 embeddings as CSV. The retained columns are the
# leading embedding dimensions kept by reduce_dimension (plain truncation with
# its default dim=50).
red_embeddings2_df.to_csv("./data/embeddings_moment_subset2_50.csv")

In [None]:
# PCA on the embeddings of subset1
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the subset1 embeddings onto their first two principal components.
# random_state is fixed because scikit-learn may pick its randomized SVD
# solver for an input of this size, which would otherwise let the projection
# vary from run to run.
embeddings1_manifold = PCA(n_components=2, random_state=1).fit_transform(embeddings1_np)

# Colour every point by the mean value of its embedding vector.
c = np.mean(embeddings1_np, axis=1)
categorical = False  # selects the colour-bar label below

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 8))
points = ax.scatter(
    embeddings1_manifold[:, 0],
    embeddings1_manifold[:, 1],
    c=c,
    cmap='magma'
)
fig.colorbar(points, ax=ax, label='Mean Value' if not categorical else 'Subject ID')
ax.set_title('PCA Projection of Time Series Embeddings')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
plt.show()