In [None]:
pip install momentfm

In [None]:
# alternative
!pip install git+https://github.com/moment-timeseries-foundation-model/moment.git

In [None]:
!pip install numpy pandas scikit-learn matplotlib tqdm

In [34]:
# Load the pretrained MOMENT-1-large checkpoint through the momentfm pipeline.
from momentfm import MOMENTPipeline

model = MOMENTPipeline.from_pretrained(
    "AutonLab/MOMENT-1-large",
    # `embedding` task mode: the model is loaded to learn representations.
    model_kwargs=dict(task_name="embedding"),
    # Only look at local files, i.e. do not try to download the model.
    local_files_only=True,
)

In [36]:
# Run the pipeline's own `init()` step before the model is used.
# NOTE(review): presumably this finalises the task-specific parts for the
# `embedding` mode chosen at load time — confirm against momentfm.
model.init()
# Print the module tree so the architecture can be inspected.
print(model)

MOMENTPipeline(
  (normalizer): RevIN()
  (tokenizer): Patching()
  (patch_embedding): PatchEmbedding(
    (value_embedding): Linear(in_features=8, out_features=1024, bias=False)
    (position_embedding): PositionalEmbedding()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
  



In [38]:
# Total number of scalar weights across all tensors registered on the encoder.
num_params = sum(map(lambda weight: weight.numel(), model.encoder.parameters()))
print(f"Number of parameters: {num_params}")

Number of parameters: 341231104


In [40]:
# Import NHANES data from the wide-format CSV.
# The path is relative, so the notebook must be started from a directory
# that contains `data/data_wide.csv`.
import torch
import pandas as pd

# index_col=0: the first CSV column is used as the row index, not as data.
df = pd.read_csv("data/data_wide.csv", index_col=0)
# Preview the first three rows.
df.head(3)

Unnamed: 0,seqn,gender,age,race,education,marital_status,pir,bmi,time1,time2,...,time2007,time2008,time2009,time2010,time2011,time2012,time2013,time2014,time2015,time2016
1,21009,1,55,3,3,1,3.79,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,21010,2,52,3,4,6,1.24,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21012,1,63,4,3,6,0.89,0,1,0,...,1,1,0,0,0,1,1,0,0,0


In [42]:
#################### reshaping for entire data 
import numpy as np 

def prepare_data_from_df(df, value_columns, n_channels=1, max_seq_len=512):
    """Turn wide time-series columns into a float tensor shaped for the model.

    The selected columns are read as a 2-D array ``[n_rows, n_features]`` and
    interpreted channel-major: the first ``n_features // n_channels`` columns
    form channel 0, the next run forms channel 1, and so on. Every channel
    that is longer than ``max_seq_len`` is cut down to its first
    ``max_seq_len`` steps; the remaining steps are discarded.

    Args:
        df: DataFrame with one (possibly multi-channel) series per row.
        value_columns: List of column labels to extract, in time order.
        n_channels: Number of channels the feature axis is split into.
        max_seq_len: Longest per-channel context that is kept. Defaults to
            512, the limit this notebook uses for the model.

    Returns:
        A ``torch.float32`` tensor of shape
        ``[n_rows, n_channels, min(n_features // n_channels, max_seq_len)]``.

    Raises:
        ValueError: If ``n_channels`` or ``max_seq_len`` is smaller than 1, or
            if the number of features is not divisible by ``n_channels``.
    """
    if n_channels < 1:
        raise ValueError(f"n_channels must be a positive integer, got {n_channels}")
    if max_seq_len < 1:
        raise ValueError(f"max_seq_len must be a positive integer, got {max_seq_len}")

    # convert time series columns to numpy array
    data = df[value_columns].to_numpy()
    n_batchsize, n_context = data.shape
    print(f"Original data shape: {data.shape}")

    # confirm the reshaping
    if n_context % n_channels != 0:
        raise ValueError(f"Number of features ({n_context}) must be divisible by number of channels ({n_channels})")
    context_per_channel = n_context // n_channels

    # Split into channels BEFORE truncating. Truncating the flat feature axis
    # first would hand columns of channel 0 to the later channels whenever
    # n_channels > 1. For n_channels == 1 both orders give the same result.
    data_reshaped = data.reshape(n_batchsize, n_channels, context_per_channel)

    # check if sequence length is greater than the maximum and truncate if needed.
    if context_per_channel > max_seq_len:
        print(f"Warning: Context length per channel ({context_per_channel}) exceeds maximum of {max_seq_len}. "
              f"Truncating to {max_seq_len}.")
        data_reshaped = data_reshaped[:, :, :max_seq_len]

    print(f"Reshaped data shape: {data_reshaped.shape}")

    # Convert to a float32 torch tensor; torch.tensor always copies, so the
    # result never aliases the DataFrame's memory.
    data_tensor = torch.tensor(data_reshaped, dtype=torch.float32)
    print(f"Tensor shape: {data_tensor.shape}")

    return data_tensor  # [batchsize, channel, context_length]

In [44]:
# Every column whose label starts with 'time' is one step of the series.
value_columns = list(filter(lambda label: label.startswith('time'), df.columns))
data_tensor = prepare_data_from_df(df=df, value_columns=value_columns, n_channels=1)

Original data shape: (6943, 2016)
Reshaped data shape: (6943, 1, 512)
Tensor shape: torch.Size([6943, 1, 512])


In [None]:
###### DONT RUN. embedding of the data
# NOTE(review): this sends the whole `data_tensor` through the model in a
# single forward call. Presumably that is why the author marked it DONT RUN
# (memory / run time) — confirm, and batch the input if the full data set
# ever needs to be embedded.
from pprint import pprint

output = model(x_enc=data_tensor)
pprint(output)

In [46]:
# Draw a reproducible random subset of 100 subjects.
import numpy as np

# Fixed seed on NumPy's global generator so the same rows are picked each run.
np.random.seed(1)
random_indices = np.random.choice(df.index, size=100, replace=False)

# Pull the sampled rows by index label, then renumber them from 0.
df_subset = df.loc[random_indices]
df_subset = df_subset.reset_index(drop=True)

df_subset.head(10)

Unnamed: 0,seqn,gender,age,race,education,marital_status,pir,bmi,time1,time2,...,time2007,time2008,time2009,time2010,time2011,time2012,time2013,time2014,time2015,time2016
0,21329,2,46,3,3,1,0.83,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,41027,2,43,4,5,1,3.98,1,0,0,...,1,1,1,2,2,0,0,0,0,0
2,35501,2,54,3,5,1,5.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,26898,1,71,3,1,2,1.22,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,30634,2,25,1,5,5,4.99,0,0,0,...,0,0,0,0,0,0,1,1,2,1
5,39616,2,49,5,5,1,2.71,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,38773,1,64,3,4,1,4.92,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,30764,2,59,3,4,2,2.08,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,30419,2,75,4,4,3,1.31,1,1,0,...,0,0,1,0,1,0,0,0,0,1
9,23916,2,48,1,2,1,1.85,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# reshaping for the subset data 
import numpy as np

def prepare_data_from_subset(df_subset, value_columns, n_channels=1, max_seq_len=512):
    """Convert the time-series columns of a subset frame into a model-ready tensor.

    The selected columns are read as ``[n_rows, n_features]`` and split
    channel-major into ``n_channels`` equally long runs. Each channel is then
    limited to its first ``max_seq_len`` steps; later steps are dropped.

    Args:
        df_subset: DataFrame with one (possibly multi-channel) series per row.
        value_columns: List of column labels to extract, in time order.
        n_channels: Number of channels the feature axis is split into.
        max_seq_len: Longest per-channel context that is kept. Defaults to
            512, the limit this notebook uses for the model.

    Returns:
        A ``torch.float32`` tensor of shape
        ``[n_rows, n_channels, min(n_features // n_channels, max_seq_len)]``.

    Raises:
        ValueError: If ``n_channels`` or ``max_seq_len`` is smaller than 1, or
            if the number of features is not divisible by ``n_channels``.
    """
    if n_channels < 1:
        raise ValueError(f"n_channels must be a positive integer, got {n_channels}")
    if max_seq_len < 1:
        raise ValueError(f"max_seq_len must be a positive integer, got {max_seq_len}")

    # convert time series columns to numpy array
    data = df_subset[value_columns].to_numpy()
    n_batchsize, n_context = data.shape
    print(f"Original data shape: {data.shape}")

    # confirm the reshaping
    if n_context % n_channels != 0:
        raise ValueError(f"Number of features ({n_context}) must be divisible by number of channels ({n_channels})")
    context_per_channel = n_context // n_channels

    # Reshape first, truncate second: cutting the flat feature axis before the
    # channel split would mix columns of different channels when
    # n_channels > 1. With a single channel the result is unchanged.
    data_reshaped = data.reshape(n_batchsize, n_channels, context_per_channel)

    # check if sequence length is greater than the maximum and truncate if needed.
    if context_per_channel > max_seq_len:
        print(f"Warning: Context length per channel ({context_per_channel}) exceeds maximum of {max_seq_len}. "
              f"Truncating to {max_seq_len}.")
        data_reshaped = data_reshaped[:, :, :max_seq_len]

    print(f"Reshaped data shape: {data_reshaped.shape}")

    # Convert to a float32 torch tensor (always a copy of the numpy data).
    data_tensor = torch.tensor(data_reshaped, dtype=torch.float32)
    print(f"Tensor shape: {data_tensor.shape}")

    return data_tensor  # [batchsize, channel, context_length]

In [50]:
# Same 'time*' columns as for the full frame; the subset shares df's columns.
value_columns = list(filter(lambda label: label.startswith('time'), df.columns))
subset_tensor = prepare_data_from_df(df=df_subset, value_columns=value_columns, n_channels=1)

Original data shape: (100, 2016)
Reshaped data shape: (100, 1, 512)
Tensor shape: torch.Size([100, 1, 512])


In [20]:
# embedding of the subset
from pprint import pprint

# Inference only: put the model in eval mode so its Dropout layers are
# inactive, and skip autograd bookkeeping. This mirrors the
# eval()/no_grad() pattern of the later inference cell.
model.eval()
with torch.no_grad():
    output = model(x_enc=subset_tensor)
pprint(output)

  return fn(*args, **kwargs)


TimeseriesOutputs(forecast=None,
                  anomaly_scores=None,
                  logits=None,
                  labels=None,
                  input_mask=tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]),
                  pretrain_mask=None,
                  reconstruction=None,
                  embeddings=tensor([[-0.0031,  0.0043, -0.0594,  ..., -0.0378,  0.0567,  0.0157],
        [-0.0179,  0.0409, -0.0476,  ..., -0.0444,  0.0335,  0.0060],
        [-0.0641,  0.0223, -0.0463,  ..., -0.0526,  0.0805, -0.0118],
        ...,
        [-0.0257,  0.0272, -0.0355,  ..., -0.0311,  0.0469,  0.0067],
        [-0.0399, -0.0048, -0.0713,  ..., -0.0355,  0.0164, -0.0327],
        [-0.0287,  0.0184, -0.0437,  ..., -0.0207,  0.0453, -0.0491]]),
                  metadata='mean',
      

In [22]:
# extract embedding results as a NumPy array (detached from autograd, on CPU)
embeddings = output.embeddings
embeddings_np = embeddings.detach().cpu().numpy()

# covariate columns: everything up to and including 'bmi' in the subset frame
original_cols = df_subset.loc[:, :'bmi'].columns.tolist()
# embedding columns are labelled by their integer position
embedding_cols = list(range(embeddings_np.shape[1]))

# Build the combined frame once, already in the final order (covariates
# first, embedding dimensions after). Rows are aligned on the shared index.
embeddings_df = pd.concat(
    [df_subset[original_cols], pd.DataFrame(embeddings_np, columns=embedding_cols)],
    axis=1,
)

embeddings_df.head()

Unnamed: 0,seqn,gender,age,race,education,marital_status,pir,bmi,0,1,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,21329,2,46,3,3,1,0.83,1,-0.003087,0.004301,...,-0.002942,0.002535,-0.029975,0.030609,-0.057033,-0.004992,-0.018127,-0.037825,0.056666,0.015719
1,41027,2,43,4,5,1,3.98,1,-0.017937,0.04093,...,-0.028096,0.01056,-0.037647,0.014155,-0.028075,0.003573,-0.008654,-0.044398,0.033527,0.005961
2,35501,2,54,3,5,1,5.0,1,-0.064108,0.022306,...,-0.020523,0.022452,-0.049166,0.010092,-0.04629,-0.004279,0.002121,-0.052649,0.080519,-0.011789
3,26898,1,71,3,1,2,1.22,0,-0.071492,0.018901,...,-0.008691,0.020025,-0.007206,-0.004514,-0.041426,-0.009954,-0.02037,-0.003807,0.051596,-0.034902
4,30634,2,25,1,5,5,4.99,0,-0.046053,0.025882,...,0.003497,-0.000622,-0.030226,-0.04241,0.020231,-0.005351,-0.013401,-0.026792,0.0246,-0.033693


In [24]:
# Save the covariates + full embedding table as CSV in the current working
# directory. An existing file of the same name is overwritten.
embeddings_df.to_csv("embeddings_moment_subset1024.csv")

In [26]:
# reducing dimension 
def reduce_dimension(embedding, dim=50):
    """Keep only the leading ``dim`` columns of a 2-D embedding array.

    This is plain truncation along the second axis: nothing is fitted or
    projected, and the columns after ``dim`` are simply dropped. If the array
    has fewer than ``dim`` columns, all of them are returned.

    Args:
        embedding: Array indexed as ``[row, column]``.
        dim: Number of leading columns to keep.

    Returns:
        The slice ``embedding[:, :dim]``.
    """
    leading_columns = slice(None, dim)
    return embedding[:, leading_columns]
reduced_embeddings_np = reduce_dimension(embeddings_np)

# covariate columns: everything up to and including 'bmi' in the subset frame
original_cols = df_subset.loc[:, :'bmi'].columns.tolist()
# reduced embedding columns are labelled by their integer position
embedding_cols = list(range(reduced_embeddings_np.shape[1]))

# Build the combined frame once, already in the final order (covariates
# first, embedding dimensions after). Rows are aligned on the shared index.
red_embeddings_df = pd.concat(
    [df_subset[original_cols], pd.DataFrame(reduced_embeddings_np, columns=embedding_cols)],
    axis=1,
)

red_embeddings_df.head()

Unnamed: 0,seqn,gender,age,race,education,marital_status,pir,bmi,0,1,...,40,41,42,43,44,45,46,47,48,49
0,21329,2,46,3,3,1,0.83,1,-0.003087,0.004301,...,-0.01585,-0.009552,0.010179,0.013681,-0.048924,-0.849025,-0.031381,-0.148296,0.001853,-0.052792
1,41027,2,43,4,5,1,3.98,1,-0.017937,0.04093,...,0.01351,0.005174,-0.035583,-0.003567,-0.068727,-0.899353,-0.073769,-0.13018,0.006322,-0.03242
2,35501,2,54,3,5,1,5.0,1,-0.064108,0.022306,...,-0.00021,0.033271,-0.01592,0.00879,-0.051048,-0.819557,-0.088359,-0.126066,-0.005064,-0.022679
3,26898,1,71,3,1,2,1.22,0,-0.071492,0.018901,...,0.028872,0.025953,0.006941,-0.003139,-0.05638,-0.877287,-0.072296,-0.139279,-0.038927,-0.037218
4,30634,2,25,1,5,5,4.99,0,-0.046053,0.025882,...,0.078069,-0.002142,0.009237,0.027928,-0.037717,-0.816038,-0.050357,-0.145099,-0.034092,-0.046128


In [28]:
# Save the covariates + truncated embedding table as CSV in the current
# working directory. An existing file of the same name is overwritten.
red_embeddings_df.to_csv("embeddings_moment_subset50.csv")

In [None]:
# Visualize five example series side by side, one panel each.
# NOTE(review): `n_samples`, `y`, `seq_len` and `c` are not assigned in any
# visible cell of this notebook, so this cell fails on a fresh kernel unless
# they are defined elsewhere — confirm where they are meant to come from.
import numpy as np
import matplotlib.pyplot as plt

fig, axs = plt.subplots(1, 5, figsize=(30, 6), sharey=True)
# The return value of flatten() is discarded, so this line has no effect.
axs.flatten()
# NOTE(review): only five axes exist; this range must yield at most five
# indices, each valid for `y` — verify for the actual `n_samples`.
for i, idx in enumerate(np.arange(0, n_samples+1, n_samples//4-1)):
    axs[i].plot(y[idx].squeeze().numpy())
    # x tick marks every 128 steps, labelled with their own positions
    axs[i].set_xticks(
        ticks=np.arange(0, seq_len+1, 128), 
        labels=np.arange(0, seq_len+1, 128), 
        fontdict={"fontsize" : 16}
    )
    # Panel title shows the first column of `c` for this sample, 2 decimals.
    axs[i].set_title(
        "Frequency: {:.2f}".format(c[:, 0][idx].squeeze().numpy(), ),
        fontsize=16
    )
# With sharey=True the panels share one y axis, so ticks are set once.
axs[0].set_yticks(
    ticks=np.arange(-1.5, 1.5, 0.5), 
    labels=np.arange(-1.5, 1.5, 0.5),
    fontdict={"fontsize" : 16}
)
plt.show()

In [None]:
# Run the model on `y` on the CPU, in inference mode.
# NOTE(review): `y` is not assigned in any visible cell of this notebook —
# confirm where it is meant to come from.
import torch

device = torch.device("cpu") # CUDA not available 

# Move the model and the input to the same device.
model.to(device)
y = y.to(device)

# eval mode: Dropout layers become inactive.
model.eval()

# no_grad: no autograd graph is recorded for this forward pass.
with torch.no_grad():
    outputs = model(x_enc=y)

In [None]:
# Project the embeddings of `outputs` onto two principal components.
from sklearn.decomposition import PCA

# Rebinds `embeddings` to a NumPy copy of the embeddings in `outputs`
# (detached from autograd, on CPU).
embeddings = outputs.embeddings.detach().cpu().numpy()

# Perform PCA on the embeddings
embeddings_manifold = PCA(n_components=2).fit_transform(embeddings)

In [None]:
# Scatter plot of the 2-D PCA projection, coloured by the first column of `c`.
# NOTE(review): `c` and `synthetic_dataset` are not assigned in any visible
# cell of this notebook — confirm where they are meant to come from.
# The title is a raw string: a plain string treats `\s`, `\p` and `\e` as
# invalid escape sequences, which Python warns about. The rendered text is
# unchanged. The former f-prefix had no placeholders to fill.
plt.title(r"$y = \sin(2c \pi x) + \epsilon$", fontsize=20)
plt.scatter(
    embeddings_manifold[:, 0],
    embeddings_manifold[:, 1],
    c=c[:, 0].squeeze().numpy(),
    cmap='magma'
)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Discrete colour bar with one boundary per unit step across freq_range
# (upper bound included via the +1).
plt.colorbar(
    boundaries=np.arange(
        synthetic_dataset.freq_range[0],
        synthetic_dataset.freq_range[1]+1, 1)
)
plt.show()