In [2]:
pip install momentfm

Note: you may need to restart the kernel to use updated packages.


In [None]:
# alternative
!pip install git+https://github.com/moment-timeseries-foundation-model/moment.git

In [4]:
!pip install numpy pandas scikit-learn matplotlib tqdm



In [6]:
from momentfm import MOMENTPipeline

model = MOMENTPipeline.from_pretrained(
    "AutonLab/MOMENT-1-large", 
    model_kwargs={'task_name': 'embedding'}, # We are loading the model in `embedding` mode to learn representations
    local_files_only=True,  # Whether or not to only look at local files (i.e., do not try to download the model).
)

  torch.utils._pytree._register_pytree_node(


In [7]:
model.init()
print(model)

MOMENTPipeline(
  (normalizer): RevIN()
  (tokenizer): Patching()
  (patch_embedding): PatchEmbedding(
    (value_embedding): Linear(in_features=8, out_features=1024, bias=False)
    (position_embedding): PositionalEmbedding()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
  



In [10]:
# Number of parameters in the encoder
num_params = sum(p.numel() for p in model.encoder.parameters())
print(f"Number of parameters: {num_params}")

Number of parameters: 341231104


In [12]:
# import NHANES data 
import torch
import pandas as pd

df = pd.read_csv("data/data_wide.csv", index_col=0)
df.head(3)

Unnamed: 0,seqn,gender,age,race,education,marital_status,pir,bmi,time1,time2,...,time2007,time2008,time2009,time2010,time2011,time2012,time2013,time2014,time2015,time2016
1,21009,1,55,3,3,1,3.79,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,21010,2,52,3,4,6,1.24,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21012,1,63,4,3,6,0.89,0,1,0,...,1,1,0,0,0,1,1,0,0,0


In [14]:
# reshaping for entire data
import numpy as np

def prepare_data_from_df(df, value_columns, seq_len=512):
    MAX_SEQ_LEN = 512
    
    # truncate if seq_len is too large  
    if seq_len > MAX_SEQ_LEN:
        print(f"Warning: seq_len ({seq_len}) exceeds MOMENT's maximum length of {MAX_SEQ_LEN}. "
              f"Truncating to {MAX_SEQ_LEN}.")
        seq_len = MAX_SEQ_LEN
    
    # convert time series columns to numpy array
    data = df[value_columns].values  # shape: [n_samples, seq_len]
    n_timesteps, n_channels = data.shape 
    print(f"Original data shape: {data.shape}")
    
    # sequence calculation  
    n_samples = n_timesteps // seq_len
    print(f"Number of sequences: {n_samples}")
    
    # trim data to fit complete sequences
    trimmed_length = n_samples * seq_len
    data = data[:trimmed_length]
    print(f"Trimmed data shape: {data.shape}")
    
    # reshape the data into [n_samples, seq_len, n_channels]
    data_reshaped = data.reshape(n_samples, seq_len, n_channels)
    print(f"Reshaped data shape: {data_reshaped.shape}")
    
    # Convert to torch tensor 
    data_tensor = torch.FloatTensor(data_reshaped).permute(0, 2, 1)
    print(f"Tensor shape after permute: {data_tensor.shape}")
    
    return data_tensor

In [16]:
value_columns = [col for col in df.columns if col.startswith('time')]
data_tensor = prepare_data_from_df(df, value_columns, seq_len=512)

Original data shape: (6943, 2016)
Number of sequences: 13
Trimmed data shape: (6656, 2016)
Reshaped data shape: (13, 512, 2016)
Tensor shape after permute: torch.Size([13, 2016, 512])


In [18]:
# Create a random subset with 100 subjects
import numpy as np

np.random.seed(1)
random_indices = np.random.choice(df.index, size=100, replace=False)
df_subset = df.loc[random_indices].reset_index(drop=True)

df_subset.head(10)

Unnamed: 0,seqn,gender,age,race,education,marital_status,pir,bmi,time1,time2,...,time2007,time2008,time2009,time2010,time2011,time2012,time2013,time2014,time2015,time2016
0,21329,2,46,3,3,1,0.83,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,41027,2,43,4,5,1,3.98,1,0,0,...,1,1,1,2,2,0,0,0,0,0
2,35501,2,54,3,5,1,5.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,26898,1,71,3,1,2,1.22,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,30634,2,25,1,5,5,4.99,0,0,0,...,0,0,0,0,0,0,1,1,2,1
5,39616,2,49,5,5,1,2.71,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,38773,1,64,3,4,1,4.92,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,30764,2,59,3,4,2,2.08,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,30419,2,75,4,4,3,1.31,1,1,0,...,0,0,1,0,1,0,0,0,0,1
9,23916,2,48,1,2,1,1.85,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# reshaping for the subset data 
import numpy as np

def prepare_data_from_subset(df_subset, value_columns, seq_len=512):
    MAX_SEQ_LEN = 512
    
    # truncate if seq_len is too large  
    if seq_len > MAX_SEQ_LEN:
        print(f"Warning: seq_len ({seq_len}) exceeds MOMENT's maximum length of {MAX_SEQ_LEN}. "
              f"Truncating to {MAX_SEQ_LEN}.")
        seq_len = MAX_SEQ_LEN
    
    # convert time series columns to numpy array
    data = df_subset[value_columns].values  # shape: [n_samples, n_features]
    n_samples, n_features = data.shape 
    print(f"Original data shape: {data.shape}")

    # sequence calculation  
    n_channels = n_features // seq_len  # 2016 // 512 = 3.9375
    n_features_to_use = n_channels * seq_len  # Use only complete sequences
    print(f"Number of sequences: {n_features_to_use}")

    # Trim features to make it divisible by seq_len
    data = data[:, :n_features_to_use]  # Keep only first 1536 features (3 * 512)
    
    # reshape the data into [n_samples, seq_len, features_per_seq]
    data_reshaped = data.reshape(n_samples, -1, seq_len)
    print(f"Reshaped data shape: {data_reshaped.shape}")  # Should be [100, 3, 512]
    
    # Convert to torch tensor [n_samples, seq_len, features_per_seq]
    data_tensor = torch.FloatTensor(data_reshaped)
    print(f"Final tensor shape: {data_tensor.shape}")
    
    return data_tensor

In [22]:
value_columns = [col for col in df.columns if col.startswith('time')]
subset_tensor = prepare_data_from_subset(df_subset, value_columns, seq_len=512)

Original data shape: (100, 2016)
Number of sequences: 1536
Reshaped data shape: (100, 3, 512)
Final tensor shape: torch.Size([100, 3, 512])


In [24]:
# embedding of the subset 
from pprint import pprint

output = model(x_enc=subset_tensor)
pprint(output)

  return fn(*args, **kwargs)


TimeseriesOutputs(forecast=None,
                  anomaly_scores=None,
                  logits=None,
                  labels=None,
                  input_mask=tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]),
                  pretrain_mask=None,
                  reconstruction=None,
                  embeddings=tensor([[-0.0354,  0.0161, -0.0563,  ..., -0.0327,  0.0278, -0.0040],
        [-0.0329,  0.0378, -0.0479,  ..., -0.0616,  0.0425, -0.0033],
        [-0.0604,  0.0210, -0.0378,  ..., -0.0562,  0.0548, -0.0362],
        ...,
        [-0.0298,  0.0150, -0.0464,  ..., -0.0202,  0.0452, -0.0013],
        [-0.0498,  0.0053, -0.0689,  ..., -0.0247,  0.0309, -0.0252],
        [-0.0347,  0.0018, -0.0715,  ..., -0.0377,  0.0464, -0.0225]]),
                  metadata='mean',
      

In [46]:
# extract embedding results 
embeddings = output.embeddings
embeddings_np = embeddings.detach().cpu().numpy()

# convert to dataframe 
embeddings_df = pd.DataFrame(embeddings_np)

# add seqn from the original data 
embeddings_df['seqn'] = df_subset['seqn'].values

df_subset['seqn'].head()

0    21329
1    41027
2    35501
3    26898
4    30634
Name: seqn, dtype: int64

In [42]:
# extract embedding results 
embeddings = output.embeddings
embeddings_np = embeddings.detach().cpu().numpy()

# reshape from 3D to 2D 
n_samples, n_channels, seq_len = embeddings_np.shape
embeddings_2d = embeddings_np.reshape(n_samples, n_channels * seq_len)

# convert to dataframe 
embeddings_df = pd.DataFrame(embeddings_2d)

# add seqn from the original data 
embeddings_df['seqn'] = df_subset['seqn'].values

df_subset['seqn'].head()

ValueError: not enough values to unpack (expected 3, got 2)

In [48]:
# save embeddings
embeddings_df.to_csv("embeddings_moment_subset.csv")

In [40]:
# example from the MOMENT github
from pprint import pprint
import torch

# takes in tensor of shape [batchsize, n_channels, context_length]
x2 = torch.randn(16, 1, 512)
output2 = model(x_enc=x2)
pprint(output2)

  return fn(*args, **kwargs)


TimeseriesOutputs(forecast=None,
                  anomaly_scores=None,
                  logits=None,
                  labels=None,
                  input_mask=tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]),
                  pretrain_mask=None,
                  reconstruction=None,
                  embeddings=tensor([[-0.0734,  0.0262, -0.0285,  ...,  0.0253,  0.0104, -0.0256],
        [-0.0594,  0.0318, -0.0520,  ..., -0.0105,  0.0315, -0.0190],
        [-0.0837,  0.0390, -0.0605,  ..., -0.0186,  0.0378, -0.0244],
        ...,
        [-0.0636,  0.0517, -0.0606,  ..., -0.0151,  0.0408, -0.0017],
        [-0.0691,  0.0250, -0.0576,  ...,  0.0018,  0.0342,  0.0124],
        [-0.0554,  0.0218, -0.0521,  ..., -0.0201,  0.0335,  0.0247]]),
                  metadata='mean',
      

In [None]:
from momentfm.data.synthetic_data import SyntheticDataset

synthetic_dataset = SyntheticDataset(
    n_samples=1024,
    freq=1,
    freq_range=(1, 32), 
    noise_mean=0.,
    noise_std=0.1,
    random_seed=13
)
y, c = synthetic_dataset.gen_sinusoids_with_varying_freq()
n_samples = synthetic_dataset.n_samples
seq_len = synthetic_dataset.seq_len

In [None]:
# visualize 
import numpy as np
import matplotlib.pyplot as plt

fig, axs = plt.subplots(1, 5, figsize=(30, 6), sharey=True)
axs.flatten()
for i, idx in enumerate(np.arange(0, n_samples+1, n_samples//4-1)):
    axs[i].plot(y[idx].squeeze().numpy())
    axs[i].set_xticks(
        ticks=np.arange(0, seq_len+1, 128), 
        labels=np.arange(0, seq_len+1, 128), 
        fontdict={"fontsize" : 16}
    )
    axs[i].set_title(
        "Frequency: {:.2f}".format(c[:, 0][idx].squeeze().numpy(), ),
        fontsize=16
    )
axs[0].set_yticks(
    ticks=np.arange(-1.5, 1.5, 0.5), 
    labels=np.arange(-1.5, 1.5, 0.5),
    fontdict={"fontsize" : 16}
)
plt.show()

In [None]:
import torch

device = torch.device("cpu") # CUDA not available 

model.to(device)
y = y.to(device)

model.eval()

with torch.no_grad():
    outputs = model(x_enc=y)

In [None]:
from sklearn.decomposition import PCA

embeddings = outputs.embeddings.detach().cpu().numpy()

# Perform PCA on the embeddings
embeddings_manifold = PCA(n_components=2).fit_transform(embeddings)

In [None]:
plt.title(f"$y = \sin(2c \pi x) + \epsilon$", fontsize=20)
plt.scatter(
    embeddings_manifold[:, 0], 
    embeddings_manifold[:, 1],
    c=c[:, 0].squeeze().numpy(),
    cmap='magma'
)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.colorbar(
    boundaries=np.arange(
    synthetic_dataset.freq_range[0],
    synthetic_dataset.freq_range[1]+1, 1)
)
plt.show()