In [2]:
pip install momentfm

Note: you may need to restart the kernel to use updated packages.


In [None]:
# alternative
!pip install git+https://github.com/moment-timeseries-foundation-model/moment.git

In [4]:
!pip install numpy pandas scikit-learn matplotlib tqdm



In [2]:
# Load the pretrained MOMENT checkpoint through the momentfm pipeline wrapper.
from momentfm import MOMENTPipeline

model = MOMENTPipeline.from_pretrained(
    "AutonLab/MOMENT-1-large", 
    model_kwargs={'task_name': 'embedding'}, # We are loading the model in `embedding` mode to learn representations
    local_files_only=True,  # Only look at local files (i.e., do not try to download the model).
)

  torch.utils._pytree._register_pytree_node(


In [4]:
# Finish setting up the pipeline, then print the module tree for inspection.
# NOTE(review): presumably init() builds the task-specific parts of the model — confirm against momentfm.
model.init()
print(model)

MOMENTPipeline(
  (normalizer): RevIN()
  (tokenizer): Patching()
  (patch_embedding): PatchEmbedding(
    (value_embedding): Linear(in_features=8, out_features=1024, bias=False)
    (position_embedding): PositionalEmbedding()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
  



In [6]:
# Tally the scalar weights held by the encoder, one parameter tensor at a time.
num_params = 0
for encoder_param in model.encoder.parameters():
    num_params += encoder_param.numel()
print(f"Number of parameters: {num_params}")

Number of parameters: 341231104


In [8]:
# Load the NHANES data (wide format); the first CSV column is used as the row index.
import torch
import pandas as pd

df = pd.read_csv("data/data_wide.csv", index_col=0)
# Preview the first three rows.
df.head(3)

Unnamed: 0,seqn,gender,age,race,education,married,pir,bmi,time1,time2,...,time2007,time2008,time2009,time2010,time2011,time2012,time2013,time2014,time2015,time2016
1,21009,1,55,3,3,1,3.79,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,21010,2,52,3,4,6,1.24,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21012,1,63,4,3,6,0.89,0,1,0,...,1,1,0,0,0,1,1,0,0,0


In [10]:
########### reshaping the data with L2 normalization 
import numpy as np

def normalize_l2(x):
    """Scale a vector, or every row of a 2-D array, to unit L2 (Euclidean) norm.

    Vectors/rows whose norm is below 1e-10 are treated as all-zero and are
    left unscaled, which avoids a division by zero.

    Parameters
    ----------
    x : array-like
        A 1-D vector or a 2-D array of shape (n_rows, n_features).

    Returns
    -------
    numpy.ndarray
        For 1-D input, the scaled vector (or the input array unchanged when
        its norm is effectively zero). For 2-D input, a new float64 array of
        the same shape in which every row with a non-negligible norm has unit
        norm and every other row keeps its original values.
    """
    eps = 1e-10  # norms below this threshold are considered zero
    x = np.array(x)

    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm < eps:
            return x
        return x / norm

    # One norm per row, kept as a column so it broadcasts against x.
    norms = np.linalg.norm(x, 2, axis=1, keepdims=True)

    # Start from a float64 copy so rows with a ~zero norm keep their original
    # values, then overwrite only the rows that can be divided safely.
    normalized = x.astype(np.float64)
    np.divide(x, norms, out=normalized, where=norms >= eps)
    return normalized

def prepare_data_from_df(df, value_columns, n_channels=1, max_seq_len=512):
    """Turn wide time-series columns into a float32 tensor of shape [batch, channel, context].

    The selected columns are split into `n_channels` equal, consecutive groups
    (the first group of columns is channel 0, the next is channel 1, ...).
    Each channel is truncated to its first `max_seq_len` steps if it is longer,
    and every sample is then L2-normalised over all of its retained values.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table, one row per sample.
    value_columns : list
        Column labels holding the series values, in temporal order.
    n_channels : int, default 1
        Number of channels the columns are split into.
    max_seq_len : int, default 512
        Maximum number of time steps kept per channel.

    Returns
    -------
    torch.Tensor
        float32 tensor of shape [batchsize, n_channels, context_length].

    Raises
    ------
    ValueError
        If `n_channels` is smaller than 1, or the number of value columns is
        not divisible by `n_channels`.
    """
    if n_channels < 1:
        raise ValueError(f"Number of channels must be at least 1, got {n_channels}")

    # convert time series columns to numpy array
    data = df[value_columns].values
    n_batchsize, n_context = data.shape
    print(f"Original data shape: {data.shape}")

    # confirm the reshaping
    if n_context % n_channels != 0:
        raise ValueError(f"Number of features ({n_context}) must be divisible by number of channels ({n_channels})")
    context_per_channel = n_context // n_channels

    # Split into channels BEFORE truncating: cutting the flat column range first
    # would fill the later channels with values that belong to earlier ones.
    data = data.reshape(n_batchsize, n_channels, context_per_channel)

    # check if sequence length is greater than the maximum and truncate if needed.
    if context_per_channel > max_seq_len:
        print(f"Warning: Context length per channel ({context_per_channel}) exceeds maximum of {max_seq_len}. "
              f"Truncating to {max_seq_len}.")
        data = data[:, :, :max_seq_len]
        context_per_channel = max_seq_len

    # Apply L2 normalization per sample, over all retained values of that sample.
    flat = data.reshape(n_batchsize, n_channels * context_per_channel)
    flat = normalize_l2(flat)

    # reshape the data into [batchsize, channel, context]
    data_reshaped = flat.reshape(n_batchsize, n_channels, context_per_channel)
    print(f"Reshaped data shape: {data_reshaped.shape}")

    # Convert to a float32 torch tensor
    data_tensor = torch.as_tensor(data_reshaped, dtype=torch.float32)
    print(f"Tensor shape: {data_tensor.shape}")

    return data_tensor  # [batchsize, channel, context_length]

In [12]:
# Every column whose name starts with 'time' is treated as one step of the series.
value_columns = [col for col in df.columns if col.startswith('time')]
# Single channel. The shapes printed by this call show 2016 time columns going in and
# 512 steps coming out, i.e. only the first 512 time points of each series are kept.
data_tensor = prepare_data_from_df(df, value_columns, n_channels=1)

Original data shape: (6943, 2016)
Reshaped data shape: (6943, 1, 512)
Tensor shape: torch.Size([6943, 1, 512])


In [None]:
###### DONT RUN. Embeds the whole of data_tensor in a single forward pass.
from pprint import pprint

output = model(x_enc=data_tensor)
pprint(output)

In [28]:
# Draw a reproducible random subset of 300 samples from data_tensor.
import random

# random.sample uses Python's `random` module, so that is the generator that
# must be seeded; np.random.seed on its own has no effect on it.
random.seed(1)
np.random.seed(1)  # kept so any NumPy-based randomness stays seeded as before
random_indices = random.sample(range(data_tensor.shape[0]), 300)
subset_data = data_tensor[random_indices]

print(f"Subset shape: {subset_data.shape}")

Subset shape: torch.Size([300, 1, 512])


In [16]:
# Embed the subset with MOMENT.
from pprint import pprint

import torch

# Inference only: eval() switches off dropout so repeated runs give the same
# embeddings, and no_grad() avoids building an autograd graph for the forward pass.
model.eval()
with torch.no_grad():
    output = model(x_enc=subset_data)
pprint(output)

  return fn(*args, **kwargs)


TimeseriesOutputs(forecast=None,
                  anomaly_scores=None,
                  logits=None,
                  labels=None,
                  input_mask=tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]]),
                  pretrain_mask=None,
                  reconstruction=None,
                  embeddings=tensor([[-0.0398,  0.0628, -0.0473,  ..., -0.0509,  0.0527,  0.0226],
        [-0.0466,  0.0181, -0.0629,  ..., -0.0323,  0.0591,  0.0176],
        [-0.0352,  0.0474, -0.0512,  ..., -0.0361,  0.0505,  0.0264],
        ...,
        [-0.0719,  0.0227, -0.0586,  ..., -0.0125,  0.0874, -0.0066],
        [-0.0287,  0.0332, -0.0436,  ..., -0.0798,  0.0474, -0.0012],
        [-0.0706, -0.0038, -0.0497,  ..., -0.0576,  0.0249, -0.0204]]),
                  metadata='mean',
      

In [46]:
# Extract the embedding matrix as a NumPy array (one row per sampled row of df).
embeddings = output.embeddings
embeddings_np = embeddings.detach().cpu().numpy()

# Rows of the original dataset that were sampled, re-indexed 0..n-1 so they
# line up positionally with the embedding rows.
df_subset = df.iloc[random_indices].reset_index(drop=True)

# Guard against stale state: the join below is purely positional, so the
# sampled rows and the embeddings must come from the same draw.
assert len(df_subset) == embeddings_np.shape[0], (
    "random_indices and output are out of sync; "
    "re-run the sampling and embedding cells in order"
)

# Columns up to and including 'bmi' first, then the embedding dimensions 0..D-1.
original_cols = df_subset.loc[:, :'bmi'].columns.tolist()
embeddings_cols = list(range(embeddings_np.shape[1]))
embeddings_df = pd.concat(
    [df_subset.loc[:, :'bmi'], pd.DataFrame(embeddings_np, columns=embeddings_cols)],
    axis=1,
)

# 1-based row labels
embeddings_df.index = range(1, len(embeddings_df) + 1)

embeddings_df.head()

Unnamed: 0,seqn,gender,age,race,education,married,pir,bmi,0,1,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
1,23801,2,41,1,3,5,2.18,0,-0.039757,0.062818,...,-0.047503,0.009559,-0.032484,0.025933,-0.00887,-0.025857,-0.045546,-0.050864,0.052668,0.022633
2,39147,1,60,4,2,5,2.73,1,-0.046569,0.018072,...,-0.04407,0.035174,-0.020534,-0.050973,-0.029079,-0.009815,-0.027949,-0.032295,0.059068,0.017603
3,22952,1,45,3,4,1,2.99,1,-0.035208,0.047367,...,-0.078315,0.041009,-0.022166,0.025798,-0.024566,0.01494,-0.031748,-0.036083,0.050512,0.026384
4,27293,1,50,5,1,3,3.6,1,-0.014122,0.014995,...,-0.004092,0.028698,-0.061429,0.026577,-0.066454,-0.012173,-0.008214,-0.007548,0.053773,0.037469
5,28316,1,21,4,3,5,1.83,1,-0.042227,0.046679,...,-0.042061,0.016599,-0.025376,0.019031,-0.008142,-0.003758,-0.022597,-0.062916,0.068851,-0.011816


In [48]:
# Save the full-width embeddings together with the leading columns (up to 'bmi') as CSV.
embeddings_df.to_csv("./data/embeddings_moment_subset1024.csv")

In [54]:
# Dimension reduction by truncation
def reduce_dimension(embedding, dim=50):
    """Keep only the first `dim` columns of a 2-D embedding matrix.

    This is plain column slicing, not a projection: columns beyond `dim` are
    dropped unchanged. If the matrix has fewer than `dim` columns, all of them
    are returned.
    """
    leading_columns = slice(None, dim)
    return embedding[:, leading_columns]
reduced_embeddings_np = reduce_dimension(embeddings_np)

# The join below is purely positional, so both pieces must have one row per sample.
assert len(df_subset) == reduced_embeddings_np.shape[0], (
    "df_subset and embeddings_np are out of sync; re-run the earlier cells in order"
)

# Columns up to and including 'bmi' first, then the retained embedding dimensions.
original_cols = df_subset.loc[:, :'bmi'].columns.tolist()
embedding_cols = list(range(reduced_embeddings_np.shape[1]))
red_embeddings_df = pd.concat(
    [df_subset.loc[:, :'bmi'], pd.DataFrame(reduced_embeddings_np, columns=embedding_cols)],
    axis=1,
)
# 1-based row labels
red_embeddings_df.index = range(1, len(red_embeddings_df) + 1)

red_embeddings_df.head()

Unnamed: 0,seqn,gender,age,race,education,married,pir,bmi,0,1,...,40,41,42,43,44,45,46,47,48,49
1,23801,2,41,1,3,5,2.18,0,-0.039757,0.062818,...,-0.006033,0.026869,-0.01692,-0.033945,-0.070028,-0.835683,-0.031737,-0.096679,0.006887,-0.007863
2,39147,1,60,4,2,5,2.73,1,-0.046569,0.018072,...,0.031063,0.012934,0.000944,-0.001351,-0.050381,-0.826628,-0.07511,-0.126727,-0.000692,-0.070777
3,22952,1,45,3,4,1,2.99,1,-0.035208,0.047367,...,0.013055,0.014157,-0.028954,0.004224,-0.083269,-0.814749,-0.06777,-0.098784,0.000378,-0.03135
4,27293,1,50,5,1,3,3.6,1,-0.014122,0.014995,...,-0.049818,0.004506,0.011823,-0.00937,-0.011301,-0.823884,-0.060171,-0.136132,-0.003282,-0.003729
5,28316,1,21,4,3,5,1.83,1,-0.042227,0.046679,...,0.004981,0.044711,-0.047999,-0.011382,-0.06075,-0.770914,-0.063353,-0.157909,-0.00511,-0.048352


In [42]:
# Save the reduced embeddings together with the leading columns (up to 'bmi') as CSV.
red_embeddings_df.to_csv("./data/embeddings_moment_subset50.csv")

In [None]:
# Visualize five example series side by side, each titled with a value taken from `c`.
# NOTE(review): n_samples, seq_len, y and c are not defined in any cell visible here —
# confirm where they come from before running this cell.
import numpy as np
import matplotlib.pyplot as plt

fig, axs = plt.subplots(1, 5, figsize=(30, 6), sharey=True)
# The return value is discarded, so this line has no effect on axs.
axs.flatten()
# Pick indices spread across [0, n_samples]; each one is plotted on its own axis.
for i, idx in enumerate(np.arange(0, n_samples+1, n_samples//4-1)):
    axs[i].plot(y[idx].squeeze().numpy())
    # x ticks every 128 steps
    axs[i].set_xticks(
        ticks=np.arange(0, seq_len+1, 128), 
        labels=np.arange(0, seq_len+1, 128), 
        fontdict={"fontsize" : 16}
    )
    # Title shows the first column of c for this sample, to two decimals.
    axs[i].set_title(
        "Frequency: {:.2f}".format(c[:, 0][idx].squeeze().numpy(), ),
        fontsize=16
    )
# Set the y ticks on the first axis only (the subplots are created with sharey=True).
axs[0].set_yticks(
    ticks=np.arange(-1.5, 1.5, 0.5), 
    labels=np.arange(-1.5, 1.5, 0.5),
    fontdict={"fontsize" : 16}
)
plt.show()

In [None]:
# Run the model on `y` in inference mode.
# NOTE(review): `y` is not defined in any cell visible here — confirm where it comes from.
import torch

device = torch.device("cpu") # CUDA not available 

# Move the model and the input to the same device.
model.to(device)
y = y.to(device)

# Evaluation mode, and no gradient tracking during the forward pass.
model.eval()

with torch.no_grad():
    outputs = model(x_enc=y)

In [None]:
from sklearn.decomposition import PCA

# Embeddings from `outputs` as a NumPy array; note this rebinds the name `embeddings`.
embeddings = outputs.embeddings.detach().cpu().numpy()

# Perform PCA on the embeddings, keeping the first two principal components
embeddings_manifold = PCA(n_components=2).fit_transform(embeddings)

In [None]:
# Scatter the 2-D PCA projection of the embeddings, coloured by the first column of `c`.
# NOTE(review): `c` and `synthetic_dataset` are not defined in any cell visible here —
# confirm where they come from before running this cell.
# Raw string: the LaTeX backslashes (\s, \p, \e) are not valid Python escape
# sequences and would otherwise raise an invalid-escape warning. The title text
# itself is unchanged, and no f-string is needed since nothing is interpolated.
plt.title(r"$y = \sin(2c \pi x) + \epsilon$", fontsize=20)
plt.scatter(
    embeddings_manifold[:, 0],
    embeddings_manifold[:, 1],
    c=c[:, 0].squeeze().numpy(),
    cmap='magma'
)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Discrete colour bar with one band per unit step across the dataset's frequency range.
plt.colorbar(
    boundaries=np.arange(
    synthetic_dataset.freq_range[0],
    synthetic_dataset.freq_range[1]+1, 1)
)
plt.show()
plt.show()