In [None]:
pip install momentfm

In [None]:
# get the MOMENT model 

from momentfm import MOMENTPipeline

# Load the pretrained MOMENT-1-large checkpoint.
#   * task_name='embedding' puts the model in `embedding` mode, i.e. it is
#     used to learn representations.
#   * local_files_only=True restricts the lookup to local files (no download
#     of the model is attempted).
model = MOMENTPipeline.from_pretrained(
    "AutonLab/MOMENT-1-large",
    local_files_only=True,
    model_kwargs=dict(task_name='embedding'),
)

# Initialise the pipeline, then show the resulting module structure.
model.init()
print(model)

In [None]:
# Total element count over every parameter tensor of the encoder.
num_params = sum(map(lambda tensor: tensor.numel(), model.encoder.parameters()))
print(f"Number of parameters: {num_params}")

In [None]:
import torch
import pandas as pd
from pprint import pprint

In [None]:
# Load the preprocessed tables; the first CSV column is used as the row index.

# raw 1min
df_min1 = pd.read_csv(filepath_or_buffer="002_data/data_raw_full.csv", index_col=0)

# aggregated 5min
df_min5 = pd.read_csv(filepath_or_buffer="002_data/data_wide.csv", index_col=0)

# aggregated 20min
df_min20 = pd.read_csv(filepath_or_buffer="002_data/data_wide_min20.csv", index_col=0)

In [None]:
# Prepare the data as chunks that fit the MOMENT input length (at most 512 time steps per chunk)

def prepare_chunks(df, value_columns, n_channels=1, max_len=512):
    """Cut wide-format series into windows of at most ``max_len`` time steps.

    Every row of ``df[value_columns]`` is one sample. A row is interpreted as
    ``n_channels`` equally long series stored back to back (all values of
    channel 0, then all values of channel 1, ...). The time axis of each
    channel is split into consecutive windows of ``max_len`` steps; a shorter
    trailing window is kept as the final chunk.

    Args:
        df: DataFrame with one sample per row.
        value_columns: Columns of ``df`` that hold the series values.
        n_channels: Number of channels packed into each row.
        max_len: Maximum number of time steps per chunk.

    Returns:
        List of float32 tensors shaped ``(n_rows, n_channels, chunk_len)``.
        ``chunk_len`` equals ``max_len`` for every full window and the
        remainder length for the optional last chunk. The list is empty when
        ``value_columns`` selects no columns.

    Raises:
        ValueError: If ``n_channels`` or ``max_len`` is not positive, or if the
            number of value columns is not divisible by ``n_channels``.
    """
    if n_channels < 1 or max_len < 1:
        raise ValueError("n_channels and max_len must be positive integers")

    data = df[value_columns].values
    n_batch, n_context = data.shape

    # context per channel
    if n_context % n_channels != 0:
        raise ValueError("n_context must be divisible by n_channels")
    context_per_channel = n_context // n_channels

    # Separate the channel axis BEFORE windowing. Slicing max_len columns
    # first and reshaping to (n_batch, n_channels, max_len) afterwards only
    # has the right number of elements when n_channels == 1.
    series = data.reshape(n_batch, n_channels, context_per_channel)

    # Full windows first, then the shorter trailing window if there is one.
    n_full_chunks = context_per_channel // max_len
    chunks = []
    for start in range(0, context_per_channel, max_len):
        window = series[:, :, start:start + max_len]
        chunks.append(torch.tensor(window, dtype=torch.float32))

    remainder = context_per_channel % max_len
    if remainder > 0:
        print(f"Added partial chunk {n_full_chunks} with length {remainder}")

    return chunks

In [None]:
# ----------------------------------------------------------------------
# Embeddings for the 5 min aggregated data
# ----------------------------------------------------------------------

model.init()

# The series values are stored in the columns whose name starts with 'time'.
val_col_min5 = list(filter(lambda name: name.startswith('time'), df_min5.columns))

# Only the first chunk returned by prepare_chunks is embedded.
df_min5_tensor = prepare_chunks(df_min5, val_col_min5, n_channels=1)
df_min5_c0 = df_min5_tensor[0]

# The rows are embedded in two subsets (split at row 3500) for time/efficiency.
df_min5_s1, df_min5_s2 = df_min5_c0[:3500], df_min5_c0[3500:]

with torch.no_grad():
    output_df_min5_s1 = model(x_enc=df_min5_s1)
    output_df_min5_s2 = model(x_enc=df_min5_s2)

min5_embeddings_s1 = output_df_min5_s1.embeddings
min5_embeddings_s2 = output_df_min5_s2.embeddings

# Stack both subsets back together in row order and convert to NumPy.
min5_embeddings = torch.cat((min5_embeddings_s1, min5_embeddings_s2), dim=0)
min5_embeddings_np = min5_embeddings.detach().cpu().numpy()

# Put the demographic columns to the left of the embedding dimensions.
demo_cols = ['seqn', 'gender', 'age', 'race', 'education', 'marital_status', 'pir', 'bmi']
min5_data_all = df_min5.reset_index(drop=True)
min5_original_cols = demo_cols
min5_embedding_cols = list(range(min5_embeddings_np.shape[1]))

min5_embeddings_df = pd.concat(
    [min5_data_all[demo_cols], pd.DataFrame(min5_embeddings_np)],
    axis=1,
)
min5_embeddings_df.columns = min5_original_cols + min5_embedding_cols
# Rows are numbered starting at 1.
min5_embeddings_df.index = range(1, len(min5_embeddings_df) + 1)


In [None]:
# Write the 5 min embedding table to CSV without the row index.
min5_embeddings_df.to_csv(
    path_or_buf="./002_data/embeddings_min5_moment1024.csv",
    index=False,
)

In [None]:
######################## For 1 min raw data ########################
########################################################################

model.init()

# The raw 1 min series values are stored in the columns prefixed with 'x'.
val_col_min1 = [col for col in df_min1.columns if col.startswith('x')]

df_min1_tensor = prepare_chunks(df_min1, val_col_min1, n_channels=1)
df_min1_c0 = df_min1_tensor[0]  # first chunk only (at most 512 time steps per row)

# get embeddings in two subsets due to time/ efficiency
# prepare_chunks returns a LIST of chunk tensors, so the row split must be
# taken from the first-chunk tensor; slicing the list itself would hand the
# model a list of chunks instead of a batch of rows.
df_min1_s1 = df_min1_c0[:3500]
df_min1_s2 = df_min1_c0[3500:]

with torch.no_grad():
    output_df_min1_s1 = model(x_enc=df_min1_s1)
    output_df_min1_s2 = model(x_enc=df_min1_s2)

min1_embeddings_s1 = output_df_min1_s1.embeddings
min1_embeddings_s2 = output_df_min1_s2.embeddings

min1_embeddings = torch.cat([min1_embeddings_s1, min1_embeddings_s2], dim=0)
# Convert the 1 min embeddings themselves (not the embeddings of another
# resolution) so the table below matches the rows of df_min1.
min1_embeddings_np = min1_embeddings.detach().cpu().numpy()

# combine with demographic columns
demo_cols = ['seqn', 'gender', 'age', 'race', 'education', 'marital_status', 'pir', 'bmi']
min1_data_all = df_min1.reset_index(drop=True)
min1_embeddings_df = pd.concat([min1_data_all[demo_cols], pd.DataFrame(min1_embeddings_np)], axis=1)

min1_original_cols = demo_cols
min1_embedding_cols = list(range(min1_embeddings_np.shape[1]))
min1_embeddings_df.columns = min1_original_cols + min1_embedding_cols
# Rows are numbered starting at 1.
min1_embeddings_df.index = range(1, len(min1_embeddings_df) + 1)

In [None]:
# save
# index=False keeps this file's layout identical to the 5 min and 20 min
# exports; otherwise the artificial 1..n row index is written as an extra
# unnamed first column.
min1_embeddings_df.to_csv("./002_data/embeddings_min1_moment1024.csv", index=False)


In [None]:
# ----------------------------------------------------------------------
# Embeddings for the 20 min aggregated data
# ----------------------------------------------------------------------

model.init()

# The series values are stored in the columns whose name starts with 'time'.
val_col_min20 = list(filter(lambda name: name.startswith('time'), df_min20.columns))

# Only the first chunk returned by prepare_chunks is embedded.
df_min20_tensor = prepare_chunks(df_min20, val_col_min20, n_channels=1)
df_min20_c0 = df_min20_tensor[0]

# The rows are embedded in two subsets (split at row 3500) for time/efficiency.
df_min20_s1, df_min20_s2 = df_min20_c0[:3500], df_min20_c0[3500:]

with torch.no_grad():
    output_df_min20_s1 = model(x_enc=df_min20_s1)
    output_df_min20_s2 = model(x_enc=df_min20_s2)

min20_embeddings_s1 = output_df_min20_s1.embeddings
min20_embeddings_s2 = output_df_min20_s2.embeddings

# Stack both subsets back together in row order and convert to NumPy.
min20_embeddings = torch.cat((min20_embeddings_s1, min20_embeddings_s2), dim=0)
min20_embeddings_np = min20_embeddings.detach().cpu().numpy()

# Put the demographic columns to the left of the embedding dimensions.
demo_cols = ['seqn', 'gender', 'age', 'race', 'education', 'marital_status', 'pir', 'bmi']
min20_data_all = df_min20.reset_index(drop=True)
min20_original_cols = demo_cols
min20_embedding_cols = list(range(min20_embeddings_np.shape[1]))

min20_embeddings_df = pd.concat(
    [min20_data_all[demo_cols], pd.DataFrame(min20_embeddings_np)],
    axis=1,
)
min20_embeddings_df.columns = min20_original_cols + min20_embedding_cols
# Rows are numbered starting at 1.
min20_embeddings_df.index = range(1, len(min20_embeddings_df) + 1)


In [None]:
# Write the 20 min embedding table to CSV without the row index.
min20_embeddings_df.to_csv(
    path_or_buf="./002_data/embeddings_min20_moment1024.csv",
    index=False,
)