# Feature Creation for Time Series Data
- Turn a time-series graph (one snapshot per time step) into a single static graph with no time dimension.
- Encode the temporal information as node features.

In [1]:
import torch
# Show which PyTorch version this notebook is running against.
print(torch.__version__)

2.6.0+cu118


In [2]:
import pickle, sys, os

In [3]:
data_folder = '/home/twp/lantian/Conformalized-Network-Source-Detection/SD-STGCN/dataset/highSchool/data/SIR'
# Show every entry of data_folder next to its position in the listing.
files = os.listdir(data_folder)
for idx in range(len(files)):
    print(f"{idx}: {files[idx]}")


0: SIR_nsrc10_Rzero2.5_beta0.25_gamma0_T30_ls21200_nf16_entire.pickle
1: SIR_nsrc7_Rzero43.44_beta0.25_gamma0.15_T30_ls21200_nf16_entire.pickle
2: SIR_nsrc14_Rzero43.44_beta0.25_gamma0.15_T30_ls21200_nf16_torchentire.pickle
3: SIR_nsrc14_Rzero43.44_beta0.25_gamma0.15_T30_ls21200_nf16_entire.pickle
4: SIR_nsrc1_Rzero2.5_beta0.25_gamma0_T30_ls21200_nf16_entire.pickle
5: split
6: SIR_nsrc1_Rzero43.44_beta0.25_gamma0.15_T30_ls21200_nf16_entire.pickle
7: SIR_nsrc10_Rzero2.5_beta0.25_gamma0_T30_ls21200_nf16_torchentire.pickle
8: SIR_nsrc1_Rzero43.44_beta0.25_gamma0.15_T30_ls21200_nf16_torchentire.pickle
9: SIR_nsrc1_Rzero2.5_beta0.25_gamma0_T30_ls21200_nf16_torchentire.pickle
10: SIR_nsrc7_Rzero43.44_beta0.25_gamma0.15_T30_ls21200_nf16_torchentire.pickle
11: SIR_nsrc7_Rzero2.5_beta0.25_gamma0_T30_ls21200_nf16_entire.pickle
12: SIR_nsrc1_Rzero2.5_beta0.25_gamma0_T30_ls21200_nf16_torchentire_with_node_features.pickle
13: SIR_nsrc7_Rzero2.5_beta0.25_gamma0_T30_ls21200_nf16_torchentire.pickle


In [4]:
# Select the dataset by name instead of by position in `files`: os.listdir()
# returns entries in arbitrary order, and the listing shifts whenever a file is
# added to the folder, so a positional index can silently pick another dataset.
data_file = 'SIR_nsrc7_Rzero2.5_beta0.25_gamma0_T30_ls21200_nf16_torchentire.pickle'
#data_file = 'SIR_nsrc1_Rzero43.44_beta0.25_gamma0.15_T30_ls21200_nf16_torchentire.pickle'

data_path = os.path.join(data_folder, data_file)
# Load the data.
# NOTE: pickle.load can execute arbitrary code; only open files from a trusted source.
with open(data_path, 'rb') as f:
    data = pickle.load(f)


In [5]:
# Inspect the top-level object returned by pickle.load.
print(f"Data type: {type(data)}")  # container type of the loaded object
print(f"Data length: {len(data)}") # number of elements in that container

Data type: <class 'tuple'>
Data length: 3


In [6]:
import numpy as np

# First tuple element: the state time series of every simulation.
i = 0
item = data[i]
for line in (
    f"Item {i}: Time series",
    f"  Type: {type(item)}",
    f"  Length: {len(item)} ",
    f"  Shape: {item.shape} ([number of simulations], [number of snapshots], [number of nodes])",
):
    print(line)

# Look at a bounded random sample (with replacement) of the flattened tensor
# instead of scanning every entry, so the check stays fast on a large tensor.
numel = item.numel()
num_samples = min(10000, numel)
indices = torch.randint(0, numel, (num_samples,))
sampled = item.view(-1)[indices]

# Distinct state codes seen in the sample.
unique_digits = torch.unique(sampled)
print(f"  Unique digits in tensor (sampled): {unique_digits.tolist()} (0 = susceptible, 1 = infected, 2 = recovered)")


Item 0: Time series
  Type: <class 'torch.Tensor'>
  Length: 21200 
  Shape: torch.Size([21200, 30, 774]) ([number of simulations], [number of snapshots], [number of nodes])
  Unique digits in tensor (sampled): [0, 1] (0 = susceptible, 1 = infected, 2 = recovered)


In [7]:
# Second tuple element: per-node source indicator for every simulation.
i = 1
item = data[i]
for line in (
    f"Item {i}: Node labels/ground truth (whether the node is a source or not)",
    f"  Type: {type(item)}",
    f"  Length: {len(item)} ",
    f"  Shape: {item.shape} ([number of simulations], [number of nodes])",
):
    print(line)

unique_digits = torch.unique(item)
print(f"  Unique digits in tensor: {unique_digits.tolist()}")

# Tally how many simulations share each number of sources.
num_simulations = item.shape[0]
num_sources = item.sum(dim=1)  # row sum = number of sources in that simulation
source_counts = torch.unique(num_sources, return_counts=True)
distinct_counts, simulations_per_count = source_counts
for count, num_simulations_with_count in zip(distinct_counts.tolist(), simulations_per_count.tolist()):
    print(f"{num_simulations_with_count} simulations have {count} sources")


Item 1: Node labels/ground truth (whether the node is a source or not)
  Type: <class 'torch.Tensor'>
  Length: 21200 
  Shape: torch.Size([21200, 774]) ([number of simulations], [number of nodes])
  Unique digits in tensor: [0, 1]
21200 simulations have 7 sources


In [8]:
import numpy as np

# Third tuple element: the per-simulation skip index.
i = 2
item = data[i]
for line in (
    f"Item {i}: skip idx: the first N snapshots to ignore",
    f"  Type: {type(item)}",
    f"  Length: {len(item)} ",
):
    print(line)

# Distinct skip values present across all simulations.
unique_digits, counts = np.unique(item, return_counts=True)
print(f"  Unique digits in list: {unique_digits.tolist()}")

# Use the first simulation's entry as the representative skip value.
skip_idx = item[0]
first_skip = skip_idx[0]
print(f"  Skip idx: {first_skip} (the first {first_skip} snapshots to ignore)")

Item 2: skip idx: the first N snapshots to ignore
  Type: <class 'list'>
  Length: 21200 
  Unique digits in list: [1]
  Skip idx: 1 (the first 1 snapshots to ignore)


# convert time series to features
- a function `timeseries_to_features` takes a time series, a skip value, and an optional `nf` value (default is 16). This function converts the time series into a feature vector `[t0, t1, t2]` as follows:
    - The time series is a list of integers of length 30, where each value can be 0 (susceptible), 1 (infected), or 2 (recovered). The values are non-decreasing over time.
    - Ignore the first `skip` elements of the time series, then consider the next `nf` elements.
    - Count the number of 0s, 1s, and 2s in this segment:
        - `t0` = number of 0s (susceptible)
        - `t1` = number of 1s (infected)
        - `t2` = number of 2s (recovered)
    - Return the feature vector `[t0, t1, t2]`.


In [9]:
def timeseries_to_features(time_series, skip, nf=16):
    """
    Convert one node's state time series into the feature vector [t0, t1, t2].

    The first `skip` entries are ignored and the following `nf` entries form
    the window that is counted. If fewer than `nf` entries remain after
    `skip`, only the available ones are counted.

    Args:
        time_series (list, tuple, np.ndarray or torch.Tensor): 1-D sequence of
            state codes (0 = susceptible, 1 = infected, 2 = recovered).
        skip (int): The number of initial snapshots to ignore.
        nf (int): The number of snapshots to consider (default is 16).

    Returns:
        list: A feature vector [t0, t1, t2], where:
              t0 = number of 0s,
              t1 = number of 1s,
              t2 = number of 2s
              inside the selected window.
    """
    # Select the counting window.
    window = time_series[skip:skip + nf]

    # Arrays and tensors have no `.count` method, so normalise the window to a
    # plain Python list before counting.
    if hasattr(window, "tolist"):
        window = window.tolist()
    else:
        window = list(window)

    # One count per state code, in the order 0, 1, 2.
    return [window.count(state) for state in (0, 1, 2)]

In [10]:
import random

time_series = data[0]
# Print the time series and derived features of a few sample nodes
# from 3 randomly chosen simulations.
random_sims = random.sample(range(time_series.shape[0]), 3)

for sim in random_sims:
    labels = data[1][sim].tolist()  # source indicator of every node in this simulation
    source_nodes = [node for node, label in enumerate(labels) if label == 1]
    non_source_nodes = [node for node, label in enumerate(labels) if label == 0]

    # The skip entry may be a one-element list or a bare value; resolve it once per simulation.
    skip_entry = data[2][sim]
    skip = skip_entry[0] if isinstance(skip_entry, list) else skip_entry

    print(f"\nSimulation {sim}:")

    # Collect (label, node) pairs: one source node (if any) and up to two non-source nodes.
    selected_nodes = []
    if source_nodes:
        selected_nodes.append(("Source", random.choice(source_nodes)))
    else:
        print("  No source nodes in this simulation.")

    # Never show more than two non-source nodes, even when only three exist.
    selected_non_sources = random.sample(non_source_nodes, min(2, len(non_source_nodes)))
    selected_nodes.extend(("Non-source", node) for node in selected_non_sources)

    for kind, node in selected_nodes:
        ts = time_series[sim, :, node].tolist()
        print(f"  {kind} node {node} time series: \n{ts}")
        features = timeseries_to_features(ts, skip)
        print(f"    Features: {features}")


Simulation 15234:
  Source node 104 time series: 
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    Features: [0, 16, 0]
  Non-source node 202 time series: 
[0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    Features: [3, 13, 0]
  Non-source node 134 time series: 
[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    Features: [2, 14, 0]

Simulation 19599:
  Source node 464 time series: 
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    Features: [0, 16, 0]
  Non-source node 647 time series: 
[0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    Features: [2, 14, 0]
  Non-source node 736 time series: 
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    Features: [4, 12, 0]

Simulation 2086:
  Source node 256 time series: 
[1, 1, 1

In [11]:
import torch

def convert_timeseries_to_features(data, nf=16, normalize=False):
    """
    Convert the time series in the 'data' tuple into per-node state counts.

    For every simulation the first `skip` snapshots are dropped, the next `nf`
    snapshots form the counting window, and for each node the number of
    snapshots spent in state 0, 1 and 2 is recorded. If fewer than `nf`
    snapshots remain after `skip`, only the available ones are counted.

    Args:
        data (tuple): The dataset tuple containing:
                      - data[0]: Time series tensor of shape
                        [num_simulations, num_snapshots, num_nodes].
                      - data[2]: One skip value per simulation. Each entry may
                        be a one-element list/tuple (e.g. [1]), a bare
                        integer, or a single-element array/tensor.
        nf (int): The number of snapshots to consider (default is 16).
        normalize (bool): Whether to normalize the output values by dividing by nf.

    Returns:
        torch.Tensor: A float32 tensor of shape [num_simulations, 3, num_nodes], where
                      axis 1 holds the feature vector [t0, t1, t2] of each node
                      (counts of states 0, 1 and 2, divided by nf when
                      `normalize` is True).
    """
    time_series = data[0]
    skip_values = data[2]

    num_simulations, _, num_nodes = time_series.shape
    node_features = torch.zeros((num_simulations, 3, num_nodes), dtype=torch.float32)

    for sim_idx in range(num_simulations):
        # Accept both wrapped ([1]) and bare (1) skip entries.
        skip = skip_values[sim_idx]
        if isinstance(skip, (list, tuple)):
            skip = skip[0]
        if hasattr(skip, "item"):
            # numpy scalars / single-element arrays and tensors -> Python number
            skip = skip.item()
        skip = int(skip)

        # Counting window for this simulation: shape [<= nf, num_nodes].
        window = time_series[sim_idx, skip:skip + nf, :]

        # Per-node count of each state code over the window.
        for state in range(3):
            node_features[sim_idx, state, :] = (window == state).sum(dim=0)

    # Turn raw counts into fractions of the nf-snapshot window if requested.
    if normalize:
        node_features /= nf

    return node_features

In [12]:
# Build the per-node state-count features; normalize=True divides the counts by nf.
node_features = convert_timeseries_to_features(data, normalize=True)

In [13]:
# node_features is laid out as [simulation, feature, node]: axis 1 holds the
# 3 features and axis 2 the nodes, so label the sizes accordingly.
print(f"Node features shape: {node_features.shape[0]} simulations, {node_features.shape[1]} features per node, {node_features.shape[2]} nodes")
# Show the features of the first 5 nodes for (up to) the first 5 simulations.
for i in range(min(5, node_features.shape[0])):
    print(f"Simulation {i} features: {node_features[i, :, :5]}")  # Print first 5 nodes for brevity

Node features shape: 21200 simulations, 3 nodes, 774 features per node
Simulation 0 features: tensor([[0.1250, 0.0625, 0.1250, 0.1250, 0.0625],
        [0.8750, 0.9375, 0.8750, 0.8750, 0.9375],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]])
Simulation 1 features: tensor([[0.1250, 0.0625, 0.0625, 0.1250, 0.0625],
        [0.8750, 0.9375, 0.9375, 0.8750, 0.9375],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]])
Simulation 2 features: tensor([[0.0625, 0.1250, 0.1250, 0.0625, 0.1250],
        [0.9375, 0.8750, 0.8750, 0.9375, 0.8750],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]])
Simulation 3 features: tensor([[0.1875, 0.1250, 0.1250, 0.1875, 0.1875],
        [0.8125, 0.8750, 0.8750, 0.8125, 0.8125],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]])
Simulation 4 features: tensor([[0.1875, 0.1875, 0.1250, 0.1250, 0.1875],
        [0.8125, 0.8125, 0.8750, 0.8750, 0.8125],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000]])


In [14]:
# Replace data[0] with node_features and save as a new pickle file
# Replace data[0] with node_features and save as a new pickle file
new_data = (node_features, data[1], data[2])

# Build the new file name by inserting the suffix before the extension.
# splitext always yields a name different from data_file, so the source
# dataset cannot be overwritten even if its name does not contain '.pickle'.
stem, ext = os.path.splitext(data_file)
new_data_file = f"{stem}_with_node_features{ext}"
new_data_path = os.path.join(data_folder, new_data_file)

# Save the new data tuple
with open(new_data_path, 'wb') as f_out:
    pickle.dump(new_data, f_out)

print(f"Saved new data with node features to {new_data_path}")

Saved new data with node features to /home/twp/lantian/Conformalized-Network-Source-Detection/SD-STGCN/dataset/highSchool/data/SIR/SIR_nsrc7_Rzero2.5_beta0.25_gamma0_T30_ls21200_nf16_torchentire_with_node_features.pickle


# load and assign features to graph

In [82]:
import networkx as nx

graph_folder = '/home/twp/lantian/Conformalized-Network-Source-Detection/SD-STGCN/dataset/highSchool/data/graph'
graph_name = 'highSchool.edgelist'

# Read the contact network from its edge list.
# NOTE(review): read_edgelist keeps node labels as strings unless `nodetype`
# is given; confirm that matches how node ids are used when features are attached.
graph_path = os.path.join(graph_folder, graph_name)
graph = nx.read_edgelist(graph_path)

# Report the size of the graph.
for quantity, value in (("nodes", graph.number_of_nodes()), ("edges", graph.number_of_edges())):
    print(f"Number of {quantity}: {value}")

# Check whether any node already carries a 'feature' attribute.
has_node_features = bool(nx.get_node_attributes(graph, 'feature'))
print("Node features found." if has_node_features else "No node features found.")

Number of nodes: 774
Number of edges: 7992
No node features found.
