In [1]:
# Notebook-level random seed.
# NOTE(review): none of the cells visible here pass `seed` on — the
# obtain_SWNUNetwork_input call further down omits it and therefore uses the
# function's own default (seed=42). Confirm which value is intended.
seed = 2023
from nlpsig_networks.scripts.swnu_network_functions import (
    obtain_SWNUNetwork_input
)

## Rumours

In [275]:
from __future__ import annotations

import nlpsig
from nlpsig_networks.pytorch_utils import _get_timestamp, SaveBestModel, set_seed
from nlpsig_networks.swnu_network import SWNUNetwork
from nlpsig_networks.scripts.implement_model import implement_model
from typing import Iterable
import torch
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import os


def obtain_SWNUNetwork_input(
    method: str,
    dimension: int,
    df: pd.DataFrame,
    id_column: str,
    label_column: str,
    embeddings: np.array,
    k: int,
    features: list[str] | str | None = None,
    standardise_method: list[str] | str | None = None,
    include_features_in_path: bool = False,
    seed: int = 42,
    path_indices : list | np.array | None = None
) -> dict[str, torch.tensor | int]:
    """
    Build the input for SWNUNetwork from a dataframe and its embeddings.

    The embeddings are (optionally) dimension reduced, a zero-padded path is
    constructed with `nlpsig.PrepareData` from the history of each item
    (last `k` posts, including the current embedding), and the result is
    converted with `get_torch_path_for_SWNUNetwork`.

    Parameters
    ----------
    method : str
        Dimension reduction method, passed to `nlpsig.DimReduce`.
    dimension : int
        Number of components to reduce the embeddings to. If it equals
        `embeddings.shape[1]`, no reduction is performed.
    df : pd.DataFrame
        Dataframe passed to `nlpsig.PrepareData`.
    id_column : str
        Name of the id column, passed to `nlpsig.PrepareData`.
    label_column : str
        Name of the label column, passed to `nlpsig.PrepareData`.
    embeddings : np.array
        Full embeddings; the second axis is the embedding dimension.
    k : int
        Number of last posts used when padding the path.
    features : list[str] | str | None, optional
        Feature name(s) forwarded to `.pad`, by default None.
    standardise_method : list[str] | str | None, optional
        Standardisation method(s) forwarded to `.pad`, by default None.
    include_features_in_path : bool, optional
        Forwarded to `get_torch_path_for_SWNUNetwork`, by default False.
    seed : int, optional
        Used as `random_state` in the dimension reduction, by default 42.
    path_indices : list | np.array | None, optional
        If given, used to index `array_padded`, `embeddings` and
        `embeddings_reduced` before the torch input is built, by default None.

    Returns
    -------
    dict[str, torch.tensor | int]
        Dictionary with keys "x_data", "input_channels" and "num_features".
    """
    # only reduce when the requested width differs from what we were given
    if dimension != embeddings.shape[1]:
        reducer = nlpsig.DimReduce(method=method, n_components=dimension)
        reduced = reducer.fit_transform(embeddings, random_state=seed)
    else:
        reduced = embeddings

    prepared = nlpsig.PrepareData(
        df,
        id_column=id_column,
        label_column=label_column,
        embeddings=embeddings,
        embeddings_reduced=reduced,
    )

    # pad by history over the last k posts with zero padding, using the
    # dimension reduced embeddings and including the current embedding
    prepared.pad(
        pad_by="history",
        zero_padding=True,
        method="k_last",
        k=k,
        features=features,
        standardise_method=standardise_method,
        embeddings="dim_reduced",
        include_current_embedding=True,
    )

    # restrict every stored array to the requested subset of paths
    if path_indices is not None:
        for attr in ("array_padded", "embeddings", "embeddings_reduced"):
            setattr(prepared, attr, getattr(prepared, attr)[path_indices])

    # features and the full (not dimension reduced) embeddings always go
    # into the FFN input
    torch_input = prepared.get_torch_path_for_SWNUNetwork(
        include_features_in_path=include_features_in_path,
        include_features_in_input=True,
        include_embedding_in_input=True,
        reduced_embeddings=False,
    )
    return torch_input

In [276]:
# Execute the embedding-loading script inside this kernel's namespace.
# NOTE(review): presumably this is what defines `df_rumours` and
# `sbert_embeddings` used by the cells below — confirm against the script.
%run load_sbert-embeddings.py

In [277]:
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


In [278]:
len(df_rumours)

5568

In [279]:
import signatory

In [280]:
# Build the SWNUNetwork input for the rumours data: reduce the embeddings to
# 3 dimensions with UMAP and form each path from the last k=10 posts.
# The two listed features are also put into the path
# (include_features_in_path=True), consistent with the 5 path channels
# reported in the cells below.
# NOTE(review): `seed` is not passed, so the function default (42) is used
# rather than the notebook-level `seed = 2023` — confirm this is intended.
x_data = obtain_SWNUNetwork_input(
    method="umap",
    dimension=3,
    df=df_rumours,
    id_column='timeline_id',
    label_column='label',
    embeddings=sbert_embeddings,
    k=10,
    features=['time_encoding', 'timeline_index'],
    standardise_method=None,
    include_features_in_path=True,
)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


In [281]:
x_data.keys()

dict_keys(['x_data', 'input_channels', 'num_features'])

In [282]:
x_data["x_data"]["path"].shape

torch.Size([5568, 10, 5])

In [283]:
x_data["x_data"]["features"].shape

torch.Size([5568, 386])

In [284]:
x_data["input_channels"]

5

In [285]:
x_data["num_features"]

386

In [286]:
path = x_data["x_data"]["path"]

In [287]:
path.shape

torch.Size([5568, 10, 5])

In [288]:
# Depth-2 log-signatures of every path. stream=True keeps the whole stream of
# log-signatures (one per expanding prefix of the path) instead of only the
# final one, so a length-10 path yields 9 entries (see the shape below).
signatures = signatory.logsignature(path, 2, stream=True)

In [289]:
signatures.shape

torch.Size([5568, 9, 15])

Apply convolution to this path.

In [290]:
path[0]

tensor([[ 2.0148e+03,  1.0000e+00, -7.2296e+00,  2.7356e+00,  8.0972e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],
       dtype=torch.float64)

In [291]:
signatures[0]

tensor([[-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,

In [292]:
path[2]

tensor([[ 2.0148e+03,  1.0000e+00, -7.2296e+00,  2.7356e+00,  8.0972e+00],
        [ 2.0148e+03,  2.0000e+00, -7.4468e+00,  3.6928e+00,  9.9551e+00],
        [ 2.0148e+03,  3.0000e+00, -9.0931e+00,  1.2844e+01,  9.0442e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],
       dtype=torch.float64)

In [293]:
signatures[2]

tensor([[ 3.0124e-06,  1.0000e+00, -2.1716e-01,  9.5729e-01,  1.8579e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 4.5345e-06,  2.0000e+00, -1.8635e+00,  1.0108e+01,  9.4692e-01,
          7.4518e-07, -2.3145e-06,  1.3055e-05, -2.7860e-06, -7.1461e-01,
          4.0969e+00, -1.3844e+00, -2.0557e-01,  1.6283e+00, -8.9369e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          2.0148e+03, -1.8773e+03,  1.0183e+04,  9.5393e+02,  5.5832e+00,
          6.4156e+00, -9.0082e+00, -3.4197e+01,  5.7501e+00, -4.8567e+01],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          2.0148e+03, -1.8773e+03,  1.0183e+04,  9.5393e+02,  5.5832e+00,
          6.4156e+00, -9.0082e+00, -3.4197e+01,  5.7501e+00, -4.8567e+01],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          2.0148e+03, -1.8773e+03,

`True` values in the mask mark the positions that `MultiheadAttention` should ignore when computing attention.

In [294]:
signatures[0][1:].shape

torch.Size([8, 15])

In [295]:
signatures[0][:-1].shape

torch.Size([8, 15])

In [296]:
def obtain_signatures_mask(signatures: torch.Tensor) -> torch.Tensor:
    """
    Build a boolean mask flagging repeated rows in streamed signatures.

    A position is marked ``True`` when its entire row of channels is equal to
    the row at the previous position of the same batch element. This assumes
    that padding was applied from below, so that padded positions repeat the
    row before them. The first position has no predecessor and is always
    ``False``.

    Parameters
    ----------
    signatures : torch.Tensor
        Tensor of dimensions [batch, length, channels].

    Returns
    -------
    torch.Tensor
        Bool tensor of dimensions [batch, length], on the same device as
        `signatures`.
    """
    # compare each row with the row above it (for each batch) and keep the
    # cases where the entire row is equal to the previous row
    equal_to_previous_row = torch.all(
        torch.eq(signatures[:, 1:], signatures[:, :-1]), dim=2
    )
    # the first row is never masked; create this column on the same device
    # as the input so that torch.cat does not fail for non-CPU tensors
    false_tensor = torch.full(
        (signatures.shape[0], 1),
        False,
        dtype=torch.bool,
        device=signatures.device,
    )
    # return bool tensor of dimension [batch, length]
    return torch.cat((false_tensor, equal_to_previous_row), dim=1)

In [297]:
mask = obtain_signatures_mask(signatures)

In [298]:
mask[0]

tensor([False,  True,  True,  True,  True,  True,  True,  True,  True])

In [299]:
# Alternative mask computed directly from the zero-padded path: a position is
# treated as padding when every channel is exactly zero. Checking all channels
# (rather than whether their sum is zero) avoids flagging a real row whose
# values happen to cancel out. The last position is dropped so that the length
# matches the streamed signatures (one fewer than the path length).
mask_ = torch.all(path == 0, dim=2)[:, :-1]

In [300]:
mask_[0]

tensor([False,  True,  True,  True,  True,  True,  True,  True,  True])

In [301]:
torch.equal(mask, mask_)

True

In [147]:
mask.dtype

torch.bool

In [148]:
mask.shape

torch.Size([5568, 9])

In [149]:
path[0]

tensor([[ 2.0148e+03,  1.0000e+00, -7.2296e+00,  2.7356e+00,  8.0972e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],
       dtype=torch.float64)

In [150]:
signatures[0]

tensor([[-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,

In [151]:
mask[0]

tensor([False,  True,  True,  True,  True,  True,  True,  True,  True])

In [108]:
path[1]

tensor([[ 2.0148e+03,  1.0000e+00, -7.2296e+00,  2.7356e+00,  8.0972e+00],
        [ 2.0148e+03,  2.0000e+00, -7.4468e+00,  3.6928e+00,  9.9551e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],
       dtype=torch.float64)

In [152]:
signatures[1]

tensor([[ 3.0124e-06,  1.0000e+00, -2.1716e-01,  9.5729e-01,  1.8579e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          1.0074e+03, -2.1876e+02,  9.6438e+02,  1.8716e+03,  3.5062e+00,
         -8.8913e-01, -3.1197e+00, -3.1634e+00, -5.8367e+00, -1.3345e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          1.0074e+03, -2.1876e+02,  9.6438e+02,  1.8716e+03,  3.5062e+00,
         -8.8913e-01, -3.1197e+00, -3.1634e+00, -5.8367e+00, -1.3345e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          1.0074e+03, -2.1876e+02,  9.6438e+02,  1.8716e+03,  3.5062e+00,
         -8.8913e-01, -3.1197e+00, -3.1634e+00, -5.8367e+00, -1.3345e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          1.0074e+03, -2.1876e+02,

In [153]:
mask[1]

tensor([False, False,  True,  True,  True,  True,  True,  True,  True])

In [156]:
import torch.nn as nn
# Single-head self-attention layer over the streamed log-signatures.
# embed_dim is taken from the channel dimension of `signatures` (15 for the
# depth-2 log-signature of the 5-channel path above) instead of being
# hard-coded, so the layer stays valid if the path channels or depth change.
# .double() matches the float64 dtype of the path / signatures.
mha = nn.MultiheadAttention(
    embed_dim=signatures.shape[-1],
    num_heads=1,
    batch_first=True,
).double()

In [157]:
# Self-attention over the streamed log-signatures (query = key = value).
# `mask` is passed as key_padding_mask, so the positions that are True in it
# are ignored as keys in the attention calculation.
attn_output, attn_output_weights = mha(signatures.double(), signatures.double(), signatures.double(), key_padding_mask=mask)

In [158]:
i=0
print(signatures[i])
# print(mask[i])
print(attn_output[i])

tensor([[-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          0.0000e+00,  0.0000e+00,

In [160]:
i=1
print(signatures[i])
print(mask[i])
print(attn_output[i])

tensor([[ 3.0124e-06,  1.0000e+00, -2.1716e-01,  9.5729e-01,  1.8579e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          1.0074e+03, -2.1876e+02,  9.6438e+02,  1.8716e+03,  3.5062e+00,
         -8.8913e-01, -3.1197e+00, -3.1634e+00, -5.8367e+00, -1.3345e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          1.0074e+03, -2.1876e+02,  9.6438e+02,  1.8716e+03,  3.5062e+00,
         -8.8913e-01, -3.1197e+00, -3.1634e+00, -5.8367e+00, -1.3345e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          1.0074e+03, -2.1876e+02,  9.6438e+02,  1.8716e+03,  3.5062e+00,
         -8.8913e-01, -3.1197e+00, -3.1634e+00, -5.8367e+00, -1.3345e+00],
        [-2.0148e+03, -1.0000e+00,  7.2296e+00, -2.7356e+00, -8.0972e+00,
          1.0074e+03, -2.1876e+02,

In [118]:
# Inspect sample i: its padded path, its mask and its attention output.
# NOTE(review): the captured output below shows a 10-element mask and a
# 5-column attn_output, which does not match the current `mask` (shape
# [5568, 9]) or the 15-channel signatures fed to `mha`. Together with the
# out-of-order execution count this output looks stale — re-run the notebook
# from a fresh kernel to confirm.
i=2
print(path[i])
print(mask[i])
print(attn_output[i])

tensor([[ 2.0148e+03,  1.0000e+00, -7.2296e+00,  2.7356e+00,  8.0972e+00],
        [ 2.0148e+03,  2.0000e+00, -7.4468e+00,  3.6928e+00,  9.9551e+00],
        [ 2.0148e+03,  3.0000e+00, -9.0931e+00,  1.2844e+01,  9.0442e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],
       dtype=torch.float64)
tensor([False, False, False,  True,  True,  True,  True,  True,  True,  True])
tensor([[ 369.3173,  610.7261, -434.8319,  221.3471,  672.8438],
        [ 369.3173,  610.7261, -434.8319,  221.3471,  672.8438],
        [ 36

In [119]:
# Inspect sample i: its padded path, its mask and its attention output.
# NOTE(review): as captured, the output below has a 10-element mask and a
# 5-column attn_output, unlike the current `mask` (shape [5568, 9]) and the
# 15-channel signatures fed to `mha` — this looks like output from an earlier
# execution; re-run from a fresh kernel to confirm.
i=4
print(path[i])
print(mask[i])
print(attn_output[i])

tensor([[ 2.0148e+03,  1.0000e+00, -7.2296e+00,  2.7356e+00,  8.0972e+00],
        [ 2.0148e+03,  2.0000e+00, -7.4468e+00,  3.6928e+00,  9.9551e+00],
        [ 2.0148e+03,  3.0000e+00, -9.0931e+00,  1.2844e+01,  9.0442e+00],
        [ 2.0148e+03,  4.0000e+00, -6.7662e+00,  2.6486e+00,  8.3875e+00],
        [ 2.0148e+03,  5.0000e+00, -5.3710e+00,  7.5870e+00,  1.2499e+01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00]],
       dtype=torch.float64)
tensor([False, False, False, False, False,  True,  True,  True,  True,  True])
tensor([[ 369.3173,  610.7261, -434.8319,  221.3471,  672.8438],
        [ 369.3173,  610.7261, -434.8319,  221.3471,  672.8438],
        [ 36