In [None]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not read by any visible cell; the searches below are
# given the `seeds` list instead. Confirm whether this is still needed.
seed = 2023

In [None]:
import torch

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Bare name as last expression so the notebook shows which device was chosen.
device

In [None]:
from nlpsig_networks.scripts.seqsignet_functions import seqsignet_hyperparameter_search

In [None]:
# Directory that receives the CSV files named in the `results_output` arguments below.
output_dir = "rumours_output"
# exist_ok=True makes the cell idempotent without a separate isdir() check,
# which also removes the check-then-create race.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [None]:
# Run the loader script; %run makes the script's top-level variables available
# in the notebook namespace. Later cells use df_rumours, sbert_embeddings, y_data,
# output_dim and split_ids, none of which is defined in a visible cell —
# presumably this script creates them (TODO confirm).
%run load_sbert-embeddings.py

In [None]:
# Quick preview of the first rows of df_rumours (presumably a pandas DataFrame
# created by the loader script — confirm).
df_rumours.head()

## Seq-Sig-Net

In [None]:
# Extra features handed to the search, and the standardisation applied to them.
# presumably the two lists are paired by position ("time_encoding" -> "z_score",
# "timeline_index" -> None) — TODO confirm against nlpsig_networks.
features = ["time_encoding", "timeline_index"]
standardise_method = ["z_score", None]

# Where the features are injected: into the path, but not into the input.
include_features_in_path, include_features_in_input = True, False

In [None]:
# --- training schedule ---
num_epochs, patience = 100, 5
validation_metric = "f1"
seeds = [1, 12, 123]

# --- loss ---
# gamma is presumably the focal-loss focusing parameter — confirm.
loss, gamma = "focal", 2

# --- hyperparameter grid (each list holds the candidate values) ---
dimensions = [15]
# Each entry is (hidden dimension sizes, signature depth), going by the variable name.
swnu_hidden_dim_sizes_and_sig_depths = [([12], 3), ([10], 4)]
lstm_hidden_dim_sizes = [384]
ffn_hidden_dim_sizes = [[width, width] for width in (256, 512)]
dropout_rates = [0.1, 0.2]
learning_rates = [1e-3, 1e-4, 5e-4]

# --- architecture flag ---
bidirectional = True

In [None]:
# Keyword arguments shared by every `seqsignet_hyperparameter_search` call below;
# each call adds only shift / window_size / n, the dimension-reduction method and
# the results path.
# NOTE(review): df_rumours, sbert_embeddings, y_data, output_dim and split_ids are
# not defined in any visible cell — presumably they come from the
# `%run load_sbert-embeddings.py` cell (TODO confirm).
kwargs = {
    "num_epochs": num_epochs,
    "df": df_rumours,
    "id_column": "timeline_id",
    "label_column": "label",
    "embeddings": sbert_embeddings,
    "y_data": y_data,
    "output_dim": output_dim,
    "dimensions": dimensions,
    "log_signature": True,
    "swnu_hidden_dim_sizes_and_sig_depths": swnu_hidden_dim_sizes_and_sig_depths,
    "lstm_hidden_dim_sizes": lstm_hidden_dim_sizes,
    "ffn_hidden_dim_sizes": ffn_hidden_dim_sizes,
    "dropout_rates": dropout_rates,
    "learning_rates": learning_rates,
    # Use the config-cell flag rather than a hard-coded True, so that changing
    # `bidirectional` in the config cell actually takes effect.
    "BiLSTM": bidirectional,
    "seeds": seeds,
    "loss": loss,
    "gamma": gamma,
    "device": device,
    "features": features,
    "standardise_method": standardise_method,
    "include_features_in_path": include_features_in_path,
    "include_features_in_input": include_features_in_input,
    "split_ids": split_ids,
    "k_fold": True,
    "patience": patience,
    "validation_metric": validation_metric,
    "verbose": False,
}

# history_length=11

In [None]:
# Windowing parameters for this section.
# 3 * 3 + (5 - 3) = 11 agrees with the heading "history_length=11", assuming
# history_length = shift * n + (window_size - shift) — TODO confirm.
shift, window_size, n = 3, 5, 3

## UMAP

In [None]:
# Hyperparameter search using UMAP as the dimension-reduction method, with the
# shift / window_size / n set above plus the shared `kwargs`.
# The first two returned objects are kept; the last two are discarded.
# NOTE(review): binding `_` / `__` rebinds IPython's output-history names.
seqsignet_network_umap_kfold_11, best_seqsignet_network_umap_kfold_11, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["umap"],
        shift=shift,
        window_size=window_size,
        n=n,
        # presumably the CSV path the search writes its results to — confirm.
        results_output=(
            f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
        ),
        **kwargs,
    )
)

In [None]:
seqsignet_network_umap_kfold_11

In [None]:
best_seqsignet_network_umap_kfold_11["f1"].mean()

In [None]:
best_seqsignet_network_umap_kfold_11["precision"].mean()

In [None]:
best_seqsignet_network_umap_kfold_11["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_umap_kfold_11["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_11["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_11["recall_scores"]).mean(axis=0)

## GRP

In [None]:
# Hyperparameter search using Gaussian random projection as the dimension-reduction
# method, with the shift / window_size / n set above plus the shared `kwargs`.
# The first two returned objects are kept; the last two are discarded.
# NOTE(review): binding `_` / `__` rebinds IPython's output-history names.
seqsignet_network_grp_kfold_11, best_seqsignet_network_grp_kfold_11, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["gaussian_random_projection"],
        shift=shift,
        window_size=window_size,
        n=n,
        # presumably the CSV path the search writes its results to — confirm.
        results_output=(
            f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
        ),
        **kwargs,
    )
)

In [None]:
seqsignet_network_grp_kfold_11

In [None]:
best_seqsignet_network_grp_kfold_11["f1"].mean()

In [None]:
best_seqsignet_network_grp_kfold_11["precision"].mean()

In [None]:
best_seqsignet_network_grp_kfold_11["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_grp_kfold_11["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_grp_kfold_11["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_grp_kfold_11["recall_scores"]).mean(axis=0)

# history_length=20

In [None]:
# Windowing parameters for this section.
# 3 * 6 + (5 - 3) = 20 agrees with the heading "history_length=20", assuming
# history_length = shift * n + (window_size - shift) — TODO confirm.
shift, window_size, n = 3, 5, 6

## UMAP

In [None]:
# Hyperparameter search using UMAP as the dimension-reduction method, with the
# shift / window_size / n set above plus the shared `kwargs`.
# The first two returned objects are kept; the last two are discarded.
# NOTE(review): binding `_` / `__` rebinds IPython's output-history names.
seqsignet_network_umap_kfold_20, best_seqsignet_network_umap_kfold_20, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["umap"],
        shift=shift,
        window_size=window_size,
        n=n,
        # presumably the CSV path the search writes its results to — confirm.
        results_output=(
            f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
        ),
        **kwargs,
    )
)

## UMAP (results)

In [None]:
seqsignet_network_umap_kfold_20

In [None]:
best_seqsignet_network_umap_kfold_20["f1"].mean()

In [None]:
best_seqsignet_network_umap_kfold_20["precision"].mean()

In [None]:
best_seqsignet_network_umap_kfold_20["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_umap_kfold_20["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_20["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_20["recall_scores"]).mean(axis=0)

## GRP

In [None]:
# Hyperparameter search using Gaussian random projection as the dimension-reduction
# method, with the shift / window_size / n set above plus the shared `kwargs`.
# The first two returned objects are kept; the last two are discarded.
# NOTE(review): binding `_` / `__` rebinds IPython's output-history names.
seqsignet_network_grp_kfold_20, best_seqsignet_network_grp_kfold_20, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["gaussian_random_projection"],
        shift=shift,
        window_size=window_size,
        n=n,
        # presumably the CSV path the search writes its results to — confirm.
        results_output=(
            f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
        ),
        **kwargs,
    )
)

In [None]:
seqsignet_network_grp_kfold_20

In [None]:
best_seqsignet_network_grp_kfold_20["f1"].mean()

In [None]:
best_seqsignet_network_grp_kfold_20["precision"].mean()

In [None]:
best_seqsignet_network_grp_kfold_20["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_grp_kfold_20["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_grp_kfold_20["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_grp_kfold_20["recall_scores"]).mean(axis=0)

# history_length=35

In [None]:
# Windowing parameters for this section.
# 3 * 11 + (5 - 3) = 35 agrees with the heading "history_length=35", assuming
# history_length = shift * n + (window_size - shift) — TODO confirm.
shift, window_size, n = 3, 5, 11

## UMAP

In [None]:
# Hyperparameter search using UMAP as the dimension-reduction method, with the
# shift / window_size / n set above plus the shared `kwargs`.
# The first two returned objects are kept; the last two are discarded.
# NOTE(review): binding `_` / `__` rebinds IPython's output-history names.
seqsignet_network_umap_kfold_35, best_seqsignet_network_umap_kfold_35, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["umap"],
        shift=shift,
        window_size=window_size,
        n=n,
        # presumably the CSV path the search writes its results to — confirm.
        results_output=(
            f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
        ),
        **kwargs,
    )
)

In [None]:
seqsignet_network_umap_kfold_35

In [None]:
best_seqsignet_network_umap_kfold_35["f1"].mean()

In [None]:
best_seqsignet_network_umap_kfold_35["precision"].mean()

In [None]:
best_seqsignet_network_umap_kfold_35["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_umap_kfold_35["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_35["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_35["recall_scores"]).mean(axis=0)

## GRP

In [None]:
# Hyperparameter search using Gaussian random projection as the dimension-reduction
# method, with the shift / window_size / n set above plus the shared `kwargs`.
# The first two returned objects are kept; the last two are discarded.
# NOTE(review): binding `_` / `__` rebinds IPython's output-history names.
seqsignet_network_grp_kfold_35, best_seqsignet_network_grp_kfold_35, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["gaussian_random_projection"],
        shift=shift,
        window_size=window_size,
        n=n,
        # presumably the CSV path the search writes its results to — confirm.
        results_output=(
            f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
        ),
        **kwargs,
    )
)

In [None]:
# These two expressions share one cell, and a notebook renders only the value of
# a cell's last expression. Display the first object explicitly so it is not
# silently dropped. `display` is provided by the IPython kernel without an import.
display(seqsignet_network_grp_kfold_35)

# Mean of the "f1" entries of the second object returned by the GRP search (best_*).
best_seqsignet_network_grp_kfold_35["f1"].mean()

In [None]:
best_seqsignet_network_grp_kfold_35["precision"].mean()

In [None]:
best_seqsignet_network_grp_kfold_35["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_grp_kfold_35["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_grp_kfold_35["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_grp_kfold_35["recall_scores"]).mean(axis=0)

# history_length=80

In [None]:
# Windowing parameters for this section.
# 3 * 26 + (5 - 3) = 80 agrees with the heading "history_length=80", assuming
# history_length = shift * n + (window_size - shift) — TODO confirm.
shift, window_size, n = 3, 5, 26

## UMAP

In [None]:
# Hyperparameter search using UMAP as the dimension-reduction method, with the
# shift / window_size / n set above plus the shared `kwargs`.
# The first two returned objects are kept; the last two are discarded.
# NOTE(review): binding `_` / `__` rebinds IPython's output-history names.
seqsignet_network_umap_kfold_80, best_seqsignet_network_umap_kfold_80, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["umap"],
        shift=shift,
        window_size=window_size,
        n=n,
        # presumably the CSV path the search writes its results to — confirm.
        results_output=(
            f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
        ),
        **kwargs,
    )
)

In [None]:
seqsignet_network_umap_kfold_80

In [None]:
best_seqsignet_network_umap_kfold_80["f1"].mean()

In [None]:
best_seqsignet_network_umap_kfold_80["precision"].mean()

In [None]:
best_seqsignet_network_umap_kfold_80["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_umap_kfold_80["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_80["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_80["recall_scores"]).mean(axis=0)

## GRP

In [None]:
# Hyperparameter search using Gaussian random projection as the dimension-reduction
# method, with the shift / window_size / n set above plus the shared `kwargs`.
# The first two returned objects are kept; the last two are discarded.
# NOTE(review): binding `_` / `__` rebinds IPython's output-history names.
seqsignet_network_grp_kfold_80, best_seqsignet_network_grp_kfold_80, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["gaussian_random_projection"],
        shift=shift,
        window_size=window_size,
        n=n,
        # presumably the CSV path the search writes its results to — confirm.
        results_output=(
            f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
        ),
        **kwargs,
    )
)

In [None]:
seqsignet_network_grp_kfold_80

In [None]:
best_seqsignet_network_grp_kfold_80["f1"].mean()

In [None]:
best_seqsignet_network_grp_kfold_80["precision"].mean()

In [None]:
best_seqsignet_network_grp_kfold_80["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_grp_kfold_80["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_grp_kfold_80["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_grp_kfold_80["recall_scores"]).mean(axis=0)