In [None]:
import pickle
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
import re

# NOTE(review): `seed` is not read by any cell visible in this notebook (the
# searches receive the `seeds` list instead) — confirm it is still needed.
seed = 2023

In [None]:
import torch

# Run on the GPU when PyTorch reports one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
from nlpsig_networks.scripts.swnu_network_functions import (
    swnu_network_hyperparameter_search,
    obtain_SWNUNetwork_input,
)

In [None]:
# Parent directory of every `results_output` CSV path passed to the searches below.
output_dir = "rumours_output"
# exist_ok=True keeps re-runs idempotent and removes the check-then-create race
# of the separate isdir()/makedirs() pair.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [None]:
# Executes the loader script from this notebook. Later cells use df_rumours,
# sbert_embeddings, y_data, output_dim and split_ids without defining them, so
# they are presumably created by this script — TODO confirm against the script.
%run load_sbert-embeddings.py

In [None]:
# Preview the first rows of df_rumours as a quick check that loading worked.
df_rumours.head()

## SWNU Network

In [None]:
# Extra features handed to the search, with one standardisation entry listed
# per feature (presumably matched by position — TODO confirm in the nlpsig docs).
features = ["time_encoding", "timeline_index"]
standardise_method = ["z_score", None]

# Both flags are enabled and forwarded to the search through `kwargs`.
include_features_in_path = include_features_in_input = True

In [None]:
# Training budget and model-selection settings
# (`patience` is presumably the early-stopping patience — TODO confirm).
num_epochs = 100
patience = 3
validation_metric = "f1"

# Loss settings; `gamma` is also embedded in each results CSV file name.
loss = "focal"
gamma = 2

# Hyperparameter grid explored by each search.
dimensions = [15]
# One hidden size per candidate, each paired with the value 3.
swnu_hidden_dim_sizes_and_sig_depths = [([hidden_dim], 3) for hidden_dim in (12, 10)]
# Two equally sized layers per candidate.
ffn_hidden_dim_sizes = [[width, width] for width in (32, 128, 512)]
dropout_rates = [0.1]
learning_rates = [5e-4, 3e-4, 1e-4]

# Seeds passed to the search (distinct from the notebook-level `seed`).
seeds = [1, 12, 123]

In [None]:
# Keyword arguments shared by every hyperparameter search in this notebook;
# each search cell adds history_lengths, dim_reduce_methods and results_output.
kwargs = {
    "num_epochs": num_epochs,
    # df_rumours, sbert_embeddings, y_data and output_dim are not defined in the
    # visible cells — presumably provided by the %run loader script (TODO confirm).
    "df": df_rumours,
    "id_column": "timeline_id",
    "label_column": "label",
    "embeddings": sbert_embeddings,
    "y_data": y_data,
    "output_dim": output_dim,
    # Search grid and fixed model options.
    "dimensions": dimensions,
    "log_signature": True,
    "pooling": "signature",
    "swnu_hidden_dim_sizes_and_sig_depths": swnu_hidden_dim_sizes_and_sig_depths,
    "ffn_hidden_dim_sizes": ffn_hidden_dim_sizes,
    "dropout_rates": dropout_rates,
    "learning_rates": learning_rates,
    "BiLSTM": True,
    "seeds": seeds,
    # Loss settings and compute device.
    "loss": loss,
    "gamma": gamma,
    "device": device,
    # Additional feature handling, as configured earlier in the notebook.
    "features": features,
    "standardise_method": standardise_method,
    "include_features_in_path": include_features_in_path,
    "include_features_in_input": include_features_in_input,
    # split_ids is also expected to come from the loader script — TODO confirm.
    "split_ids": split_ids,
    "k_fold": True,
    "patience": patience,
    "validation_metric": validation_metric,
    "verbose": False,
}

# w=5

In [None]:
# Window length w for this section: the search cell below uses it as its only
# history length and embeds it in the results CSV file name.
size = 5

## Gaussian random projection (GRP)

In [None]:
# Hyperparameter search for history length `size`, with Gaussian random
# projection as the only dimension-reduction method. The first two returned
# values are kept; the last two are bound to throwaway names.
swnu_network_grp_kfold_5, best_swnu_network_grp_kfold_5, _, __ = (
    swnu_network_hyperparameter_search(
        dim_reduce_methods=["gaussian_random_projection"],
        history_lengths=[size],
        results_output=f"{output_dir}/swnu_network_grp_focal_{gamma}_{size}_kfold.csv",
        **kwargs,
    )
)

In [None]:
# Average every numeric result column for each hyperparameter combination.
# numeric_only=True is explicit because pandas >= 2.0 no longer skips
# non-numeric columns in GroupBy.mean() by default; the frame presumably holds
# some (the best-results frame has an array-valued "f1_scores" column) — TODO confirm.
swnu_network_grp_kfold_5.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

In [None]:
# Show the second value returned by the w=5 search above.
best_swnu_network_grp_kfold_5

In [None]:
# Mean of the "f1" column of the best-results frame for w=5.
best_swnu_network_grp_kfold_5["f1"].mean()

In [None]:
# Mean of the "precision" column of the best-results frame for w=5.
best_swnu_network_grp_kfold_5["precision"].mean()

In [None]:
# Mean of the "recall" column of the best-results frame for w=5.
best_swnu_network_grp_kfold_5["recall"].mean()

In [None]:
# Stack the "f1_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_5["f1_scores"]), axis=0)

In [None]:
# Stack the "precision_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_5["precision_scores"]), axis=0)

In [None]:
# Stack the "recall_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_5["recall_scores"]), axis=0)

# w=11

In [None]:
# Window length w for this section: the search cell below uses it as its only
# history length and embeds it in the results CSV file name.
size = 11

## Gaussian random projection (GRP)

In [None]:
# Hyperparameter search for history length `size`, with Gaussian random
# projection as the only dimension-reduction method. The first two returned
# values are kept; the last two are bound to throwaway names.
swnu_network_grp_kfold_11, best_swnu_network_grp_kfold_11, _, __ = (
    swnu_network_hyperparameter_search(
        dim_reduce_methods=["gaussian_random_projection"],
        history_lengths=[size],
        results_output=f"{output_dir}/swnu_network_grp_focal_{gamma}_{size}_kfold.csv",
        **kwargs,
    )
)

In [None]:
# Average every numeric result column for each hyperparameter combination.
# numeric_only=True is explicit because pandas >= 2.0 no longer skips
# non-numeric columns in GroupBy.mean() by default; the frame presumably holds
# some (the best-results frame has an array-valued "f1_scores" column) — TODO confirm.
swnu_network_grp_kfold_11.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

In [None]:
# Show the second value returned by the w=11 search above.
best_swnu_network_grp_kfold_11

In [None]:
# Mean of the "f1" column of the best-results frame for w=11.
best_swnu_network_grp_kfold_11["f1"].mean()

In [None]:
# Mean of the "precision" column of the best-results frame for w=11.
best_swnu_network_grp_kfold_11["precision"].mean()

In [None]:
# Mean of the "recall" column of the best-results frame for w=11.
best_swnu_network_grp_kfold_11["recall"].mean()

In [None]:
# Stack the "f1_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_11["f1_scores"]), axis=0)

In [None]:
# Stack the "precision_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_11["precision_scores"]), axis=0)

In [None]:
# Stack the "recall_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_11["recall_scores"]), axis=0)

# w=20

In [None]:
# Window length w for this section: the search cell below uses it as its only
# history length and embeds it in the results CSV file name.
size = 20

## Gaussian random projection (GRP)

In [None]:
# Hyperparameter search for history length `size`, with Gaussian random
# projection as the only dimension-reduction method. The first two returned
# values are kept; the last two are bound to throwaway names.
swnu_network_grp_kfold_20, best_swnu_network_grp_kfold_20, _, __ = (
    swnu_network_hyperparameter_search(
        dim_reduce_methods=["gaussian_random_projection"],
        history_lengths=[size],
        results_output=f"{output_dir}/swnu_network_grp_focal_{gamma}_{size}_kfold.csv",
        **kwargs,
    )
)

In [None]:
# Average every numeric result column for each hyperparameter combination.
# numeric_only=True is explicit because pandas >= 2.0 no longer skips
# non-numeric columns in GroupBy.mean() by default; the frame presumably holds
# some (the best-results frame has an array-valued "f1_scores" column) — TODO confirm.
swnu_network_grp_kfold_20.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

In [None]:
# Show the second value returned by the w=20 search above.
best_swnu_network_grp_kfold_20

In [None]:
# Mean of the "f1" column of the best-results frame for w=20.
best_swnu_network_grp_kfold_20["f1"].mean()

In [None]:
# Mean of the "precision" column of the best-results frame for w=20.
best_swnu_network_grp_kfold_20["precision"].mean()

In [None]:
# Mean of the "recall" column of the best-results frame for w=20.
best_swnu_network_grp_kfold_20["recall"].mean()

In [None]:
# Stack the "f1_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_20["f1_scores"]), axis=0)

In [None]:
# Stack the "precision_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_20["precision_scores"]), axis=0)

In [None]:
# Stack the "recall_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_20["recall_scores"]), axis=0)

# w=35

In [None]:
# Window length w for this section: the search cell below uses it as its only
# history length and embeds it in the results CSV file name.
size = 35

## Gaussian random projection (GRP)

In [None]:
# Hyperparameter search for history length `size`, with Gaussian random
# projection as the only dimension-reduction method. The first two returned
# values are kept; the last two are bound to throwaway names.
swnu_network_grp_kfold_35, best_swnu_network_grp_kfold_35, _, __ = (
    swnu_network_hyperparameter_search(
        dim_reduce_methods=["gaussian_random_projection"],
        history_lengths=[size],
        results_output=f"{output_dir}/swnu_network_grp_focal_{gamma}_{size}_kfold.csv",
        **kwargs,
    )
)

In [None]:
# Average every numeric result column for each hyperparameter combination.
# numeric_only=True is explicit because pandas >= 2.0 no longer skips
# non-numeric columns in GroupBy.mean() by default; the frame presumably holds
# some (the best-results frame has an array-valued "f1_scores" column) — TODO confirm.
swnu_network_grp_kfold_35.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

In [None]:
# Show the second value returned by the w=35 search above.
best_swnu_network_grp_kfold_35

In [None]:
# Mean of the "f1" column of the best-results frame for w=35.
best_swnu_network_grp_kfold_35["f1"].mean()

In [None]:
# Mean of the "precision" column of the best-results frame for w=35.
best_swnu_network_grp_kfold_35["precision"].mean()

In [None]:
# Mean of the "recall" column of the best-results frame for w=35.
best_swnu_network_grp_kfold_35["recall"].mean()

In [None]:
# Stack the "f1_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_35["f1_scores"]), axis=0)

In [None]:
# Stack the "precision_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_35["precision_scores"]), axis=0)

In [None]:
# Stack the "recall_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_35["recall_scores"]), axis=0)

# w=80

In [None]:
# Window length w for this section: the search cell below uses it as its only
# history length and embeds it in the results CSV file name.
size = 80

## Gaussian random projection (GRP)

In [None]:
# Hyperparameter search for history length `size`, with Gaussian random
# projection as the only dimension-reduction method. The first two returned
# values are kept; the last two are bound to throwaway names.
swnu_network_grp_kfold_80, best_swnu_network_grp_kfold_80, _, __ = (
    swnu_network_hyperparameter_search(
        dim_reduce_methods=["gaussian_random_projection"],
        history_lengths=[size],
        results_output=f"{output_dir}/swnu_network_grp_focal_{gamma}_{size}_kfold.csv",
        **kwargs,
    )
)

In [None]:
# Average every numeric result column for each hyperparameter combination.
# numeric_only=True is explicit because pandas >= 2.0 no longer skips
# non-numeric columns in GroupBy.mean() by default; the frame presumably holds
# some (the best-results frame has an array-valued "f1_scores" column) — TODO confirm.
swnu_network_grp_kfold_80.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

In [None]:
# Show the second value returned by the w=80 search above.
best_swnu_network_grp_kfold_80

In [None]:
# Mean of the "f1" column of the best-results frame for w=80.
best_swnu_network_grp_kfold_80["f1"].mean()

In [None]:
# Mean of the "precision" column of the best-results frame for w=80.
best_swnu_network_grp_kfold_80["precision"].mean()

In [None]:
# Mean of the "recall" column of the best-results frame for w=80.
best_swnu_network_grp_kfold_80["recall"].mean()

In [None]:
# Stack the "f1_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_80["f1_scores"]), axis=0)

In [None]:
# Stack the "precision_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_80["precision_scores"]), axis=0)

In [None]:
# Stack the "recall_scores" entries into one array and average over axis 0,
# i.e. across the stacked entries.
np.mean(np.stack(best_swnu_network_grp_kfold_80["recall_scores"]), axis=0)