Import Libraries

In [1]:
import numpy as np
import pandas as pd
import json
import re
import pickle
import datetime

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


Model Specifics

In [3]:
# ================================
# model specifics
# Single configuration dictionary for this run. The trailing comment on each
# entry lists the values the notebook author documented as valid.
model_specifics = {
    "data": "Rumoureval",
    "global_embedding_tp": "SBERT",  # options: SBERT, BERT_cls, BERT_mean, BERT_max
    "dimensionality_reduction_tp": "umap",  # options: ppapca, ppapcappa, umap
    "dimensionality_reduction_components": 15,  # options: any int number between 1 and embedding dimensions
    "dimensionality_reduction": True,  # options: True, False
    "time_injection_history_tp": None,  # options: timestamp, None
    "time_injection_post_tp": "timestamp",  # options: timestamp, None
    "signature_dimensions": 3,  # options: any int number larger than 1
    "post_embedding_tp": "windowsigsentence",  # options: sentence, reduced, sentencelstm (1), sentencesiglstm (2), windowsigsentence (3)
    "feature_combination_method": "concatenation",  # options: concatenation, gated_addition, gated_concatenation
    "signature_tp": "log",  # options: log, sig
    "augmentation_tp": "Conv1d",  # options: Conv1d, CNN
    "loss_function": "focal",  # options: focal, cbfocal
    "reduced_network_components": 10,  # any integer greater than 1
    "k_window": 5,  # window of looking back k-1 posts + current
    "classifier_name": "Seq-Sig-Net",  # earlier name: 'Conv1d3kernel13channelSigLSTMSig-LSTM'; options: FFN2hidden (any future classifiers added)
    "classes_num": "2class",  # NOTE(review): value used here is 2class, but the documented options were "3class (5class to be added in the future)" — confirm the valid set
}

Read conversations data

In [4]:
# Location of the conversation threads; the loaded object is indexed by split
# name ("train", "dev", "test") further down the notebook.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
f_name = "/storage/ttseriotou/rumour_eval/data/conversations.json"

# JSON text is UTF-8 by specification, so pin the encoding instead of relying
# on the platform's default locale.
with open(f_name, "r", encoding="utf-8") as f:
    data = json.load(f)

Data Conversion for the Longitudinal Task

In [5]:
# Flatten a conversation thread into a linear timeline: the source post first,
# followed by the replies ordered by their "created_at" value.
def tree2timeline(conversation):
    """Return the thread as a list of ``(id, created_at, stance)`` tuples.

    The source post is always the first entry. Replies follow, sorted by
    their ``created_at`` value; replies with equal ``created_at`` are ordered
    by comparing their full tuples.
    """
    source = conversation["source"]
    head = (source["id"], source["created_at"], source["stance"])

    reply_rows = [
        (reply["id"], reply["created_at"], reply["stance"])
        for reply in conversation["replies"]
    ]
    # Sort on (created_at, whole tuple) so ties fall back to the tuple itself.
    reply_rows.sort(key=lambda row: (row[1], row))

    return [head] + reply_rows


# Per-split containers: the raw (id, created_at, stance) timelines and the
# derived (id, created_at, switch code) timelines, one list entry per thread.
stance_timelines = {"dev": [], "train": [], "test": []}
switch_timelines = {"dev": [], "train": [], "test": []}
# Every running support-minus-deny balance seen, across all threads (debug aid).
check = []
# Corpus-level counters reported by the print statements below.
count_switch_threads = 0
all_support_switches = 0
all_oppose_switches = 0
count_threads = 0

for subset in list(data.keys()):
    count_threads += len(data[subset])
    for conv in data[subset]:
        timeline = tree2timeline(conv)
        stance_timelines[subset].append(timeline)
        # Running tallies for this thread; "query" and "deny" both count as deny.
        support = 0
        deny = 0
        old_sum = 0
        switch_events = []
        for i, s in enumerate(timeline):
            if s[2] == "support":
                support = support + 1
            elif s[2] == "query" or s[2] == "deny":
                deny = deny + 1

            # Balance after this post; any other stance leaves it unchanged.
            new_sum = support - deny
            check.append(new_sum)

            # The first post (i == 0) never matches a branch below, so it is
            # always labelled 0.
            if i != 0 and old_sum == 0 and new_sum != 0:
                # Balance leaves zero on the negative side: a switch towards
                # opposing the claim starts (code -1)
                if new_sum < 0:
                    switch_events.append((s[0], s[1], -1))
                # Balance leaves zero on the positive side: a switch towards
                # supporting the claim starts (code 1)
                elif new_sum > 0:
                    switch_events.append((s[0], s[1], 1))
            elif (
                i != 0
                and old_sum < 0
                and new_sum < 0
                and -1 in [x[2] for x in switch_events]
            ):
                # Balance stays negative and a -1 start was already recorded in
                # this thread: the switch towards opposition continues (code -2)
                switch_events.append((s[0], s[1], -2))
            elif (
                i != 0
                and old_sum > 0
                and new_sum > 0
                and 1 in [x[2] for x in switch_events]
            ):
                # Balance stays positive and a 1 start was already recorded in
                # this thread: the switch towards support continues (code 2)
                switch_events.append((s[0], s[1], 2))

            else:
                # Everything else (including a return to a zero balance): no switch
                switch_events.append((s[0], s[1], 0))
            old_sum = new_sum

        # Only switch *starts* (codes 1 and -1) are counted, not continuations.
        support_switch = [x[2] for x in switch_events].count(1)
        oppose_switch = [x[2] for x in switch_events].count(-1)

        if support_switch + oppose_switch > 0:
            count_switch_threads = count_switch_threads + 1
            all_support_switches += support_switch
            all_oppose_switches += oppose_switch

        switch_timelines[subset].append(switch_events)
print(
    "Ratio of threads with switches / total number of threads: {}/{}".format(
        count_switch_threads, count_threads
    )
)
print(
    "# of switches of stance from opposition to support of claim: ",
    all_support_switches,
)
print(
    "# of switches of stance from support to opposition of claim: ", all_oppose_switches
)

Ratio of threads with switches / total number of threads: 155/325
# of switches of stance from opposition to support of claim:  103
# of switches of stance from support to opposition of claim:  132


Count all posts in the train, dev and test splits

In [6]:
# Posts per split = summed length of that split's per-thread event lists.
posts_train_n = sum(len(events) for events in switch_timelines["train"])
posts_dev_n = sum(len(events) for events in switch_timelines["dev"])
posts_test_n = sum(len(events) for events in switch_timelines["test"])

print("Total of train/dev/test posts:", posts_train_n, posts_dev_n, posts_test_n)

Total of train/dev/test posts: 4238 281 1049


Read already formed embeddings

In [7]:
# read data
sbert_file = "/storage/ttseriotou/rumour_eval/data/longrumoureval_sbert/sbert.pkl"
with open(sbert_file, "rb") as g:
    emb_data = pickle.load(g)

Conversion of labels to binary

In [8]:
def simplify_label(y):
    """Collapse a switch code to a binary label.

    Any non-zero code (-2, -1, 1, 2) becomes 1; a zero code is returned
    unchanged.
    """
    return 1 if y != 0 else y


# Replace every thread, in place, by a copy whose third tuple field has been
# passed through simplify_label; ids and timestamps are left untouched.
for subset in ("train", "dev", "test"):
    switch_timelines[subset][:] = [
        [(post_id, stamp, simplify_label(code)) for (post_id, stamp, code) in thread]
        for thread in switch_timelines[subset]
    ]

Combine all embeddings and get a df

In [9]:
def embedding_df(emb_data):
    """Flatten the per-thread embedding structure into one DataFrame.

    Parameters
    ----------
    emb_data : dict
        Mapping with the keys "train", "dev" and "test". Each value is an
        iterable of threads; a thread has a "source" post and a list of
        "replies", and every post provides an "id" and a 1-D "emb" vector.

    Returns
    -------
    pandas.DataFrame
        One row per post (source first, then its replies, split by split)
        with the columns ``id`` (float), ``subset`` and ``e1`` ... ``eN``
        (float), where N is the length of the embedding vectors. An input
        without any post yields an empty frame with ``id`` and ``subset``.
    """

    def _record(post, subset):
        # id, split name and embedding are packed into a single 1-D array;
        # numpy promotes the mixed content to strings, which is why the
        # numeric columns are cast back to float further down.
        return np.concatenate(
            (
                np.array(post["id"]).reshape(1),
                np.array(subset).reshape(1),
                post["emb"],
            )
        )

    row_list = []
    for subset in ["train", "dev", "test"]:
        for thread in emb_data[subset]:
            row_list.append(_record(thread["source"], subset))
            row_list.extend(_record(tweet, subset) for tweet in thread["replies"])

    # Take the embedding width from the rows that were actually built rather
    # than from emb_data["train"][0]["replies"][0], which does not exist when
    # the first training thread has no replies.
    n_dims = len(row_list[0]) - 2 if row_list else 0
    emb_cols = ["e" + str(i + 1) for i in range(n_dims)]

    df_emb = pd.DataFrame(row_list, columns=["id", "subset"] + emb_cols)
    # NOTE(review): 64-bit floats cannot represent every integer above 2**53,
    # so very large ids may lose their last digits here. The cast is kept
    # because the rest of the notebook joins on float ids as well.
    df_emb["id"] = df_emb["id"].astype("float")
    if emb_cols:
        df_emb[emb_cols] = df_emb[emb_cols].astype("float")

    return df_emb


# Build the embedding table from all three splits, then show its shape and
# first rows as a quick sanity check.
df_emb = embedding_df(emb_data)
print(df_emb.shape)
df_emb.head()

(5568, 386)


Unnamed: 0,id,subset,e1,e2,e3,e4,e5,e6,e7,e8,...,e375,e376,e377,e378,e379,e380,e381,e382,e383,e384
0,5.249902e+17,train,0.064823,0.009375,-0.033364,-0.040282,0.08319,0.043792,-0.012862,-0.038167,...,-0.021066,0.047782,-0.041925,-0.029498,0.028939,-0.030646,0.00524,-0.011905,0.018394,0.001854
1,5.250038e+17,train,0.018231,0.079517,0.032132,0.026828,0.00789,0.042798,-0.037635,0.013557,...,0.033683,0.015816,-0.07775,-0.000398,0.026768,-0.050967,-0.027182,0.021025,-0.02862,0.008729
2,5.249927e+17,train,0.053822,0.009492,-0.009654,-0.032107,0.098263,0.069386,0.010227,-0.036979,...,-0.023394,0.075511,-0.03601,-0.033392,0.034138,-0.032089,0.003894,-0.037643,0.02218,0.011833
3,5.249908e+17,train,-0.031863,0.055065,-0.024028,-0.020351,-0.030556,-0.030321,0.0173,0.00252,...,0.060842,0.04316,-0.102785,-0.049965,0.001174,0.078667,0.044994,0.104832,0.009009,0.09331
4,5.249906e+17,train,-0.02926,-0.053905,-0.039863,0.023316,0.133505,0.001998,0.001952,-0.012355,...,0.091279,0.08823,-0.047634,-0.044625,0.036614,-0.095384,0.040821,-0.102738,-0.03456,0.093606


Dimensionality Reduction

In [10]:
# dimensionality reduction
from dimensionality_reduction import DimensionalityReduction

reduction = DimensionalityReduction(
    method=model_specifics["dimensionality_reduction_tp"],
    components=model_specifics["dimensionality_reduction_components"],
)
# Only the embedding columns (e1, e2, ...) are reduced. The pattern is a raw
# string so that "\w" reaches the regex engine instead of being treated as an
# (invalid) string escape.
# NOTE(review): df_emb holds the train, dev and test rows together, so the
# reducer is fitted on all splits at once — confirm this is intended.
embeddings_reduced = reduction.fit_transform(
    np.array(df_emb[[c for c in df_emb.columns if re.match(r"^e\w*[0-9]", c)]])
)

print(embeddings_reduced.shape)

# form dataframe: original columns followed by the reduced components d1..dK;
# the index is reset so both parts align row by row.
df_emb_reduced = pd.concat(
    [
        df_emb.reset_index(drop=True),
        pd.DataFrame(
            embeddings_reduced,
            columns=["d" + str(i + 1) for i in range(embeddings_reduced.shape[1])],
        ),
    ],
    axis=1,
)
print(df_emb_reduced.shape)

(5568, 15)
(5568, 401)


Data Classes and Preparation for Training

In [11]:
# TRAIN/DEV/TEST
# Hours in a non-leap year. time_fraction no longer reads this value (it works
# out the real length of each year); the name is kept for any other code that
# still refers to it.
total_year_hours = 365 * 24


def time_fraction(x):
    """Express a naive datetime as a fractional year, e.g. 2015.5 for mid-2015.

    Parameters
    ----------
    x : datetime.datetime (or compatible, e.g. a naive pandas.Timestamp)
        The point in time to convert.

    Returns
    -------
    float
        ``x.year`` plus the elapsed share of that year. The share is computed
        against the actual length of the year (365 or 366 days), so the result
        always stays below ``x.year + 1`` and is monotonic across year
        boundaries; dividing by a fixed 365 days pushed late-December values
        of leap years past the start of the following year.
    """
    # Day-of-year of 31 December is the number of days in the year.
    days_in_year = datetime.date(x.year, 12, 31).timetuple().tm_yday
    elapsed_hours = (
        x - datetime.datetime(x.year, 1, 1, 0)
    ).total_seconds() / 3600.0
    return x.year + elapsed_hours / (days_in_year * 24)


def get_data_format(df_emb_x, switch_timelines, subset="train", k=3):
    """Build fixed-length history windows for every post of one split.

    Parameters
    ----------
    df_emb_x : pandas.DataFrame
        Post table with an ``id`` column, embedding columns named
        ``e<number>`` and (optionally) reduced columns named ``d<number>``.
    switch_timelines : dict
        Maps a split name to a list of threads; each thread is a list of
        ``(id, timestamp, label)`` tuples.
    subset : str
        Split to process.
    k : int
        Window length: the current post plus up to ``k - 1`` preceding posts
        of the same thread.

    Returns
    -------
    df_padded : numpy.ndarray
        Shape ``(n_posts, k, 3 + n_reduced)``. Along the last axis the
        entries are id, label, timestamp and the reduced columns. Windows
        shorter than ``k`` are filled at the end with padding rows.
    df : pandas.DataFrame
        All posts of the split (id, label, fractional-year timestamp,
        embedding and reduced columns, ``timeline_id``, ...), threads stacked
        in order with their per-thread index preserved.

    Also prints the shapes of ``df`` and ``df_padded``.
    """
    # Raw-string patterns: "\w" must reach the regex engine untouched.
    emb_cols = [c for c in df_emb_x.columns if re.match(r"^e\w*[0-9]", c)]
    red_cols = [c for c in df_emb_x.columns if re.match(r"^d\w*[0-9]", c)]
    window_cols = ["id", "label", "timestamp"] + red_cols

    # Padding row: id 100, every other field 0. Its width follows the window
    # columns directly, so it always matches the rows it is stacked with.
    zeros = np.concatenate(
        (np.array([100]), np.repeat(0, len(window_cols) - 1)),
        axis=0,
    )

    sample_list = []
    # The empty frame comes first so the output keeps the column order
    # id, label, timestamp, e*, d*, followed by any remaining columns.
    frames = [
        pd.DataFrame([], columns=["id", "label", "timestamp"] + emb_cols + red_cols)
    ]

    for e, thread in enumerate(switch_timelines[subset]):
        df_thread = pd.DataFrame(thread, columns=["id", "timestamp", "label"])
        df_thread = df_thread.reindex(columns=["id", "label", "timestamp"])

        df_thread["timeline_id"] = str(e)
        # NOTE(review): joining on float ids can merge distinct very large
        # integer ids (floats are exact only up to 2**53) — confirm the ids in
        # use are safe.
        df_thread["id"] = df_thread["id"].astype("float")
        df_thread["timestamp"] = pd.to_datetime(df_thread["timestamp"])
        df_thread["timestamp"] = df_thread["timestamp"].map(
            lambda t: time_fraction(t.replace(tzinfo=None))
        )

        df_thread = df_thread.merge(df_emb_x, on="id", how="left")
        frames.append(df_thread)

        df_window = df_thread[window_cols]

        # One window per post: it ends at that post and reaches back at most
        # k rows within the same thread.
        for end_i in range(1, len(thread) + 1):
            start_i = max(0, end_i - k)
            df_add = df_window[start_i:end_i]

            # padding
            padding_n = k - (end_i - start_i)
            zeros_tile = np.tile(zeros, (padding_n, 1))
            df_padi = np.concatenate((df_add, zeros_tile), axis=0)[np.newaxis, :, :]
            sample_list.append(df_padi)

    # Concatenate once instead of growing the frame inside the loop.
    df = pd.concat(frames)
    print(df.shape)

    # append all samples together
    df_padded = np.concatenate((sample_list), axis=0)
    print(df_padded.shape)

    return df_padded, df


# GET SETS FOR TRAIN/DEV/TEST
# The same call is made for each split, in train -> dev -> test order; every
# call returns the padded windows together with the flat post table.
(df_padded_train, df_train), (df_padded_dev, df_dev), (df_padded_test, df_test) = [
    get_data_format(
        df_emb_reduced, switch_timelines, subset=split, k=model_specifics["k_window"]
    )
    for split in ("train", "dev", "test")
]

(4238, 404)
(4238, 5, 18)
(281, 404)
(281, 5, 18)
(1049, 404)
(1049, 5, 18)


Preparing data for Seq-Sig-Net

In [12]:
# torch conversion and removal of label and time dimensions for now
def get_time_currentpost(df_padded, df, model_specifics):
    """Assemble the model input tensor for one split.

    Parameters
    ----------
    df_padded : numpy.ndarray
        Array of shape ``(n_posts, k, 3 + n_path)``. Index 2 of the last axis
        is read as the timestamp and everything from index 3 on as the path
        features (indices 0 and 1 are not used here).
    df : pandas.DataFrame
        One row per post, in the same order as ``df_padded``, with embedding
        columns ``e<number>`` / ``d<number>`` and a ``timeline_id`` column.
        **Modified in place** when ``post_embedding_tp`` is
        ``"windowsigsentence"``: boolean ``timeline_match<lag>`` columns are
        added.
    model_specifics : dict
        Reads ``k_window``, ``time_injection_post_tp`` and
        ``post_embedding_tp``.

    Returns
    -------
    x_data : torch.Tensor
        ``(n_posts, channels, k)``, channels being path features, then the
        normalised timestamp (if enabled), then the post embedding repeated
        along the window axis (if enabled). For ``"windowsigsentence"`` a
        fourth axis of size 9 is appended: index 0 is the post's own window
        and index j (1..8) the window of the post 3*j rows earlier, zeroed
        when that row belongs to a different timeline.
    path : torch.Tensor
        ``(n_posts, k, n_path)`` path features only.

    Also prints the shape of ``x_data``.
    """
    k = model_specifics["k_window"]
    post_tp = model_specifics["post_embedding_tp"]

    path = torch.from_numpy(df_padded[:, :, 3:].astype(float))

    if model_specifics["time_injection_post_tp"] == "timestamp":
        timestamps = df_padded[:, :, 2]
        # Statistics are taken over non-zero timestamps only.
        # NOTE(review): zero entries are still normalised below and the
        # statistics are computed per call — confirm this is intended.
        observed = timestamps[timestamps != 0]
        mean = observed.mean()
        std = observed.std()
        time_feature = (
            torch.from_numpy(timestamps.astype(float)).unsqueeze(1) - mean
        ) / std
    else:
        time_feature = None

    # Which columns supply the current-post embedding (raw-string patterns so
    # that "\w" is passed to the regex engine untouched).
    if post_tp in ("sentence", "sentencelstm", "sentencesiglstm", "windowsigsentence"):
        post_cols = [c for c in df.columns if re.match(r"^e\w*[0-9]", c)]
    elif post_tp == "reduced":
        post_cols = [c for c in df.columns if re.match(r"^d\w*[0-9]", c)]
    else:
        post_cols = None

    if post_cols is not None:
        # (n_posts, dim) -> (n_posts, dim, k): same vector at every position.
        bert_embeddings = (
            torch.tensor(df[post_cols].astype(float).values)
            .unsqueeze(2)
            .repeat(1, 1, k)
        )
    else:
        bert_embeddings = None

    x_data = torch.transpose(path, 1, 2)

    # Identity checks: comparing a tensor with None via != is not a reliable
    # way to test for absence.
    if time_feature is not None:
        x_data = torch.cat((x_data, time_feature), dim=1)
    if bert_embeddings is not None:
        x_data = torch.cat((x_data, bert_embeddings), dim=1)

    if post_tp == "windowsigsentence":
        # Flag, for each lag, whether the row `lag` positions earlier belongs
        # to the same timeline. The full historical list of lags is written so
        # the columns added to `df` stay exactly as before.
        match_lags = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 16, 18, 21, 24)
        for lag in match_lags:
            df["timeline_match" + str(lag)] = df["timeline_id"].eq(
                df.timeline_id.shift(lag)
            )

        # Construct the shifted tensors: 8 history windows, 3 posts apart.
        history_step = 3
        history_windows = 8
        windows = [x_data]
        for lag in range(history_step, history_step * history_windows + 1, history_step):
            shifted = torch.roll(x_data, lag, 0)  # row i now holds row i - lag
            same_timeline = torch.tensor(df["timeline_match" + str(lag)].values)
            # Rows whose source lies in another timeline (or wrapped around
            # the end of the split) carry no valid history: zero them.
            shifted[~same_timeline] = 0
            windows.append(shifted)

        x_data = torch.stack(windows, dim=3)

    print(x_data.shape)

    return x_data, path


# final data for model
# Only the training call keeps `path`; it is read further down to derive the
# number of input channels. With the "windowsigsentence" setting each call
# also adds timeline_match* columns to the DataFrame passed in.
x_train, path = get_time_currentpost(df_padded_train, df_train, model_specifics)
x_valid, _ = get_time_currentpost(df_padded_dev, df_dev, model_specifics)
x_test, _ = get_time_currentpost(df_padded_test, df_test, model_specifics)

torch.Size([4238, 400, 5, 9])
torch.Size([281, 400, 5, 9])
torch.Size([1049, 400, 5, 9])


In [13]:
# Get y labels and test ids
# Labels go through float first and are then stored as int64 tensors, one per
# split (train, dev, test — in that order).
y_train, y_valid, y_test = [
    torch.tensor(frame["label"].astype(float).to_numpy(), dtype=torch.int64)
    for frame in (df_train, df_dev, df_test)
]

# First field of the first row of every test window.
test_pids = torch.tensor(df_padded_test[:, 0, 0].astype(float))

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import signatory
import numpy as np

# ARCHITECTURE 3 - BEST


class StackedDeepLSTMSigNet(nn.Module):
    """Sequence-of-signatures classifier over stacked history windows.

    Expected input ``x``: a 4-D tensor indexed as
    ``(batch, channel, window position, history window)``.

    * Channels ``[:input_channels]`` are the path features fed to each unit.
    * Channel ``input_channels`` is treated as the time feature.
    * Channels ``[input_channels + 1:]`` are the current-post embedding.
    * History window 0 is the current post's own window; the windows are
      processed from the last index down to 0.

    Each history window goes through one unit: optional self-attention,
    Conv1d + tanh (or ``signatory.Augment``), streamed log-signature, LSTM,
    log-signature. The per-window outputs are combined by a bidirectional
    LSTM whose two final hidden states are summed. That summary is merged with
    the time feature and the post embedding according to ``comb_method`` and
    classified by a three-layer feed-forward network.

    Parameters
    ----------
    input_channels : int
        Number of path channels per window.
    output_channels : int
        Output channels of the Conv1d augmentation.
    sig_d : int
        Log-signature depth.
    hidden_dim_lstm : sequence of int
        ``[-2]`` is the hidden size of the per-window LSTM, ``[-1]`` that of
        the LSTM combining the windows.
    post_dim : int
        Number of channels after the path channels (time + post embedding).
    hidden_dim, output_dim : int
        Sizes of the feed-forward head.
    dropout_rate : float
        Dropout probability in the head.
    add_time : bool
        Append the time channel to the augmented path before the first
        log-signature.
    augmentation_tp : str
        ``"Conv1d"`` selects the convolution; any other value selects
        ``signatory.Augment``.
    augmentation_layers : tuple
        ``layer_sizes`` for ``signatory.Augment``.
    comb_method : str
        ``"concatenation"``, ``"gated_addition"``, ``"gated_concatenation"``
        or ``"scaled_concatenation"``.
    attention : bool
        Apply multi-head self-attention to the path before augmentation.
    BiLSTM : bool
        Stored as ``self.BiLSTM``; not used by the layers defined here.
        Previously this value was read from a global of the same name.

    Raises
    ------
    ValueError
        If ``comb_method`` is not one of the supported names.
    """

    def __init__(
        self,
        input_channels,
        output_channels,
        sig_d,
        hidden_dim_lstm,
        post_dim,
        hidden_dim,
        output_dim,
        dropout_rate,
        add_time=False,
        augmentation_tp="Conv1d",
        augmentation_layers=(),
        comb_method="concatenation",
        attention=False,
        BiLSTM=False,
    ):
        super(StackedDeepLSTMSigNet, self).__init__()
        self.input_channels = input_channels
        self.add_time = add_time
        self.augmentation_tp = augmentation_tp
        self.comb_method = comb_method
        self.BiLSTM = BiLSTM
        self.attention = attention
        # Size of the post embedding used by the gated combination methods.
        input_bert_dim = 384

        # self attention
        # NOTE(review): built even when attention is False, so input_channels
        # has to be divisible by the 3 heads in every configuration.
        self.self_attn = nn.MultiheadAttention(
            input_channels, num_heads=3, bias=True
        ).double()

        # Convolution
        self.conv = nn.Conv1d(
            input_channels, output_channels, 3, stride=1, padding=1
        ).double()
        self.augment = signatory.Augment(
            in_channels=input_channels,
            layer_sizes=augmentation_layers,
            kernel_size=3,
            padding=1,
            stride=1,
            include_original=False,
            include_time=False,
        ).double()
        # Non-linearity
        self.tanh1 = nn.Tanh()
        # Signature with lift
        self.signature1 = signatory.LogSignature(depth=sig_d, stream=True)
        if self.add_time:
            input_dim_lstm = signatory.logsignature_channels(output_channels + 1, sig_d)
        else:
            input_dim_lstm = signatory.logsignature_channels(output_channels, sig_d)

        # Signatures and LSTMs for signature windows
        self.lstm_sig1 = nn.LSTM(
            input_size=input_dim_lstm,
            hidden_size=hidden_dim_lstm[-2],
            num_layers=1,
            batch_first=True,
            bidirectional=False,
        ).double()
        self.signature2 = signatory.LogSignature(depth=sig_d, stream=False)

        input_dim_lstmsig = signatory.logsignature_channels(hidden_dim_lstm[-2], sig_d)
        self.lstm_sig2 = nn.LSTM(
            input_size=input_dim_lstmsig,
            hidden_size=hidden_dim_lstm[-1],
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        ).double()
        # Kept for compatibility; forward() does not apply it.
        self.signature3 = signatory.LogSignature(depth=sig_d, stream=False)

        # combination method
        # forward() hands the summed hidden state of lstm_sig2 (width
        # hidden_dim_lstm[-1]) to the combination step without a further
        # log-signature, so every branch is sized from that width.
        summary_dim = hidden_dim_lstm[-1]
        if comb_method == "concatenation":
            input_dim = summary_dim + post_dim
        elif comb_method == "gated_addition":
            input_dim = input_bert_dim
            input_gated_linear = summary_dim + 1  # summary + time feature
            self.fc_scale = nn.Linear(input_gated_linear, input_bert_dim)
            # define the scaler parameter
            self.scaler = torch.nn.Parameter(torch.zeros(1, input_bert_dim))
        elif comb_method == "gated_concatenation":
            input_gated_linear = summary_dim + 1  # summary + time feature
            input_dim = input_bert_dim + input_gated_linear
            # define the scaler parameter
            self.scaler1 = torch.nn.Parameter(torch.zeros(1, input_gated_linear))
        elif comb_method == "scaled_concatenation":
            input_dim = summary_dim + post_dim
            # define the scaler parameter
            self.scaler2 = torch.nn.Parameter(torch.tensor([0.0]))
        else:
            raise ValueError(
                "Unknown comb_method {!r}; expected 'concatenation', "
                "'gated_addition', 'gated_concatenation' or "
                "'scaled_concatenation'".format(comb_method)
            )

        # Linear function
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        # Non-linearity
        self.relu1 = nn.ReLU()
        # Dropout
        self.dropout = nn.Dropout(dropout_rate)
        # Linear function 2:
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        # Non-linearity 2
        self.relu2 = nn.ReLU()
        # Linear function 3 (readout):
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def _unit_deepsignet(self, u):
        """Encode one history window.

        ``u`` is indexed as ``(batch, channel, window position)``. The first
        ``input_channels`` channels are the path; when ``add_time`` is set the
        next channel is read as the time feature.
        """
        # Attention and augmentation only ever see the path channels.
        path = u[:, : self.input_channels, :]

        if self.attention:
            # self attention: (batch, channel, pos) -> (pos, batch, channel)
            out = torch.transpose(path, 0, 1)
            out = torch.transpose(out, 0, 2)
            out_att = self.self_attn(out, out, out)
            # back to (batch, channel, pos)
            out = torch.transpose(out_att[0], 0, 1)
            out = torch.transpose(out, 1, 2)
        else:
            out = path

        # Convolution
        if self.augmentation_tp == "Conv1d":
            out = self.conv(out)  # get only the path information
            out = self.tanh1(out)
            out = torch.transpose(out, 1, 2)  # swap dimensions
        else:
            out = self.augment(torch.transpose(out, 1, 2))

        # Add time for signature
        if self.add_time:
            out = torch.cat(
                (
                    out,
                    torch.transpose(
                        u[:, self.input_channels : (self.input_channels + 1), :], 1, 2
                    ),
                ),
                dim=2,
            )

        # Signature
        out = self.signature1(out)
        out, (_, _) = self.lstm_sig1(out)
        # Signature
        out = self.signature2(out)
        return out

    def _current_post_time(self, x):
        """Time feature of the current window: maximum over window positions."""
        return x[:, self.input_channels : (self.input_channels + 1), :, 0].max(2)[0]

    def forward(self, x):
        """Return the class scores (``(batch, output_dim)``) for a batch ``x``."""
        # A unit needs the time channel only when add_time is set; otherwise
        # it receives exactly the path channels.
        unit_channels = self.input_channels + (1 if self.add_time else 0)

        # deepsig net for each history window, from the last window index
        # down to index 0 (the current post's own window)
        units = [
            self._unit_deepsignet(x[:, :unit_channels, :, window]).unsqueeze(1)
            for window in range(x.shape[3] - 1, -1, -1)
        ]
        out = torch.cat(units, dim=1)

        # LSTM that combines all deepsignet windows together
        _, (out, _) = self.lstm_sig2(out)
        # Sum of the final hidden states of the two directions.
        out = out[-1, :, :] + out[-2, :, :]
        # out = self.signature3(out)

        # Embedding of the current post: first position of window 0.
        post_embedding = x[:, (self.input_channels + 1) :, 0, 0]

        # Combine Last Post Embedding
        if self.comb_method == "concatenation":
            out = torch.cat(
                (out, self._current_post_time(x), post_embedding),
                dim=1,
            )
        elif self.comb_method == "gated_addition":
            out_gated = torch.cat((out, self._current_post_time(x)), dim=1)
            out_gated = self.fc_scale(out_gated.float())
            out_gated = self.tanh1(out_gated)
            out_gated = torch.mul(self.scaler, out_gated)
            # addition with bert output
            out = out_gated + post_embedding
        elif self.comb_method == "gated_concatenation":
            out_gated = torch.cat((out, self._current_post_time(x)), dim=1)
            out_gated = torch.mul(self.scaler1, out_gated)
            # concatenation with bert output
            out = torch.cat((out_gated, post_embedding), dim=1)
        elif self.comb_method == "scaled_concatenation":
            out_gated = torch.cat((out, self._current_post_time(x)), dim=1)
            out_gated = self.scaler2 * out_gated
            # concatenation with bert output
            out = torch.cat((out_gated, post_embedding), dim=1)

        # FFN: Linear function 1
        out = self.fc1(out.float())
        # Non-linearity 1
        out = self.relu1(out)
        # Dropout
        out = self.dropout(out)

        # FFN: Linear function 2
        out = self.fc2(out)
        # Non-linearity 2
        out = self.relu2(out)
        # Dropout
        out = self.dropout(out)

        # FFN: Linear function 3 (readout)
        out = self.fc3(out)
        return out


########################

Run Seq-Sig-Net

In [16]:
from sklearn import metrics
import random
from datetime import date
import math

from classification_utils import Folds, set_seed, validation, training, testing
from deepsignatureffn import FocalLoss, ClassBalanced_FocalLoss
from deepsignatureffn import StackedDeepSigNet

# ================================
# When True, a results pickle is written every time the validation macro-F1
# reaches a new best during training.
save_results = True
# ================================
# Look-back window size, taken from the notebook-level configuration.
k = model_specifics["k_window"]

# GLOBAL MODEL PARAMETERS
# List-valued entries below are grid-search axes: every combination is trained.
input_dim = x_train.shape[1]
output_channels = [10]  # [10,12]
hidden_dim = [64]  # [32, 64]
output_dim = 2
loss = model_specifics["loss_function"]  #'focal' #cbfocal
dropout_rate = [0.1]

# History time injection follows the configuration. The original cell computed
# this value and then unconditionally overwrote it with False a few lines later,
# which silently ignored the flag that is also encoded in `model_code_name`.
# With the checked-in configuration (time_injection_history_tp = None) the
# value is still False.
add_time = model_specifics["time_injection_history_tp"] == "timestamp"

# NOTE(review): sig_d and augmentation_tp are hard-coded here although
# model_specifics carries "signature_dimensions" and "augmentation_tp" entries
# with the same values - confirm whether they should be read from the config.
sig_d = 3
hidden_dim_lstm = [(10, 300)]
input_channels = path.shape[2]
history_len = x_train.shape[3]
# Channels of x_train left over once the `input_channels` path channels are removed.
post_dim = x_train.shape[1] - input_channels
augmentation_tp = "Conv1d"
augmentation_layers = ()
attention = False
BiLSTM = False
blocks = 2
# ================================
# OPTIMISATION / EARLY-STOPPING PARAMETERS
num_epochs = 100
learning_rate = [0.0003]  # [0.0001, 0.0003]
gamma = [3]  # [2,3]
beta = 0.999  # only used by the "cbfocal" loss
BATCH_SIZE = 64
NUM_folds = 1
patience = 4
weight_decay_adam = 0.0001
RANDOM_SEED_list = [0]  # [0, 1, 12, 123, 1234]

# ================================
# The run identifier is the configuration fields joined with "_". The
# dimensionality-reduction segment (type + number of components) is only
# present when reduction is switched on; every other segment is shared, so the
# name is assembled once instead of in two copy-pasted branches that have to be
# edited in lock-step.
name_segments = [
    model_specifics["data"],
    model_specifics["global_embedding_tp"],
]
if model_specifics["dimensionality_reduction"]:
    name_segments.append(
        str(model_specifics["dimensionality_reduction_tp"])
        + str(model_specifics["dimensionality_reduction_components"])
    )
# History and post time-injection types are fused into a single segment.
name_segments.append(
    str(model_specifics["time_injection_history_tp"])
    + str(model_specifics["time_injection_post_tp"])
)
name_segments.extend(
    str(model_specifics[field])
    for field in (
        "post_embedding_tp",
        "feature_combination_method",
        "signature_tp",
        "signature_dimensions",
        "classifier_name",
        "loss_function",
        "k_window",
        "classes_num",
    )
)
model_code_name = "_".join(name_segments)


from itertools import product

FOLDER_models = "/storage/ttseriotou/rumour_eval/models/v1/"
FOLDER_results = "/storage/ttseriotou/rumour_eval/results/v1/"

# The tensors do not change during the grid search, so the datasets are built
# once here instead of once per fold.
train = torch.utils.data.TensorDataset(x_train, y_train)
valid = torch.utils.data.TensorDataset(x_valid, y_valid)
# x_test is extended along dim 1 with one all-zero slice of shape
# (k, history_len) per sample.
# NOTE(review): presumably this pads x_test to the channel count of x_train -
# confirm against the data-preparation cell.
test = torch.utils.data.TensorDataset(
    torch.cat(
        (
            x_test,
            torch.zeros(test_pids.shape)
            .unsqueeze(1)
            .unsqueeze(2)
            .unsqueeze(3)
            .repeat(1, 1, k, history_len),
        ),
        1,
    ),
    y_test,
)

# ================================
# K FOLD RUNS
ft_i = 0  # run number
# product() walks the grid in the same order as the equivalent nested loops
# (the last axis varies fastest).
for out_ch, lr, g, dp, h_dim, lstm_dim in product(
    output_channels,
    learning_rate,
    gamma,
    dropout_rate,
    hidden_dim,
    hidden_dim_lstm,
):
    # out_ch =  aug_l[2]
    str_version = "tuning" + str(ft_i)
    print(
        "lr=",
        lr,
        " g=",
        g,
        " dp=",
        dp,
        " h_dim=",
        h_dim,
        " lstm_dim=",
        lstm_dim,
    )
    ft_i += 1

    # Everything needed to reproduce this run; stored inside the results pickle.
    classifier_params = {
        "augmentation_tp": augmentation_tp,
        "input_channels": input_channels,
        "output_channels": out_ch,
        "augmentation_layers": augmentation_layers,
        "sig_d": sig_d,
        "post_dim": post_dim,
        "hidden_dim_lstm": lstm_dim,
        "hidden_dim": h_dim,
        "output_dim": output_dim,
        "dropout_rate": dp,
        "num_epochs": num_epochs,
        "learning_rate": lr,
        "BiLSTM": BiLSTM,
        "blocks": blocks,
        "gamma": g,
        "k_window": k,
        "BATCH_SIZE": BATCH_SIZE,
        "NUM_folds": NUM_folds,
        "patience": patience,
        "weight_decay_adam": weight_decay_adam,
        "RANDOM_SEED_list": RANDOM_SEED_list,
    }

    for my_ran_seed in RANDOM_SEED_list:
        set_seed(my_ran_seed)
        # NOTE(review): this generator is seeded but never handed to the
        # DataLoaders below, so it does not influence shuffling.
        myGenerator = torch.Generator()
        myGenerator.manual_seed(my_ran_seed)
        for test_fold in range(NUM_folds):
            print("Starting random seed #", my_ran_seed)

            # data loaders with batches
            train_loader = torch.utils.data.DataLoader(
                dataset=train, batch_size=BATCH_SIZE, shuffle=True
            )
            valid_loader = torch.utils.data.DataLoader(
                dataset=valid, batch_size=BATCH_SIZE, shuffle=True
            )
            test_loader = torch.utils.data.DataLoader(
                dataset=test, batch_size=BATCH_SIZE, shuffle=True
            )

            # early stopping params
            last_metric = 0
            trigger_times = 0
            best_metric = 0

            # model definitions
            # model = StackedDeepSigNet(input_channels, out_ch, sig_d, lstm_dim, post_dim, h_dim, output_dim, dp, add_time, augmentation_tp, augmentation_layers, BiLSTM, comb_method=model_specifics['feature_combination_method'], blocks=blocks)
            model = StackedDeepLSTMSigNet(
                input_channels,
                out_ch,
                sig_d,
                lstm_dim,
                post_dim,
                h_dim,
                output_dim,
                dp,
                add_time,
                augmentation_tp,
                augmentation_layers,
                comb_method=model_specifics["feature_combination_method"],
                attention=attention,
            )

            # loss function
            if loss == "focal":
                # Per-class weight: sqrt(1 / class frequency in y_train).
                alpha_values = torch.Tensor(
                    [
                        math.sqrt(
                            1 / (y_train[y_train == cls].shape[0] / y_train.shape[0])
                        )
                        for cls in range(output_dim)
                    ]
                )
                criterion = FocalLoss(gamma=g, alpha=alpha_values)
            elif loss == "cbfocal":
                classifier_params["beta"] = beta
                # The class count follows output_dim. The original hard-coded
                # three classes here although the model emits output_dim (= 2)
                # logits, which left class 2 with a sample count of zero.
                samples_count = torch.Tensor(
                    [y_train[y_train == cls].shape[0] for cls in range(output_dim)]
                )
                criterion = ClassBalanced_FocalLoss(
                    gamma=g,
                    beta=beta,
                    no_of_classes=output_dim,
                    samples_per_cls=samples_count,
                )
            else:
                # Without this guard an unknown loss name only surfaced later as
                # a NameError on `criterion`.
                raise ValueError(
                    "Unsupported loss_function: "
                    + repr(loss)
                    + " (expected 'focal' or 'cbfocal')"
                )
            optimizer = torch.optim.Adam(
                model.parameters(),
                lr=lr,
                weight_decay=weight_decay_adam,
            )

            # model train/validation per epoch
            for epoch in range(num_epochs):
                training(
                    model,
                    train_loader,
                    criterion,
                    optimizer,
                    epoch,
                    num_epochs,
                )

                # Early stopping
                _, f1_v, labels_val, predicted_val = validation(
                    model, valid_loader, criterion
                )

                print("Current Macro F1:", f1_v)

                if f1_v > best_metric:
                    best_metric = f1_v

                    # test and save so far best model
                    (
                        predicted_test,
                        labels_test,
                        pids_test,
                    ) = testing(model, test_loader)

                    results = {
                        "model_code_name": model_code_name,
                        "model_specifics": model_specifics,
                        "classifier_params": classifier_params,
                        "date_run": date.today().strftime("%d/%m/%Y"),
                        "test_pids": pids_test,  # test_pids,
                        "labels": labels_test,
                        "predictions": predicted_test,
                        "labels_val": labels_val,
                        "predicted_val": predicted_val,
                        "test_fold": test_fold,
                        "random_seed": my_ran_seed,
                        "epoch": epoch,
                    }

                    if save_results:
                        # For hyper-parameter tuning runs, insert
                        # "_" + str_version before ".pkl" in both names; the
                        # tuning-analysis cell looks for "tuning<N>." in the
                        # file name. With the current naming every grid point
                        # and fold of a seed overwrites the same file.
                        run_file = (
                            model_code_name + "_" + str(my_ran_seed) + "seed" + ".pkl"
                        )
                        file_name_results = FOLDER_results + run_file
                        file_name_model = FOLDER_models + run_file
                        # Context manager so the handle is flushed and closed
                        # even if pickling fails.
                        with open(file_name_results, "wb") as results_file:
                            pickle.dump(results, results_file)
                        # torch.save(model.state_dict(), file_name_model)

                # trigger_times counts consecutive epochs whose validation F1 fell
                # below the PREVIOUS epoch (not below the best so far); any
                # non-decreasing epoch resets the counter.
                if f1_v < last_metric:
                    trigger_times += 1
                    print("Trigger Times:", trigger_times)

                    if trigger_times >= patience:
                        print("Early stopping!")
                        break

                else:
                    print("Trigger Times: 0")
                    trigger_times = 0

                last_metric = f1_v

lr= 0.0003  g= 3  h_dim= 64
Starting random seed # 0
[0/100, 0/67] loss: 0.11752356
Current Macro F1: 66.59561385529823
Trigger Times: 0
[1/100, 0/67] loss: 0.097271957
Current Macro F1: 64.03723094822571
Trigger Times: 1
[2/100, 0/67] loss: 0.090587251
Current Macro F1: 62.6982463494725
Trigger Times: 2
[3/100, 0/67] loss: 0.075255901
Current Macro F1: 61.185953166387705
Trigger Times: 3
[4/100, 0/67] loss: 0.08509589
Current Macro F1: 61.23592342342342
Trigger Times: 0
[5/100, 0/67] loss: 0.088144451
Current Macro F1: 63.23779968358282
Trigger Times: 0
[6/100, 0/67] loss: 0.10385884
Current Macro F1: 61.83724467622773
Trigger Times: 1
[7/100, 0/67] loss: 0.074643873
Current Macro F1: 63.483546426325724
Trigger Times: 0
[8/100, 0/67] loss: 0.082003854
Current Macro F1: 60.285137972199735
Trigger Times: 1
[9/100, 0/67] loss: 0.096544996
Current Macro F1: 61.37851262162346
Trigger Times: 0
[10/100, 0/67] loss: 0.075283684
Current Macro F1: 63.627958579881664
Trigger Times: 0
[11/100, 0/

In [17]:
# Display the run identifier assembled from `model_specifics` in the training cell.
model_code_name

'Rumoureval_SBERT_umap15_Nonetimestamp_windowsigsentence_concatenation_log_3_Seq-Sig-Net_focal_2class'

In [18]:
import os
from os import listdir
from os.path import isfile, join


def process_model_results2(model_code_name, FOLDER_results, type="Talklife"):
    """Average the per-seed test classification reports stored for one model.

    Result pickles in ``FOLDER_results`` whose file name contains
    ``model_code_name`` (and does not contain ``"tuning"``) are grouped by
    random seed. For every seed the stored test labels/predictions of all its
    files are concatenated, a classification report is computed, and the
    reports are averaged over the seeds.

    Args:
        model_code_name: Substring identifying the model's result files.
        FOLDER_results: Directory containing the result pickles; a trailing
            separator is optional.
        type: Dataset flavour. With the default ``"Talklife"``, files whose
            name contains ``"Reddit"`` are skipped as well. (The name shadows
            the builtin but is kept so keyword callers keep working.)

    Returns:
        pandas.DataFrame indexed by "No Change", "Change", "accuracy",
        "macro avg" and "weighted avg" with columns precision, recall,
        f1-score and support, averaged over the seeds listed in
        ``classifier_params["RANDOM_SEED_list"]`` of the first matching file.

    Raises:
        FileNotFoundError: If no result file matches ``model_code_name``.
    """
    skip_reddit = type == "Talklife"
    per_model_files = [
        f
        for f in listdir(FOLDER_results)
        if model_code_name in f
        and "tuning" not in f
        and not (skip_reddit and "Reddit" in f)
    ]

    print("There are ", len(per_model_files), " files")
    if not per_model_files:
        # Previously this surfaced as an uninformative IndexError below.
        raise FileNotFoundError(
            "No result files containing "
            + repr(model_code_name)
            + " found in "
            + repr(FOLDER_results)
        )

    metrics_overall = pd.DataFrame(
        0,
        index=["No Change", "Change", "accuracy", "macro avg", "weighted avg"],
        columns=["precision", "recall", "f1-score", "support"],
    )
    # NOTE: pickle.load can execute arbitrary code - only point this function
    # at result folders you trust.
    with open(join(FOLDER_results, per_model_files[0]), "rb") as fin:
        results0 = pickle.load(fin)
    seed_list = results0["classifier_params"]["RANDOM_SEED_list"]

    for my_ran_seed in seed_list:
        labels_final = torch.empty((0))
        predicted_final = torch.empty((0))

        # Result files are written as "<model_code_name>_<seed>seed...", so the
        # leading underscore is part of the tag. Matching on "<seed>seed" alone
        # would let seed 1 also pick up the files of seed 11, 21, ...
        seed_tag = "_" + str(my_ran_seed) + "seed"
        seed_files = [f for f in per_model_files if seed_tag in f]
        for sf in seed_files:
            with open(join(FOLDER_results, sf), "rb") as fin:
                results = pickle.load(fin)

            # for each seed combine fold results
            labels_final = torch.cat([labels_final, results["labels"]])
            predicted_final = torch.cat([predicted_final, results["predictions"]])

        # calculate metrics for each seed
        metrics_tab = metrics.classification_report(
            labels_final,
            predicted_final,
            target_names=["No Change", "Change"],
            output_dict=True,
        )
        metrics_tab = pd.DataFrame(metrics_tab).transpose()
        # combine the metrics with the rest of the seeds in order to take average at the end
        metrics_overall += metrics_tab

    return metrics_overall / len(seed_list)

In [19]:
##CURRENT - TEST
# NOTE(review): `process_model_results` is imported here, but the call at the
# end of this cell uses the notebook-local `process_model_results2`.
from classification_utils import process_model_results

# Same results folder the training cell writes its pickles to.
FOLDER_results = "/storage/ttseriotou/rumour_eval/results/v1/"

# NOTE(review): this prefix has no k_window segment ("..._focal_2class"), whereas
# the training cell builds "..._focal_<k_window>_2class". Because files are
# matched by substring, pickles written under the newer naming are not picked up
# here - confirm this is the intended results file.
model_code_name = "Rumoureval_SBERT_umap15_Nonetimestamp_windowsigsentence_concatenation_log_3_Seq-Sig-Net_focal_2class"
process_model_results2(model_code_name, FOLDER_results, type="Rumoureval")

There are  1  files


Unnamed: 0,precision,recall,f1-score,support
No Change,0.852941,0.422587,0.565164,549.0
Change,0.592021,0.92,0.720439,500.0
accuracy,0.659676,0.659676,0.659676,0.659676
macro avg,0.722481,0.671293,0.642801,1049.0
weighted avg,0.728575,0.659676,0.639175,1049.0


In [2]:
# BEST K MODELS - TEST LOOP (window k=5)
import pandas as pd
import numpy as np
import torch
import pickle
from os import listdir
from os.path import isfile, join
from sklearn import metrics
from collections import Counter

# Number of top-ranked tuning runs (by validation macro-F1) to report on test.
k = 5
FOLDER_results = "/storage/ttseriotou/rumour_eval/results/v1/"

model_code_name = "Rumoureval_SBERT_umap15_Nonetimestamp_windowsigsentence_concatenation_log_3_Seq-Sig-Net_focal_5_2class"
metrics_overall = pd.DataFrame(
    0,
    index=["No Change", "Change", "accuracy", "macro avg", "weighted avg"],
    columns=["precision", "recall", "f1-score", "support"],
)

# get all tuning files
per_model_files = [
    f
    for f in listdir(FOLDER_results)
    if "tuning" in f and model_code_name in f and "Reddit" not in f
]

# get the indices of tuning files: the last "_"-separated token before the
# first "." is "tuning<N>"
files_ind = []
for fname in per_model_files:
    stem = fname[: fname.index(".")]
    files_ind.append(int(stem.split("_")[-1].replace("tuning", "")))
files_ind = list(set(files_ind))
dict_f1 = {}


def _report_for_tuning_run(tuning_index, labels_key, predictions_key):
    """Build the classification report for one tuning index.

    All files of the run are loaded, the tensors stored under `labels_key` and
    `predictions_key` are concatenated across files, and the report is
    returned as a DataFrame together with the last loaded results dict.
    """
    marker = "tuning" + str(tuning_index) + "."
    labels_parts = [torch.empty((0))]
    predicted_parts = [torch.empty((0))]
    last_results = None
    for run_file in (f for f in per_model_files if marker in f):
        with open(FOLDER_results + run_file, "rb") as fin:
            last_results = pickle.load(fin)
        # for each seed combine fold results
        labels_parts.append(last_results[labels_key])
        predicted_parts.append(last_results[predictions_key])

    report = metrics.classification_report(
        torch.cat(labels_parts),
        torch.cat(predicted_parts),
        target_names=["No Change", "Change"],
        output_dict=True,
    )
    return pd.DataFrame(report).transpose(), last_results


print(files_ind)
# Pass 1: score every tuning run on its validation predictions.
for t in files_ind:
    val_tab, _ = _report_for_tuning_run(t, "labels_val", "predicted_val")
    dict_f1[t] = val_tab["f1-score"]["macro avg"]

dict_f1 = Counter(dict_f1)

# Pass 2: for the k best runs, print hyper-parameters and the test-set report.
for best_index, _best_f1 in dict_f1.most_common()[:k]:
    metrics_tab, results = _report_for_tuning_run(best_index, "labels", "predictions")
    params = results["classifier_params"]
    print(
        "lr=",
        params["learning_rate"],
        "dropout= ",
        params["dropout_rate"],
        "h_dim=",
        params["hidden_dim"],
        "lstm_dim=",
        params["hidden_dim_lstm"],
        "gamma=",
        params["gamma"],
        "out channels=",
        params["output_channels"],
    )
    print(metrics_tab)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95]
lr= 0.0001 dropout=  0.25 h_dim= 32 lstm_dim= (12, 300) gamma= 2 out channels= 12
              precision    recall  f1-score      support
No Change      0.728814  0.391621  0.509479   549.000000
Change         0.557029  0.840000  0.669856   500.000000
accuracy       0.605338  0.605338  0.605338     0.605338
macro avg      0.642921  0.615811  0.589668  1049.000000
weighted avg   0.646933  0.605338  0.585922  1049.000000
lr= 0.0001 dropout=  0.25 h_dim= 32 lstm_dim= (12, 300) gamma= 2 out channels= 12
              precision    recall  f1-score      support
No Change      0.728814  0.391621  0.509479   549.000000
Chang