<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/10_19_22_SBERT%2B_BigGraph_Embeddings_NN_gossipcop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
#%%capture
!pip install -U sentence-transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


The notebook for Gossipcop. (Similar one present for Politifact is already saved in the Drive and on Github) 

In [25]:
from pathlib import Path
import numpy as np
import pandas as pd
import re
import json

from sentence_transformers import SentenceTransformer 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import wandb
from wandb.keras import WandbCallback
from IPython.display import clear_output

In [26]:
metrics = [accuracy_score, f1_score, precision_score, recall_score]

def get_name(score_func):
    return 'eval/' + score_func.__name__.split("_")[0]

# Load data

In [27]:
from google.colab import drive
drive.mount('/content/drive')

base_dir = Path("/content/drive/MyDrive/ResearchFND")
assert base_dir.exists()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
dataset_id = 'gossipcop'

In [29]:
import ast

converters = {"retweets":ast.literal_eval, "tweets":ast.literal_eval}
df = pd.read_csv(base_dir/f"{dataset_id}_agg.csv", converters=converters)
df.head()

Unnamed: 0,title,text,tweets,retweets,label,url,num_retweets,log_num_retweets,num_tweets,log_num_tweets
0,Kendall Kylie Jenner Jenner NOT Upset Up...,,[],"[995423424741888001, 995461685166202880, 99987...",fake,,3,1.386294,0,0.0
1,Kim Kardashian Dethroned Dethroned By Khlo...,,[],"[848843565027516416, 849030801970868224, 84884...",fake,,3,1.386294,0,0.0
2,Kim Kardashian Did NOT Hot Staffer Hot Sta...,,[],"[940685393112064001, 977921622672920576, 94031...",fake,,8,2.197225,0,0.0
3,The Voice The Voice Team NOT Surprised Sur...,,[],[],fake,,0,0.0,0,0.0
4,Drake NOT Angelina Jolie s Toy Boy Toy Boy ...,,"[{'id': 948630026496323585, 'text': 'Drake NOT...","[948022124626808832, 948630026496323585, 94801...",fake,,18,2.944439,7,2.079442


# Using the embeddings for title and text from previous experiments

In [30]:
title_embedding_file = base_dir/f"{dataset_id}_sbert_title_embeddings.npy"

if title_embedding_file.exists():
    title_embeddings = np.load(title_embedding_file)
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)
    titles = df.title.tolist()
    title_embeddings = model.encode(titles, show_progress_bar=True)
    np.save(title_embedding_file, title_embeddings)

In [31]:
text_embedding_file = base_dir/f"{dataset_id}_sbert_fulltext_embeddings.npy"

if text_embedding_file.exists():
    text_embeddings = np.load(text_embedding_file)
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)
    texts = (df.title + "\n" + df.text).tolist()
    text_embeddings = model.encode(texts, show_progress_bar=True)
    np.save(text_embedding_file, text_embeddings)

# Training - Saving the machine learning model specifications

In [32]:
import tensorflow as tf

In [33]:
def get_model(d_h=64, p_in:float=0.2, p_out:float=0.2):
    model = tf.keras.Sequential([
        tf.keras.layers.Dropout(p_in), # input dropout
        tf.keras.layers.Dense(d_h, activation='relu'),
        tf.keras.layers.Dropout(p_out),  
        tf.keras.layers.Dense(1)
    ])

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
                  optimizer='adam',metrics=['accuracy'])
    return model

In [34]:
def train(fold_id, train_idx, test_idx, params):

    # setup
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    model = get_model()
    # training
    ckpt_filepath = f'ckpt/fold_{fold_id}'
    save_model_cb = tf.keras.callbacks.ModelCheckpoint(
        filepath=ckpt_filepath,
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True)
    history = model.fit(
        X_train, y_train, validation_data = (X_test, y_test), 
        batch_size = params['bs'] , epochs=params['epochs'],
        callbacks=[WandbCallback(), save_model_cb]
    )
    #evaluation
    model.load_weights(ckpt_filepath)
    logits = model.predict(X_test).squeeze()
    y_pred = (logits > 0.).astype(int)
    eval_results = {get_name(f):f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})
    return history

### PyTorch_BigGraph embeddings

In [35]:
edim = 128
bg_embeddings = np.load(base_dir/f'{dataset_id}_pt_biggraph_article_embeddings_{edim}.npy')
idx = np.load(base_dir/f"{dataset_id}_pt_biggraph_article_idx_{edim}.npy")

In [36]:
graph_embeddings = np.zeros((text_embeddings.shape[0], edim))
graph_embeddings[idx] = bg_embeddings

In [37]:
params = {
    "epochs":400,
    "bs":128,
    # 'lr':1e-3
}

In [38]:
X = np.concatenate([
    text_embeddings, 
    df.log_num_retweets.to_numpy()[..., None], 
    df.log_num_tweets.to_numpy()[..., None],
    graph_embeddings], axis=1)
y = (df.label=="real").to_numpy().astype(int)

X.shape, y.shape

((19968, 898), (19968,))

In [39]:
skf = StratifiedKFold(shuffle=True, random_state=124)

Creating a group and logging all train, test data in wandb for better tracking and visualisation.

In [40]:
WANDB_ENTITY = 'saloniteam'
WANDB_PROJECTS = 'nofolds'

GROUP = f"{dataset_id}-fulltext-sbert+tw+ptbg128-mlp-e400"

for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    clear_output()
    with wandb.init(entity=WANDB_ENTITY, project=WANDB_ENTITY, group=GROUP, name=f"{GROUP}-{fold_id}") as run:
        train(fold_id, train_idx, test_idx, params)
    break

[34m[1mwandb[0m: Currently logged in as: [33msaloni[0m ([33msaloniteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 2/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 3/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 4/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 5/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 6/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 7/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 8/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 9/400
Epoch 10/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 11/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 12/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 13/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 14/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 15/400
Epoch 16/400
Epoch 17/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 18/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 19/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 20/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 25/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 31/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 32/400
Epoch 33/400
Epoch 34/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 35/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 36/400
Epoch 37/400
Epoch 38/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105756-dafj960p/files/model-best)... Done. 0.1s


Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78/400
Epoch 79/400
Epoch 80/400
Epoch 81/400
Epoch 82/400
Epoch 83/400
Epoch 84/400
Epoch 85/400
Epoch 86/400
Epoch 87/400
Epoch 88/400
Epoch 89/400
Epoch 90/400
Epoch 91/400
Epoch 92/400
Epoch 93/400
Epoch 94/400
Epoch 95/400
Epoch 96/400
Epoch 97/400
Epoch 98/400
Epoch 99/400
Epoch 100/400
Epoch 101/400
Epoch 102/400
Epoch 103/400
Epoch 104/400
Epoch 105/400
Epoch 106/400
Epoch 107/400
Epoch 108/400
Epoch 109/400
Epoch 110/400
Epoch 111/400
Epoch 112/400
Epoch 113/400
Epoch 114/400
Epoch 115/400
Epoch 116/400
Epoch 117/400
Epoch 118/400
Epoch 119/400
Epoch 120/400
Epoch 121/400
Epo

VBox(children=(Label(value='19.574 MB of 19.575 MB uploaded (0.007 MB deduped)\r'), FloatProgress(value=0.9999…

0,1
accuracy,▁▃▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇█▇▇██▇███████████
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval/accuracy,▁
eval/f1,▁
eval/precision,▁
eval/recall,▁
loss,█▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▄▇▇▆▆▆▆███▇▆▇▆▇▇▇▆▇▆▆▆▅▅▇▆▆▅▇▆▆▇▆▆▇▅▅▆▆
val_loss,▆▃▂▁▁▁▁▁▁▂▂▁▂▂▂▃▃▃▄▃▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇█▇█

0,1
accuracy,0.9585
best_epoch,45.0
best_val_loss,0.29036
epoch,399.0
eval/accuracy,0.88332
eval/f1,0.92689
eval/precision,0.89842
eval/recall,0.95723
loss,0.10251
val_accuracy,0.8663
