<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/10_19_22_SBERT%2B_BigGraph_Embeddings_NN_gossipcop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
#%%capture
!pip install -U sentence-transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


The notebook for Gossipcop. (Similar one present for Politifact is already saved in the Drive and on Github) 

In [42]:
from pathlib import Path
import numpy as np
import pandas as pd
import re
import json

from sentence_transformers import SentenceTransformer 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import wandb
from wandb.keras import WandbCallback
from IPython.display import clear_output

In [43]:
metrics = [accuracy_score, f1_score, precision_score, recall_score]

def get_name(score_func):
    return 'eval/' + score_func.__name__.split("_")[0]

# Load data

In [44]:
from google.colab import drive
drive.mount('/content/drive')

base_dir = Path("/content/drive/MyDrive/ResearchFND")
assert base_dir.exists()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [45]:
dataset_id = 'gossipcop'

In [46]:
import ast

converters = {"retweets":ast.literal_eval, "tweets":ast.literal_eval}
df = pd.read_csv(base_dir/f"{dataset_id}_agg.csv", converters=converters)
df.head()

Unnamed: 0,title,text,tweets,retweets,label,url,num_retweets,log_num_retweets,num_tweets,log_num_tweets
0,Kendall Kylie Jenner Jenner NOT Upset Up...,,[],"[995423424741888001, 995461685166202880, 99987...",fake,,3,1.386294,0,0.0
1,Kim Kardashian Dethroned Dethroned By Khlo...,,[],"[848843565027516416, 849030801970868224, 84884...",fake,,3,1.386294,0,0.0
2,Kim Kardashian Did NOT Hot Staffer Hot Sta...,,[],"[940685393112064001, 977921622672920576, 94031...",fake,,8,2.197225,0,0.0
3,The Voice The Voice Team NOT Surprised Sur...,,[],[],fake,,0,0.0,0,0.0
4,Drake NOT Angelina Jolie s Toy Boy Toy Boy ...,,"[{'id': 948630026496323585, 'text': 'Drake NOT...","[948022124626808832, 948630026496323585, 94801...",fake,,18,2.944439,7,2.079442


# Using the embeddings for title and text from previous experiments

In [47]:
title_embedding_file = base_dir/f"{dataset_id}_sbert_title_embeddings.npy"

if title_embedding_file.exists():
    title_embeddings = np.load(title_embedding_file)
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)
    titles = df.title.tolist()
    title_embeddings = model.encode(titles, show_progress_bar=True)
    np.save(title_embedding_file, title_embeddings)

In [48]:
text_embedding_file = base_dir/f"{dataset_id}_sbert_fulltext_embeddings.npy"

if text_embedding_file.exists():
    text_embeddings = np.load(text_embedding_file)
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)
    texts = (df.title + "\n" + df.text).tolist()
    text_embeddings = model.encode(texts, show_progress_bar=True)
    np.save(text_embedding_file, text_embeddings)

# Training - Saving the machine learning model specifications

In [49]:
import tensorflow as tf

In [50]:
def get_model(d_h=64, p_in:float=0.2, p_out:float=0.2):
    model = tf.keras.Sequential([
        tf.keras.layers.Dropout(p_in), # input dropout
        tf.keras.layers.Dense(d_h, activation='relu'),
        tf.keras.layers.Dropout(p_out),  
        tf.keras.layers.Dense(1)
    ])

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
                  optimizer='adam',metrics=['accuracy'])
    return model

In [51]:
def train(fold_id, train_idx, test_idx, params):

    # setup
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    model = get_model()
    # training
    ckpt_filepath = f'ckpt/fold_{fold_id}'
    save_model_cb = tf.keras.callbacks.ModelCheckpoint(
        filepath=ckpt_filepath,
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True)
    history = model.fit(
        X_train, y_train, validation_data = (X_test, y_test), 
        batch_size = params['bs'] , epochs=params['epochs'],
        callbacks=[WandbCallback(), save_model_cb]
    )
    #evaluation
    model.load_weights(ckpt_filepath)
    logits = model.predict(X_test).squeeze()
    y_pred = (logits > 0.).astype(int)
    eval_results = {get_name(f):f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})
    return history

### PyTorch_BigGraph embeddings

In [52]:
edim = 128
bg_embeddings = np.load(base_dir/f'{dataset_id}_pt_biggraph_article_embeddings_{edim}.npy')
idx = np.load(base_dir/f"{dataset_id}_pt_biggraph_article_idx_{edim}.npy")

In [53]:
graph_embeddings = np.zeros((text_embeddings.shape[0], edim))
graph_embeddings[idx] = bg_embeddings

In [54]:
params = {
    "epochs":600,
    "bs":128,
    # 'lr':1e-3
}

In [55]:
X = np.concatenate([
    text_embeddings, 
    df.log_num_retweets.to_numpy()[..., None], 
    df.log_num_tweets.to_numpy()[..., None],
    graph_embeddings], axis=1)
y = (df.label=="real").to_numpy().astype(int)

X.shape, y.shape

((19968, 898), (19968,))

In [56]:
skf = StratifiedKFold(shuffle=True, random_state=124)

Creating a group and logging all train, test data in wandb for better tracking and visualisation.

In [57]:
WANDB_ENTITY = 'saloniteam'
WANDB_PROJECTS = 'nofolds'

GROUP = f"{dataset_id}-fulltext-sbert+tw+ptbg128-mlp-e600"

for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    clear_output()
    with wandb.init(entity=WANDB_ENTITY, project=WANDB_ENTITY, group=GROUP, name=f"{GROUP}-{fold_id}") as run:
        train(fold_id, train_idx, test_idx, params)
    break

Epoch 1/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 2/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 3/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 4/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 5/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 6/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 7/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 8/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 9/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 10/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 11/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 12/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 13/600
Epoch 14/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 15/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 16/600
Epoch 17/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 18/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 19/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 20/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 21/600
Epoch 22/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 23/600
Epoch 24/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 25/600
Epoch 26/600
Epoch 27/600
Epoch 28/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 29/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 30/600
Epoch 31/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 32/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 33/600
Epoch 34/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 35/600
Epoch 36/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 37/600
Epoch 38/600
Epoch 39/600
Epoch 40/600
Epoch 41/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 42/600
Epoch 43/600
Epoch 44/600
Epoch 45/600

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_110651-262n1vwp/files/model-best)... Done. 0.1s


Epoch 46/600
Epoch 47/600
Epoch 48/600
Epoch 49/600
Epoch 50/600
Epoch 51/600
Epoch 52/600
Epoch 53/600
Epoch 54/600
Epoch 55/600
Epoch 56/600
Epoch 57/600
Epoch 58/600
Epoch 59/600
Epoch 60/600
Epoch 61/600
Epoch 62/600
Epoch 63/600
Epoch 64/600
Epoch 65/600
Epoch 66/600
Epoch 67/600
Epoch 68/600
Epoch 69/600
Epoch 70/600
Epoch 71/600
Epoch 72/600
Epoch 73/600
Epoch 74/600
Epoch 75/600
Epoch 76/600
Epoch 77/600
Epoch 78/600
Epoch 79/600
Epoch 80/600
Epoch 81/600
Epoch 82/600
Epoch 83/600
Epoch 84/600
Epoch 85/600
Epoch 86/600
Epoch 87/600
Epoch 88/600
Epoch 89/600
Epoch 90/600
Epoch 91/600
Epoch 92/600
Epoch 93/600
Epoch 94/600
Epoch 95/600
Epoch 96/600
Epoch 97/600
Epoch 98/600
Epoch 99/600
Epoch 100/600
Epoch 101/600
Epoch 102/600
Epoch 103/600
Epoch 104/600
Epoch 105/600
Epoch 106/600
Epoch 107/600
Epoch 108/600
Epoch 109/600
Epoch 110/600
Epoch 111/600
Epoch 112/600
Epoch 113/600
Epoch 114/600
Epoch 115/600
Epoch 116/600
Epoch 117/600
Epoch 118/600
Epoch 119/600
Epoch 120/600
Epoc

VBox(children=(Label(value='21.840 MB of 21.840 MB uploaded (0.007 MB deduped)\r'), FloatProgress(value=0.9999…

0,1
accuracy,▁▂▃▄▅▅▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇█▇████████████████
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
eval/accuracy,▁
eval/f1,▁
eval/precision,▁
eval/recall,▁
loss,█▇▆▅▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_accuracy,▁▃▇▇▇█▅██▇█▆▄▅▄▆▅▄▄▄▃▃▅▆▅▄▄▄▅▄▄▅▄▅▄▄▅▆▅▄
val_loss,▃▂▁▁▁▁▁▂▂▂▂▃▃▃▄▄▅▄▅▅▅▅▆▅▅▆▆▆▇▇▇▇▇▇█▇████

0,1
accuracy,0.96607
best_epoch,44.0
best_val_loss,0.28661
epoch,599.0
eval/accuracy,0.88658
eval/f1,0.92903
eval/precision,0.8993
eval/recall,0.96079
loss,0.0865
val_accuracy,0.8653
