<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/10_19_22_SBERT%2B_BigGraph_Embeddings_NN_gossipcop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#%%capture
!pip install -U sentence-transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.5 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.13.4-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 57.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 50.6 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 55.9 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 72.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manyli

The notebook for Gossipcop. (Similar one present for Politifact is already saved in the Drive and on Github) 

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import re
import json

from sentence_transformers import SentenceTransformer 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import wandb
from wandb.keras import WandbCallback
from IPython.display import clear_output

In [3]:
metrics = [accuracy_score, f1_score, precision_score, recall_score]

def get_name(score_func):
    return 'eval/' + score_func.__name__.split("_")[0]

# Load data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

base_dir = Path("/content/drive/MyDrive/ResearchFND")
assert base_dir.exists()

Mounted at /content/drive


In [5]:
dataset_id = 'gossipcop'

In [6]:
import ast

converters = {"retweets":ast.literal_eval, "tweets":ast.literal_eval}
df = pd.read_csv(base_dir/f"{dataset_id}_agg.csv", converters=converters)
df.head()

Unnamed: 0,title,text,tweets,retweets,label,url,num_retweets,log_num_retweets,num_tweets,log_num_tweets
0,Kendall Kylie Jenner Jenner NOT Upset Up...,,[],"[995423424741888001, 995461685166202880, 99987...",fake,,3,1.386294,0,0.0
1,Kim Kardashian Dethroned Dethroned By Khlo...,,[],"[848843565027516416, 849030801970868224, 84884...",fake,,3,1.386294,0,0.0
2,Kim Kardashian Did NOT Hot Staffer Hot Sta...,,[],"[940685393112064001, 977921622672920576, 94031...",fake,,8,2.197225,0,0.0
3,The Voice The Voice Team NOT Surprised Sur...,,[],[],fake,,0,0.0,0,0.0
4,Drake NOT Angelina Jolie s Toy Boy Toy Boy ...,,"[{'id': 948630026496323585, 'text': 'Drake NOT...","[948022124626808832, 948630026496323585, 94801...",fake,,18,2.944439,7,2.079442


# Using the embeddings for title and text from previous experiments

In [7]:
title_embedding_file = base_dir/f"{dataset_id}_sbert_title_embeddings.npy"

if title_embedding_file.exists():
    title_embeddings = np.load(title_embedding_file)
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)
    titles = df.title.tolist()
    title_embeddings = model.encode(titles, show_progress_bar=True)
    np.save(title_embedding_file, title_embeddings)

In [8]:
text_embedding_file = base_dir/f"{dataset_id}_sbert_fulltext_embeddings.npy"

if text_embedding_file.exists():
    text_embeddings = np.load(text_embedding_file)
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)
    texts = (df.title + "\n" + df.text).tolist()
    text_embeddings = model.encode(texts, show_progress_bar=True)
    np.save(text_embedding_file, text_embeddings)

# Training - Saving the machine learning model specifications

In [9]:
import tensorflow as tf

In [10]:
def get_model(d_h=64, p_in:float=0.2, p_out:float=0.2):
    model = tf.keras.Sequential([
        tf.keras.layers.Dropout(p_in), # input dropout
        tf.keras.layers.Dense(d_h, activation='relu'),
        tf.keras.layers.Dropout(p_out),  
        tf.keras.layers.Dense(1)
    ])

    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
                  optimizer='adam',metrics=['accuracy'])
    return model

In [11]:
def train(fold_id, train_idx, test_idx, params):

    # setup
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    model = get_model()
    # training
    ckpt_filepath = f'ckpt/fold_{fold_id}'
    save_model_cb = tf.keras.callbacks.ModelCheckpoint(
        filepath=ckpt_filepath,
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True)
    history = model.fit(
        X_train, y_train, validation_data = (X_test, y_test), 
        batch_size = params['bs'] , epochs=params['epochs'],
        callbacks=[WandbCallback(), save_model_cb]
    )
    #evaluation
    model.load_weights(ckpt_filepath)
    logits = model.predict(X_test).squeeze()
    y_pred = (logits > 0.).astype(int)
    eval_results = {get_name(f):f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})
    return history

### PyTorch_BigGraph embeddings

In [12]:
edim = 128
bg_embeddings = np.load(base_dir/f'{dataset_id}_pt_biggraph_article_embeddings_{edim}.npy')
idx = np.load(base_dir/f"{dataset_id}_pt_biggraph_article_idx_{edim}.npy")

In [13]:
graph_embeddings = np.zeros((text_embeddings.shape[0], edim))
graph_embeddings[idx] = bg_embeddings

In [14]:
params = {
    "epochs":200,
    "bs":128,
    # 'lr':1e-3
}

In [15]:
X = np.concatenate([
    text_embeddings, 
    df.log_num_retweets.to_numpy()[..., None], 
    df.log_num_tweets.to_numpy()[..., None],
    graph_embeddings], axis=1)
y = (df.label=="real").to_numpy().astype(int)

X.shape, y.shape

((19968, 898), (19968,))

In [16]:
skf = StratifiedKFold(shuffle=True, random_state=124)

Creating a group and logging all train, test data in wandb for better tracking and visualisation.

In [17]:
WANDB_ENTITY = 'saloniteam'
WANDB_PROJECTS = 'nofolds'

GROUP = f"{dataset_id}-fulltext-sbert+tw+ptbg128-mlp"

for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    clear_output()
    with wandb.init(entity=WANDB_ENTITY, project=WANDB_ENTITY, group=GROUP, name=f"{GROUP}-{fold_id}") as run:
        train(fold_id, train_idx, test_idx, params)
    break

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Epoch 1/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 2/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 3/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 4/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 5/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 6/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 7/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 8/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 9/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 10/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 11/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 12/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 13/200
Epoch 14/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 15/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 16/200
Epoch 17/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 18/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 19/200
Epoch 20/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 21/200
Epoch 22/200
Epoch 23/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 24/200
Epoch 25/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 26/200
Epoch 27/200
Epoch 28/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 29/200
Epoch 30/200
Epoch 31/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 40/200
Epoch 41/200
Epoch 42/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 43/200

[34m[1mwandb[0m: Adding directory to artifact (/content/wandb/run-20221019_105142-3t1wmuk7/files/model-best)... Done. 0.1s


Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 

VBox(children=(Label(value='18.797 MB of 18.797 MB uploaded (0.007 MB deduped)\r'), FloatProgress(value=0.9999…

0,1
accuracy,▁▂▃▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█▇█████
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval/accuracy,▁
eval/f1,▁
eval/precision,▁
eval/recall,▁
loss,█▇▆▅▅▅▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁
val_accuracy,▁▄▅▆▇▇▇█▇█▆▇▇█▇█▆▇▇▇▆▇▇▇▇▇▆▆▇▇█▇▇▇▇▆▇▆▇▇
val_loss,█▅▃▃▂▂▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▃▃▃▃▄▄▄▄▅▄▄▅▅▅▅▅

0,1
accuracy,0.95549
best_epoch,42.0
best_val_loss,0.28714
epoch,199.0
eval/accuracy,0.88433
eval/f1,0.92754
eval/precision,0.89878
eval/recall,0.9582
loss,0.11681
val_accuracy,0.87481
