<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/SBERT_%2B_BigGraph_Embeddings_XGboost_gossipcop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#%%capture
!pip install -U xgboost sentence-transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost
  Downloading xgboost-1.6.2-py3-none-manylinux2014_x86_64.whl (255.9 MB)
[K     |████████████████████████████████| 255.9 MB 44 kB/s 
[?25hCollecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.2 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.13.4-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 45.1 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 55.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 60.7 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[

The notebook for SBERt+Biggraph embeddings, for Gossipcop dataset. Logged results on Wandb (saloniteam project)

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import re
import json

from sentence_transformers import SentenceTransformer 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import xgboost as xgb

import wandb
from wandb.xgboost import WandbCallback
from IPython.display import clear_output

In [3]:
metrics = [accuracy_score, f1_score, precision_score, recall_score]

def get_name(score_func):
    return score_func.__name__.split("_")[0]

# Load data

In [4]:
from google.colab import drive
drive.mount('/content/drive')
base_dir = Path("/content/drive/MyDrive/ResearchFND")
assert base_dir.exists()

Mounted at /content/drive


In [5]:
dataset_id = 'gossipcop'

In [6]:
import ast

converters = {"retweets":ast.literal_eval, "tweets":ast.literal_eval}
df = pd.read_csv(base_dir/f"{dataset_id}_agg.csv", converters=converters)
df.head()

Unnamed: 0,title,text,tweets,retweets,label,url,num_retweets,log_num_retweets,num_tweets,log_num_tweets
0,Kendall Kylie Jenner Jenner NOT Upset Up...,,[],"[995423424741888001, 995461685166202880, 99987...",fake,,3,1.386294,0,0.0
1,Kim Kardashian Dethroned Dethroned By Khlo...,,[],"[848843565027516416, 849030801970868224, 84884...",fake,,3,1.386294,0,0.0
2,Kim Kardashian Did NOT Hot Staffer Hot Sta...,,[],"[940685393112064001, 977921622672920576, 94031...",fake,,8,2.197225,0,0.0
3,The Voice The Voice Team NOT Surprised Sur...,,[],[],fake,,0,0.0,0,0.0
4,Drake NOT Angelina Jolie s Toy Boy Toy Boy ...,,"[{'id': 948630026496323585, 'text': 'Drake NOT...","[948022124626808832, 948630026496323585, 94801...",fake,,18,2.944439,7,2.079442


In [7]:
df.title.isna().sum(), (df.title == "").sum()

(0, 0)

In [8]:
titles = df.title.tolist()
texts = (df.title + " " + df.text).tolist()

In [9]:
len(texts)

19968

# Prepare emebeddings

In [10]:
title_embedding_file = base_dir/f"{dataset_id}_sbert_title_embeddings.npy"

if title_embedding_file.exists():
    title_embeddings = np.load(title_embedding_file)
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)
    titles = df.title.tolist()
    title_embeddings = model.encode(titles, show_progress_bar=True)
    np.save(title_embedding_file, title_embeddings)

In [11]:
text_embedding_file = base_dir/f"{dataset_id}_sbert_fulltext_embeddings.npy"

if text_embedding_file.exists():
    text_embeddings = np.load(text_embedding_file)
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)
    texts = (df.title + "\n" + df.text).tolist()
    text_embeddings = model.encode(texts, show_progress_bar=True)
    np.save(text_embedding_file, text_embeddings)

In [12]:
edim = 128
bg_embeddings = np.load(base_dir/f'{dataset_id}_pt_biggraph_article_embeddings_{edim}.npy')
idx = np.load(base_dir/f"{dataset_id}_pt_biggraph_article_idx_{edim}.npy")

In [13]:
graph_embeddings = np.zeros((text_embeddings.shape[0], edim))
graph_embeddings[idx] = bg_embeddings

In [14]:
X = np.concatenate([
    text_embeddings, 
    df.num_retweets.to_numpy()[..., None], 
    df.num_tweets.to_numpy()[..., None],
    graph_embeddings], axis=1)
y = (df.label=="real").to_numpy().astype(int)

X.shape, y.shape

((19968, 898), (19968,))

In [15]:
skf = StratifiedKFold(shuffle=True, random_state=124)

# Traing XGB

In [16]:
def train(train_idx, test_idx, params):

    # training
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(dtrain,'train'), (dtest,'eval')]
    clf = xgb.train(params, dtrain, num_boost_round=params['num_boost_round'], early_stopping_rounds=None, evals=watchlist, callbacks=[WandbCallback()])
    #evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > 0.5).astype(int)
    eval_results = {get_name(f):f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})

In [17]:
params = {
    "objective":'binary:logistic',
    "seed":124,
    "num_boost_round":200
}

In [18]:
WANDB_ENTITY = 'saloniteam'
WANDB_PROJECT = 'nofolds'
GROUP = "gossipcop-sbert-mpnet-v2-biggraph128-xgb"

for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    clear_output()
    with wandb.init(entity=WANDB_ENTITY, project=WANDB_PROJECT, group=GROUP, name=f"{GROUP}-fold-{fold_id}", tags=['xgb', 'sbert', 'biggraph']) as run:
        train(train_idx, test_idx, params)
    break

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Parameters: { "num_boost_round" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-logloss:0.53893	eval-logloss:0.55267
[1]	train-logloss:0.44819	eval-logloss:0.47325
[2]	train-logloss:0.38859	eval-logloss:0.42444
[3]	train-logloss:0.34590	eval-logloss:0.39308
[4]	train-logloss:0.31374	eval-logloss:0.36986
[5]	train-logloss:0.28707	eval-logloss:0.35301
[6]	train-logloss:0.26881	eval-logloss:0.34220
[7]	train-logloss:0.25077	eval-logloss:0.33604
[8]	train-logloss:0.23792	eval-logloss:0.33084
[9]	train-logloss:0.22448	eval-logloss:0.32632
[10]	train-logloss:0.21605	eval-logloss:0.32081
[11]	train-logloss:0.20413	eval-logloss:0.31601
[12]	train-logloss:0.19312	eval-logloss:0.31348
[13]	train-logloss:0.18783	eval-logloss:0.31075
[14]	train-loglo

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▄
f1,▁
precision,▁
recall,▁
train-logloss,█▅▄▄▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.89234
epoch,199.0
f1,0.93247
precision,0.90463
recall,0.96209
