<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/07_10-22_SBERT_nn_200_pol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U xgboost sentence-transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[K     |████████████████████████████████| 192.9 MB 64 kB/s 
[?25hCollecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 3.3 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.21-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 40.9 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 38.8 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 42.8 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import re
import json

from sentence_transformers import SentenceTransformer 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import xgboost as xgb

import wandb
from wandb.xgboost import WandbCallback
from IPython.display import clear_output

In [3]:
metrics = [accuracy_score, f1_score, precision_score, recall_score]

def get_name(score_func):
    return score_func.__name__.split("_")[0]

# Load data

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
base_dir = Path("/content/drive/MyDrive/ResearchFND")
assert base_dir.exists()

In [21]:
import ast

converters = {"retweets":ast.literal_eval, "tweets":ast.literal_eval}
df = pd.read_csv(base_dir/"politifact_agg.csv", converters=converters)
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,tweets,retweets,label,url,tweet_ids,num_retweets,log_num_retweets,num_tweets,log_num_tweets
0,0,Actress Emma Stone ‘For the first time in his...,,[],"[1020554564334964741, 1020817527046197248, 106...",fake,,[],2911,7.976595,0,0.0
1,1,Breaking President Trump makes English the of...,,[],[],fake,,[],0,0.0,0,0.0
2,2,Friendly Fire … Charlottesville Car attacker ...,,[],"[3265439004, 3250621593, 3253922920, 326691851...",fake,,[],24,3.218876,0,0.0
3,3,If You Are Using This Toothpaste… Throw It Aw...,MightyLiving Blog\n\nHelpful inspiration from ...,[],"[911971426571255810, 1036749614853103616, 1033...",fake,mightynest.com/learn/research-by-concern/dange...,[],2569,7.851661,0,0.0
4,4,"""Face the Nation"" transcripts, August 26, 2012...","""Face the Nation"" transcripts, August 26, 2012...",[],[],real,https://web.archive.org/web/20120827001956/htt...,[],0,0.0,0,0.0


In [22]:
df.title.isna().sum(), (df.title == "").sum()

(0, 0)

# Compute emebeddings

In [25]:
title_embedding_file = base_dir/"politifact_sbert_title_embeddings.npy"

if title_embedding_file.exists():
    title_embeddings = np.load(title_embedding_file)
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)
    titles = df.title.to_list()
    title_embeddings = model.encode(titles, show_progress_bar=True)
    np.save(title_embedding_file, title_embeddings)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

In [26]:
df.text.isna().sum()

153

In [27]:
text_embedding_file = base_dir/"politifact_sbert_fulltext_embeddings.npy"

if text_embedding_file.exists():
    text_embeddings = np.load(text_embedding_file)
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)
    df.text.fillna("", inplace=True)
    texts = df.title + "\n" + df.text
    text_embeddings = model.encode(texts, show_progress_bar=True)
    np.save(text_embedding_file, text_embeddings)

Batches:   0%|          | 0/28 [00:00<?, ?it/s]

# Extra features

TODO: feel free to suggest more features here

## Source

In [14]:
source = df.news_url.str.replace("^(https://)?web.archive.org/web/\d+/", "", regex=True)

AttributeError: ignored

In [None]:
source = source.str.replace("^(https?://)?(www.)?", "", regex=True)

In [None]:
source = source.str.replace("\.(com|info|org|gov|tv|us|news|me|co.uk|net|club|co|live|edu|xyz|site|life|ru|online|tk|website|pw|one|world|mil).*$", "", regex=True)

In [None]:
len(source), len(source.unique())

## Number of retweets

In [28]:
df["num_retweets"] = df.retweets.map(lambda x: len(set(x)))
df["log_num_retweets"] = np.log1p(df.num_retweets.to_numpy())
df["num_tweets"] = df.tweets.map(lambda x: len(set([t["id"] for t in x])))
df["log_num_tweets"] = np.log1p(df.num_tweets.to_numpy())

In [29]:
df.describe()

Unnamed: 0.1,Unnamed: 0,num_retweets,log_num_retweets,num_tweets,log_num_tweets
count,894.0,894.0,894.0,894.0,894.0
mean,446.5,646.884787,3.475319,70.42953,0.742621
std,258.219868,2461.779445,2.758056,778.521306,1.815463
min,0.0,0.0,0.0,0.0,0.0
25%,223.25,1.0,0.693147,0.0,0.0
50%,446.5,31.0,3.465736,0.0,0.0
75%,669.75,305.75,5.726032,0.0,0.0
max,893.0,29060.0,10.277152,21984.0,9.998116


# Training

In [30]:
WANDB_ENTITY = 'saloniteam'
WANDB_PROJECT = 'nofolds'

### Title embeddings

In [31]:
X = title_embeddings
y = (df.label=="real").to_numpy().astype(int)

X.shape, y.shape

((894, 768), (894,))

In [32]:
skf = StratifiedKFold(shuffle=True, random_state=124)

In [33]:
def train(train_idx, test_idx, params):

    # training
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(dtrain,'train'), (dtest,'eval')]
    clf = xgb.train(params, dtrain, num_boost_round=200, early_stopping_rounds=None, evals=watchlist, callbacks=[WandbCallback()])
    #evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > 0.5).astype(int)
    eval_results = {get_name(f):f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})

In [34]:
params = {
    "objective":'binary:logistic',
    "seed":124
}

In [35]:
!wandb login --relogin

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [36]:
GROUP = "politifact-sbert-mpnet-v2-title-xgb"

for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    clear_output()
    with wandb.init(entity=WANDB_ENTITY, project=WANDB_PROJECT, group=GROUP, name=f"{GROUP}_0") as run:
        train(train_idx, test_idx, params)
    break

[34m[1mwandb[0m: Currently logged in as: [33msaloni[0m ([33msaloniteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


[0]	train-logloss:0.50337	eval-logloss:0.63257
[1]	train-logloss:0.37680	eval-logloss:0.54844
[2]	train-logloss:0.29378	eval-logloss:0.50917
[3]	train-logloss:0.23477	eval-logloss:0.46907
[4]	train-logloss:0.18961	eval-logloss:0.45830
[5]	train-logloss:0.15456	eval-logloss:0.45304
[6]	train-logloss:0.12921	eval-logloss:0.43815
[7]	train-logloss:0.10886	eval-logloss:0.41948
[8]	train-logloss:0.09165	eval-logloss:0.41600
[9]	train-logloss:0.07905	eval-logloss:0.40340
[10]	train-logloss:0.06908	eval-logloss:0.40021
[11]	train-logloss:0.06044	eval-logloss:0.39733
[12]	train-logloss:0.05326	eval-logloss:0.38680
[13]	train-logloss:0.04762	eval-logloss:0.37833
[14]	train-logloss:0.04292	eval-logloss:0.37329
[15]	train-logloss:0.03869	eval-logloss:0.36210
[16]	train-logloss:0.03524	eval-logloss:0.35516
[17]	train-logloss:0.03240	eval-logloss:0.35473
[18]	train-logloss:0.02992	eval-logloss:0.35311
[19]	train-logloss:0.02762	eval-logloss:0.35236
[20]	train-logloss:0.02558	eval-logloss:0.35520
[2

VBox(children=(Label(value='0.022 MB of 0.022 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁
precision,▁
recall,▁
train-logloss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.82682
epoch,199.0
f1,0.83598
precision,0.84946
recall,0.82292


### Text embeddings

In [37]:
X = text_embeddings
y = (df.label=="real").to_numpy().astype(int)

X.shape, y.shape

((894, 768), (894,))

In [38]:
skf = StratifiedKFold(shuffle=True, random_state=124)

In [39]:
def train(train_idx, test_idx, params):

    # training
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(dtrain,'train'), (dtest,'eval')]
    clf = xgb.train(params, dtrain, num_boost_round=200, early_stopping_rounds=None, evals=watchlist, callbacks=[WandbCallback()])
    #evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > 0.5).astype(int)
    eval_results = {get_name(f):f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})

In [40]:
params = {
    "objective":'binary:logistic',
    "seed":124
}

In [41]:
GROUP = "politifact-sbert-mpnet-v2-fulltext-xgb"
for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    clear_output()
    with wandb.init(entity=WANDB_ENTITY, project=WANDB_PROJECT, group=GROUP, name=f"{GROUP}_0") as run:
        train(train_idx, test_idx, params)
    break

[0]	train-logloss:0.50447	eval-logloss:0.61130
[1]	train-logloss:0.38697	eval-logloss:0.55114
[2]	train-logloss:0.29647	eval-logloss:0.49479
[3]	train-logloss:0.23169	eval-logloss:0.45764
[4]	train-logloss:0.18565	eval-logloss:0.43254
[5]	train-logloss:0.15059	eval-logloss:0.42798
[6]	train-logloss:0.12388	eval-logloss:0.41043
[7]	train-logloss:0.10315	eval-logloss:0.39078
[8]	train-logloss:0.08684	eval-logloss:0.37531
[9]	train-logloss:0.07429	eval-logloss:0.36711
[10]	train-logloss:0.06422	eval-logloss:0.35258
[11]	train-logloss:0.05635	eval-logloss:0.35034
[12]	train-logloss:0.04973	eval-logloss:0.33461
[13]	train-logloss:0.04409	eval-logloss:0.33213
[14]	train-logloss:0.03943	eval-logloss:0.32611
[15]	train-logloss:0.03584	eval-logloss:0.32838
[16]	train-logloss:0.03268	eval-logloss:0.32420
[17]	train-logloss:0.02992	eval-logloss:0.31730
[18]	train-logloss:0.02768	eval-logloss:0.31323
[19]	train-logloss:0.02552	eval-logloss:0.30501
[20]	train-logloss:0.02367	eval-logloss:0.30243
[2

VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.992819…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁
precision,▁
recall,▁
train-logloss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.87709
epoch,199.0
f1,0.87778
precision,0.94048
recall,0.82292


### + num tweets and retweets

In [42]:
X = np.concatenate([text_embeddings, df.num_retweets.to_numpy()[..., None], df.num_tweets.to_numpy()[..., None]], axis=1)
y = (df.label=="real").to_numpy().astype(int)

X.shape, y.shape

((894, 770), (894,))

In [43]:
skf = StratifiedKFold(shuffle=True, random_state=124)

In [44]:
def train(train_idx, test_idx, params):

    # training
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    watchlist = [(dtrain,'train'), (dtest,'eval')]
    clf = xgb.train(params, dtrain, num_boost_round=200, early_stopping_rounds=None, evals=watchlist, callbacks=[WandbCallback()])
    #evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > 0.5).astype(int)
    eval_results = {get_name(f):f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})

In [45]:
params = {
    "objective":'binary:logistic',
    "seed":124
}

In [46]:
GROUP = "politifact-sbert-mpnet-v2-fulltext-tw-xgb"
for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    clear_output()
    with wandb.init(entity=WANDB_ENTITY, project=WANDB_PROJECT, group=GROUP, name=f"{GROUP}-0") as run:
        train(train_idx, test_idx, params)
    break

[0]	train-logloss:0.50119	eval-logloss:0.57833
[1]	train-logloss:0.38220	eval-logloss:0.49047
[2]	train-logloss:0.29839	eval-logloss:0.43014
[3]	train-logloss:0.23620	eval-logloss:0.40822
[4]	train-logloss:0.18775	eval-logloss:0.36637
[5]	train-logloss:0.15192	eval-logloss:0.34407
[6]	train-logloss:0.12583	eval-logloss:0.32154
[7]	train-logloss:0.10468	eval-logloss:0.30547
[8]	train-logloss:0.08789	eval-logloss:0.28160
[9]	train-logloss:0.07481	eval-logloss:0.27993
[10]	train-logloss:0.06428	eval-logloss:0.27590
[11]	train-logloss:0.05537	eval-logloss:0.26853
[12]	train-logloss:0.04906	eval-logloss:0.25857
[13]	train-logloss:0.04339	eval-logloss:0.25371
[14]	train-logloss:0.03918	eval-logloss:0.24492
[15]	train-logloss:0.03538	eval-logloss:0.24110
[16]	train-logloss:0.03211	eval-logloss:0.23702
[17]	train-logloss:0.02913	eval-logloss:0.23632
[18]	train-logloss:0.02671	eval-logloss:0.23376
[19]	train-logloss:0.02471	eval-logloss:0.22946
[20]	train-logloss:0.02261	eval-logloss:0.23022
[2

VBox(children=(Label(value='0.018 MB of 0.018 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁
precision,▁
recall,▁
train-logloss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.90503
epoch,199.0
f1,0.90811
precision,0.94382
recall,0.875
