<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/4_5_22_3_SBERT_Embeddings_extra_XGboost_Politifact_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
%%capture
!pip install -U xgboost sentence-transformers wandb

In [8]:
from pathlib import Path
import numpy as np
import pandas as pd
import re
import json

from sentence_transformers import SentenceTransformer 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import xgboost as xgb

import wandb
from wandb.xgboost import WandbCallback
from IPython.display import clear_output

In [9]:
# Scoring functions applied to each fold's predictions; get_name() turns each
# function's name into the key under which its value is logged.
metrics = [accuracy_score, f1_score, precision_score, recall_score]

def get_name(score_func):
    """Return the part of ``score_func.__name__`` before its first underscore.

    For example a function named ``accuracy_score`` yields ``"accuracy"``; a
    name without any underscore is returned unchanged.
    """
    prefix, _, _ = score_func.__name__.partition("_")
    return prefix

# Load data

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Project folder on the mounted Google Drive; fail fast if it is not there.
base_dir = Path("/content/drive/MyDrive/Research FakeNews")
assert base_dir.exists()

In [14]:
import ast

# The tweet and retweet columns are parsed from their string form with
# ast.literal_eval while the CSV is being read.
converters = dict.fromkeys(("retweets", "tweets"), ast.literal_eval)
df = pd.read_csv(
    base_dir / "politifact.csv",
    index_col=0,
    converters=converters,
    parse_dates=["publish_date"],
)
df.head()

Unnamed: 0,id,label,url,text,top_img,keywords,authors,canonical_link,title,meta_data,publish_date,source,summary,retweets,tweets
0,politifact1212,real,http://www.youtube.com/watch?v=5zrsl8o4ZPo&fea...,ein Google-Unternehmen\n\nDienste anbieten und...,http://www.google.com/favicon.ico,[],[],,Bevor Sie zu YouTube weitergehen,"{'viewport': 'initial-scale=1, maximum-scale=5...",,http://www.youtube.com,,[],"[{'id': None, 'text': None, 'retweet_count': N..."
1,politifact6730,real,http://www.motherjones.com/politics/2012/09/se...,"During a private fundraiser earlier this year,...",https://www.motherjones.com/wp-content/uploads...,[],"['David Corn', 'Dave Gilson', 'Tim Murphy', 'B...",https://www.motherjones.com/politics/2012/09/s...,SECRET VIDEO: Romney Tells Millionaire Donors ...,"{'viewport': 'width=device-width, initial-scal...",1347904858.0,http://www.motherjones.com,,[],"[{'id': None, 'text': None, 'retweet_count': N..."
2,politifact2298,real,https://web.archive.org/web/20050322064340/htt...,"COPYRIGHT © 2005 LexisNexis, a division of Ree...",,[],[],,LexisNexis(R) Publisher,{},,https://web.archive.org,,[],"[{'id': None, 'text': None, 'retweet_count': N..."
3,politifact87,real,http://www.ilga.gov/legislation/BillStatus.asp...,×\n\nThe Illinois General Assembly offers the ...,http://www.ilga.gov/LISlogo1.ico,[],[],,Illinois General Assembly,"{'classification': 'Government', 'distribution...",,http://www.ilga.gov,,[],[]
4,politifact3180,real,http://abcnews.go.com/Politics/rand-paul-repub...,"Feb. 4, 2011  -- In an exclusive interview wi...",http://abcnews.go.com/Politics/rand-paul-repub...,[],"['Abc News', 'Jonathan Karl', 'February']",https://abcnews.go.com/Politics/rand-paul-repu...,ABC News Exclusive: Rand Paul Says Republicans...,{'description': 'In an exclusive interview wit...,,http://abcnews.go.com,,[],[]


In [15]:
df.title.isna().sum(), (df.title == "").sum()

(0, 0)

In [16]:
# Collect every title that occurs on more than one row, together with the
# rows that share it (both lists are in groupby order).
duplicated = [(title, rows) for title, rows in df.groupby("title") if len(rows) > 1]
dup_titles = [title for title, _ in duplicated]
dup_groups = [rows for _, rows in duplicated]

In [17]:
# Keep an untouched copy of the loaded frame before df is reassigned.
bckp = df.copy()

In [18]:
# Collapse rows that share a title into a single row:
#   text            -> "max": the lexicographically greatest string, so any
#                      non-empty text wins over "" (presumably the intent)
#   tweets/retweets -> "sum": concatenates the per-row lists
#   label/url       -> value from the first row of the group
agg_funcs = {"text":"max", "tweets":"sum", "retweets":"sum", "label":"first", "url":"first"}
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute: the in-place form is chained assignment, which no longer updates
# df under pandas Copy-on-Write and would leave NaN in the text column.
df["text"] = df["text"].fillna("")
df = df.groupby("title", as_index=False).agg(agg_funcs)
df.head()

Unnamed: 0,title,text,tweets,retweets,label,url
0,Actress Emma Stone ‘For the first time in his...,,"[{'id': None, 'text': None, 'retweet_count': N...",[],fake,
1,Breaking President Trump makes English the of...,,"[{'id': None, 'text': None, 'retweet_count': N...",[],fake,
2,Friendly Fire … Charlottesville Car attacker ...,,"[{'id': None, 'text': None, 'retweet_count': N...",[],fake,
3,If You Are Using This Toothpaste… Throw It Aw...,MightyLiving Blog\n\nHelpful inspiration from ...,"[{'id': None, 'text': None, 'retweet_count': N...","[888249387134066689, 882233433228967937, 88290...",fake,mightynest.com/learn/research-by-concern/dange...
4,"""Face the Nation"" transcripts, August 26, 2012...","""Face the Nation"" transcripts, August 26, 2012...","[{'id': None, 'text': None, 'retweet_count': N...",[],real,https://web.archive.org/web/20120827001956/htt...


In [19]:
# Two model inputs per article: the title alone, and the title followed by
# the article body.
titles = df["title"].tolist()
title_and_body = df["title"] + " " + df["text"]
texts = title_and_body.tolist()

In [20]:
len(texts)

894

# Compute embeddings

In [21]:
embedding_file = base_dir/"sbert_title_embeddings.npy"

# Reuse cached embeddings when present.
title_embeddings = np.load(embedding_file) if embedding_file.exists() else None

# A cache whose row count differs from the current titles was built from a
# different version of the data; using it would misalign features and labels,
# so recompute (and overwrite the cache) in that case as well.
if title_embeddings is None or len(title_embeddings) != len(titles):
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)

    title_embeddings = model.encode(titles, show_progress_bar=True)
    np.save(embedding_file, title_embeddings)

In [22]:
embedding_file = base_dir/"sbert_fulltext_embeddings.npy"

# Reuse cached embeddings when present.
text_embeddings = np.load(embedding_file) if embedding_file.exists() else None

# A cache whose row count differs from the current texts was built from a
# different version of the data; using it would misalign features and labels,
# so recompute (and overwrite the cache) in that case as well.
if text_embeddings is None or len(text_embeddings) != len(texts):
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)

    text_embeddings = model.encode(texts, show_progress_bar=True)
    np.save(embedding_file, text_embeddings)

# Extra features

TODO: feel free to suggest more features here

## Source

In [26]:
# Drop a leading web.archive.org snapshot prefix so archived pages reduce to
# the URL they captured. Raw string avoids the invalid "\d" escape, the dots
# are escaped to match literally, and both http and https are accepted.
source = df.url.str.replace(r"^(https?://)?web\.archive\.org/web/\d+/", "", regex=True)

In [27]:
# Strip an optional scheme and an optional literal "www." prefix (the dot is
# escaped so that e.g. "www2" is not treated as "www.").
source = source.str.replace(r"^(https?://)?(www\.)?", "", regex=True)

In [28]:
# Cut everything from the first listed top-level-domain suffix onwards, leaving
# only the site name. Raw string avoids the invalid "\." escape; the dot in
# "co.uk" is escaped so it matches literally.
source = source.str.replace(r"\.(com|info|org|gov|tv|us|news|me|co\.uk|net|club|co|live|edu|xyz|site|life|ru|online|tk|website|pw|one|world|mil).*$", "", regex=True)

In [29]:
len(source), len(source.unique())

(894, 519)

## Number of retweets

In [30]:
# df["tweet_id"] = df.twee

In [31]:
# Retweets are counted as distinct entries; tweets as the raw list length.
# NOTE(review): tweets are not de-duplicated, unlike retweets; confirm that
# this asymmetry is intended.
retweet_counts = df["retweets"].map(lambda ids: len(set(ids)))
tweet_counts = df["tweets"].map(len)

df["num_retweets"] = retweet_counts
df["log_num_retweets"] = np.log1p(retweet_counts.to_numpy())
df["num_tweets"] = tweet_counts
df["log_num_tweets"] = np.log1p(tweet_counts.to_numpy())

In [32]:
df.describe()

Unnamed: 0,num_retweets,log_num_retweets,num_tweets,log_num_tweets
count,894.0,894.0,894.0,894.0
mean,70.42953,0.81607,646.885906,3.430624
std,777.210478,1.855284,2746.769221,2.692058
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.693147
50%,0.0,0.0,33.0,3.526361
75%,0.0,0.0,240.75,5.487903
max,21984.0,9.998116,39919.0,10.594633


## More to come

# Training

### Title embeddings

In [33]:
# Features: title embeddings. Target: 1 where the label is "real", else 0.
X = title_embeddings
y = df["label"].eq("real").to_numpy().astype(int)

X.shape, y.shape

((894, 768), (894,))

In [34]:
# Shuffled, stratified folds with a fixed seed so the splits are repeatable;
# n_splits is left at the scikit-learn default.
skf = StratifiedKFold(shuffle=True, random_state=124)

In [35]:
def train(train_idx, test_idx, params, features=None, labels=None, num_boost_round=100):
    """Fit an XGBoost booster on one cross-validation fold and log its metrics to W&B.

    Intended to be called inside an active W&B run, since it logs through
    ``wandb.log`` and attaches ``WandbCallback`` to training.

    Args:
        train_idx: Row indices of the training split.
        test_idx: Row indices of the held-out split.
        params: Parameter dict passed to ``xgb.train``.
        features: Feature matrix to index into; defaults to the notebook-level ``X``.
        labels: Label vector aligned with ``features``; defaults to the
            notebook-level ``y``.
        num_boost_round: Number of boosting rounds (early stopping is disabled).

    Returns:
        Dict mapping each metric's name (as given by ``get_name``) to its value
        on the held-out split, so callers can aggregate across folds.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    features = X if features is None else features
    labels = y if labels is None else labels

    # training
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # Both splits are passed as evals so their losses are reported every round.
    watchlist = [(dtrain,'train'), (dtest,'eval')]
    clf = xgb.train(params, dtrain, num_boost_round=num_boost_round, early_stopping_rounds=None, evals=watchlist, callbacks=[WandbCallback()])

    # evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > 0.5).astype(int)
    eval_results = {get_name(f):f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})
    return eval_results

In [36]:
# Booster settings shared by every fold: binary logistic objective, fixed seed.
params = dict(objective='binary:logistic', seed=124)

In [38]:
GROUP = "sbert-mpnet-v2-title-xgb"

# One W&B run per fold, all filed under the same group name.
for fold_id, fold_indices in enumerate(skf.split(X, y)):
    train_idx, test_idx = fold_indices
    clear_output()  # clear the previous fold's output before starting the next
    run_name = f"{GROUP}-fold-{fold_id}"
    with wandb.init(entity="saloniteam", project="fnd", group=GROUP, name=run_name) as run:
        train(train_idx, test_idx, params)

[0]	train-logloss:0.48666	eval-logloss:0.62056
[1]	train-logloss:0.36886	eval-logloss:0.57794
[2]	train-logloss:0.28538	eval-logloss:0.53091
[3]	train-logloss:0.22380	eval-logloss:0.51001
[4]	train-logloss:0.18164	eval-logloss:0.49504
[5]	train-logloss:0.14780	eval-logloss:0.47909
[6]	train-logloss:0.12258	eval-logloss:0.46907
[7]	train-logloss:0.10261	eval-logloss:0.46918
[8]	train-logloss:0.08639	eval-logloss:0.45978
[9]	train-logloss:0.07478	eval-logloss:0.46155
[10]	train-logloss:0.06401	eval-logloss:0.46466
[11]	train-logloss:0.05624	eval-logloss:0.45765
[12]	train-logloss:0.04974	eval-logloss:0.45079
[13]	train-logloss:0.04451	eval-logloss:0.44738
[14]	train-logloss:0.04038	eval-logloss:0.44196
[15]	train-logloss:0.03674	eval-logloss:0.43572
[16]	train-logloss:0.03338	eval-logloss:0.44172
[17]	train-logloss:0.03089	eval-logloss:0.43562
[18]	train-logloss:0.02846	eval-logloss:0.43216
[19]	train-logloss:0.02626	eval-logloss:0.42958
[20]	train-logloss:0.02444	eval-logloss:0.43661
[2

VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.81461
epoch,99.0
f1,0.82902
precision,0.82474
recall,0.83333


### Text embeddings

In [39]:
# Features: title+body embeddings. Target: 1 where the label is "real", else 0.
X = text_embeddings
y = df["label"].eq("real").to_numpy().astype(int)

X.shape, y.shape

((894, 768), (894,))

In [40]:
# Shuffled, stratified folds with a fixed seed so the splits are repeatable;
# n_splits is left at the scikit-learn default.
skf = StratifiedKFold(shuffle=True, random_state=124)

In [41]:
def train(train_idx, test_idx, params, features=None, labels=None, num_boost_round=100):
    """Fit an XGBoost booster on one cross-validation fold and log its metrics to W&B.

    Intended to be called inside an active W&B run, since it logs through
    ``wandb.log`` and attaches ``WandbCallback`` to training.

    Args:
        train_idx: Row indices of the training split.
        test_idx: Row indices of the held-out split.
        params: Parameter dict passed to ``xgb.train``.
        features: Feature matrix to index into; defaults to the notebook-level ``X``.
        labels: Label vector aligned with ``features``; defaults to the
            notebook-level ``y``.
        num_boost_round: Number of boosting rounds (early stopping is disabled).

    Returns:
        Dict mapping each metric's name (as given by ``get_name``) to its value
        on the held-out split, so callers can aggregate across folds.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    features = X if features is None else features
    labels = y if labels is None else labels

    # training
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # Both splits are passed as evals so their losses are reported every round.
    watchlist = [(dtrain,'train'), (dtest,'eval')]
    clf = xgb.train(params, dtrain, num_boost_round=num_boost_round, early_stopping_rounds=None, evals=watchlist, callbacks=[WandbCallback()])

    # evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > 0.5).astype(int)
    eval_results = {get_name(f):f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})
    return eval_results

In [42]:
# Booster settings shared by every fold: binary logistic objective, fixed seed.
params = dict(objective='binary:logistic', seed=124)

In [43]:
GROUP = "sbert-mpnet-v2-fulltext-xgb"

# One W&B run per fold, all filed under the same group name.
for fold_id, fold_indices in enumerate(skf.split(X, y)):
    train_idx, test_idx = fold_indices
    clear_output()  # clear the previous fold's output before starting the next
    run_name = f"{GROUP}-fold-{fold_id}"
    with wandb.init(entity="saloniteam", project="fnd", group=GROUP, name=run_name) as run:
        train(train_idx, test_idx, params)

[0]	train-logloss:0.49802	eval-logloss:0.60112
[1]	train-logloss:0.37041	eval-logloss:0.55713
[2]	train-logloss:0.28558	eval-logloss:0.51866
[3]	train-logloss:0.22281	eval-logloss:0.48339
[4]	train-logloss:0.17741	eval-logloss:0.47409
[5]	train-logloss:0.14721	eval-logloss:0.45542
[6]	train-logloss:0.11919	eval-logloss:0.44750
[7]	train-logloss:0.09922	eval-logloss:0.42893
[8]	train-logloss:0.08325	eval-logloss:0.43532
[9]	train-logloss:0.07114	eval-logloss:0.43011
[10]	train-logloss:0.06135	eval-logloss:0.42342
[11]	train-logloss:0.05304	eval-logloss:0.41455
[12]	train-logloss:0.04676	eval-logloss:0.41704
[13]	train-logloss:0.04149	eval-logloss:0.41046
[14]	train-logloss:0.03699	eval-logloss:0.40922
[15]	train-logloss:0.03338	eval-logloss:0.40820
[16]	train-logloss:0.03049	eval-logloss:0.40937
[17]	train-logloss:0.02792	eval-logloss:0.41175
[18]	train-logloss:0.02571	eval-logloss:0.41297
[19]	train-logloss:0.02396	eval-logloss:0.41544
[20]	train-logloss:0.02230	eval-logloss:0.41041
[2

VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▂▂▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.83708
epoch,99.0
f1,0.84656
precision,0.86022
recall,0.83333


### Text embeddings + number of tweets and retweets

In [44]:
# Features: title+body embeddings with the raw (not log-transformed) retweet
# and tweet counts appended as two extra columns, in that order.
count_features = df[["num_retweets", "num_tweets"]].to_numpy()
X = np.concatenate([text_embeddings, count_features], axis=1)
y = df["label"].eq("real").to_numpy().astype(int)

X.shape, y.shape

((894, 770), (894,))

In [45]:
# Shuffled, stratified folds with a fixed seed so the splits are repeatable;
# n_splits is left at the scikit-learn default.
skf = StratifiedKFold(shuffle=True, random_state=124)

In [46]:
def train(train_idx, test_idx, params, features=None, labels=None, num_boost_round=100):
    """Fit an XGBoost booster on one cross-validation fold and log its metrics to W&B.

    Intended to be called inside an active W&B run, since it logs through
    ``wandb.log`` and attaches ``WandbCallback`` to training.

    Args:
        train_idx: Row indices of the training split.
        test_idx: Row indices of the held-out split.
        params: Parameter dict passed to ``xgb.train``.
        features: Feature matrix to index into; defaults to the notebook-level ``X``.
        labels: Label vector aligned with ``features``; defaults to the
            notebook-level ``y``.
        num_boost_round: Number of boosting rounds (early stopping is disabled).

    Returns:
        Dict mapping each metric's name (as given by ``get_name``) to its value
        on the held-out split, so callers can aggregate across folds.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    features = X if features is None else features
    labels = y if labels is None else labels

    # training
    X_train, X_test = features[train_idx], features[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # Both splits are passed as evals so their losses are reported every round.
    watchlist = [(dtrain,'train'), (dtest,'eval')]
    clf = xgb.train(params, dtrain, num_boost_round=num_boost_round, early_stopping_rounds=None, evals=watchlist, callbacks=[WandbCallback()])

    # evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > 0.5).astype(int)
    eval_results = {get_name(f):f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})
    return eval_results

In [47]:
# Booster settings shared by every fold: binary logistic objective, fixed seed.
params = dict(objective='binary:logistic', seed=124)

In [48]:
GROUP = "sbert-mpnet-v2-fulltext-tw-xgb"

# One W&B run per fold, all filed under the same group name.
for fold_id, fold_indices in enumerate(skf.split(X, y)):
    train_idx, test_idx = fold_indices
    clear_output()  # clear the previous fold's output before starting the next
    run_name = f"{GROUP}-fold-{fold_id}"
    with wandb.init(entity="saloniteam", project="fnd", group=GROUP, name=run_name) as run:
        train(train_idx, test_idx, params)

[0]	train-logloss:0.49200	eval-logloss:0.56797
[1]	train-logloss:0.36549	eval-logloss:0.49879
[2]	train-logloss:0.27880	eval-logloss:0.43887
[3]	train-logloss:0.21741	eval-logloss:0.40190
[4]	train-logloss:0.17158	eval-logloss:0.37149
[5]	train-logloss:0.13934	eval-logloss:0.35331
[6]	train-logloss:0.11363	eval-logloss:0.35283
[7]	train-logloss:0.09411	eval-logloss:0.35104
[8]	train-logloss:0.07845	eval-logloss:0.34251
[9]	train-logloss:0.06639	eval-logloss:0.33942
[10]	train-logloss:0.05765	eval-logloss:0.33059
[11]	train-logloss:0.04968	eval-logloss:0.32909
[12]	train-logloss:0.04339	eval-logloss:0.32285
[13]	train-logloss:0.03815	eval-logloss:0.32190
[14]	train-logloss:0.03417	eval-logloss:0.32555
[15]	train-logloss:0.03094	eval-logloss:0.32701
[16]	train-logloss:0.02795	eval-logloss:0.32313
[17]	train-logloss:0.02581	eval-logloss:0.31621
[18]	train-logloss:0.02377	eval-logloss:0.31849
[19]	train-logloss:0.02186	eval-logloss:0.31859
[20]	train-logloss:0.02019	eval-logloss:0.31506
[2

VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▄▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▁▂▂▂▂▂▂▂▂▂▂▂▂▂
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.88764
epoch,99.0
f1,0.89474
precision,0.90426
recall,0.88542
