<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/4_5_22_1_Sbert_embeddings_extra_xgboost_politifact.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install -U xgboost sentence-transformers wandb

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import re

from sentence_transformers import SentenceTransformer 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import xgboost as xgb

import wandb
from wandb.xgboost import WandbCallback

In [None]:
# Scoring functions applied to each fold's predictions; each is called as f(y_true=..., y_pred=...).
metrics = [accuracy_score, f1_score, precision_score, recall_score]

def get_name(score_func):
    """Return the short name of a scoring function.

    The short name is the part of ``score_func.__name__`` that precedes the
    first underscore, e.g. ``accuracy_score`` -> ``"accuracy"``. A name
    without an underscore is returned unchanged.
    """
    short_name, _, _ = score_func.__name__.partition("_")
    return short_name


Load data

In [None]:
# Mount Google Drive at /content/drive so the data files below can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder the CSV files are read from.
base_dir = Path("/content/drive/MyDrive/")
# Fail fast if the folder is not reachable (e.g. Drive was not mounted).
assert base_dir.exists()

In [None]:
# Read both CSVs and attach the class label: 0 = fake, 1 = real.
df_fake = pd.read_csv(base_dir / "politifact_fake.csv").assign(label=0)
df_real = pd.read_csv(base_dir / "politifact_real.csv").assign(label=1)

# Non-null count per column of the fake-news frame.
df_fake.count()

id           432
news_url     428
title        432
tweet_ids    392
label        432
dtype: int64

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# pd.concat with ignore_index=True stacks fake rows first, then real rows,
# and renumbers the index 0..n-1 (same result as append + reset_index(drop=True)).
df = pd.concat([df_fake, df_real], ignore_index=True)

In [None]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,politifact15014,speedtalk.com/forum/viewtopic.php?t=51650,BREAKING: First NFL Team Declares Bankruptcy O...,937349434668498944\t937379378006282240\t937380...,0
1,politifact15156,politics2020.info/index.php/2018/03/13/court-o...,Court Orders Obama To Pay $400 Million In Rest...,972666281441878016\t972678396575559680\t972827...,0
2,politifact14745,www.nscdscamps.org/blog/category/parenting/467...,UPDATE: Second Roy Moore Accuser Works For Mic...,929405740732870656\t929439450400264192\t929439...,0
3,politifact14355,https://howafrica.com/oscar-pistorius-attempts...,Oscar Pistorius Attempts To Commit Suicide,886941526458347521\t887011300278194176\t887023...,0
4,politifact15371,http://washingtonsources.org/trump-votes-for-d...,Trump Votes For Death Penalty For Being Gay,915205698212040704\t915242076681506816\t915249...,0


Compute embeddings

In [None]:
embedding_file = Path("/content/drive/MyDrive/sbert_embeddings.npy")
model_id = "all-mpnet-base-v2"

# Load the cached embeddings when the file exists.
embeddings = np.load(embedding_file) if embedding_file.exists() else None

# The cache is only usable when it holds exactly one row per title in `df`.
# A stale file (e.g. written for a different dataset) would otherwise be paired
# with the wrong labels, so it is recomputed and overwritten.
if embeddings is None or embeddings.shape[0] != len(df):
    model = SentenceTransformer(model_id)

    sentences = df.title.to_list()
    embeddings = model.encode(sentences, show_progress_bar=True)
    np.save(embedding_file, embeddings)

Extra features: add more features here if everything looks good.

Source

In [None]:
# Strip a web.archive.org prefix (with an http or https scheme, or none) so that
# archived pages are attributed to the original site.
# Raw string: "\d" inside a normal string literal is an invalid escape sequence;
# dots are escaped so they only match a literal ".".
source = df.news_url.str.replace(r"^(https?://)?web\.archive\.org/web/\d+/", "", regex=True)

In [None]:
# Drop the URL scheme and a leading "www." (dot escaped so that only a literal
# dot is matched, not any character after "www").
source = source.str.replace(r"^(https?://)?(www\.)?", "", regex=True)

In [None]:
# Cut everything from the first known top-level domain onwards, leaving the site name.
# The negative lookahead requires the TLD to end its label, so ".me" no longer
# matches inside ".mediamass" and ".co" no longer matches inside ".community".
# Raw string with escaped dots ("co\.uk") so "." only matches a literal dot.
source = source.str.replace(
    r"\.(com|info|org|gov|tv|us|news|me|co\.uk|net|club|co|live|edu|xyz|site|life|ru|online|tk|website|pw|one|world|mil)(?![A-Za-z0-9-]).*$",
    "",
    regex=True,
)

In [None]:
# (number of rows, number of distinct source values; a missing value counts as one).
len(source), source.nunique(dropna=False)


(1056, 526)

Number of retweets

In [None]:
def _count_tab_separated(ids):
    """Count the tab-separated entries in ``ids``; an empty value counts as 0."""
    return len(str(ids).split("\t")) if len(ids) else 0


# Missing tweet_ids become "" so they are counted as zero.
tweet_ids_filled = df.tweet_ids.fillna("")
df["num_retweets"] = tweet_ids_filled.map(_count_tab_separated)
# log(n + 1) keeps zero counts finite.
df["log_num_retweets"] = np.log(df.num_retweets.to_numpy() + 1)

In [None]:
# Summary statistics of the frame after adding the retweet features.
df.describe()

Unnamed: 0,label,num_retweets,log_num_retweets
count,1056.0,1056.0,1056.0
mean,0.590909,552.57197,3.363832
std,0.491899,2126.317823,2.704599
min,0.0,0.0,0.0
25%,0.0,1.0,0.693147
50%,1.0,30.0,3.433987
75%,1.0,246.25,5.510398
max,1.0,29060.0,10.277152


More to come here..

Training

In [None]:
# Feature matrix: the embedding columns followed by the log retweet count as the last column.
log_retweets_col = df.log_num_retweets.to_numpy().reshape(-1, 1)
X = np.hstack([embeddings, log_retweets_col])
y = df.label.to_numpy()

X.shape, y.shape

((1056, 769), (1056,))

In [None]:
# Stratified, shuffled folds with a fixed seed; n_splits=5 spells out the scikit-learn default.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=124)

In [None]:
def train(train_idx, test_idx, params, num_boost_round=100, threshold=0.5):
    """Train an XGBoost model on one fold, log its metrics to W&B and return them.

    Reads the notebook-level ``X`` and ``y`` arrays and the ``metrics`` list,
    and logs to whichever wandb run is active when it is called.

    Args:
        train_idx: Row indices of ``X``/``y`` used for fitting.
        test_idx: Row indices of ``X``/``y`` used for evaluation.
        params: Parameter dict passed to ``xgb.train``.
        num_boost_round: Number of boosting rounds (default 100).
        threshold: Predicted values strictly above this are labelled 1 (default 0.5).

    Returns:
        dict mapping each short metric name (see ``get_name``) to its value on
        the test rows, so callers can aggregate results across folds.
    """
    # training
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # Both sets are watched so train and eval metrics are reported every round.
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    clf = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=None,
        evals=watchlist,
        callbacks=[WandbCallback()],
    )

    # evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > threshold).astype(int)
    eval_results = {get_name(f): f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat": wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})
    return eval_results

In [None]:
# Parameters handed to xgb.train: logistic objective for binary labels, fixed seed.
params = dict(objective="binary:logistic", seed=124)

In [None]:
# One W&B run per fold; all runs share a group so they can be compared together.
for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    run_name = f"sbert-mpnet-v2+xgb-fold-{fold_id}"
    with wandb.init(
        entity="saloniteam",
        project="fnd",
        group="sbert-mpnet-v2+lnt-xgb",
        name=run_name,
    ) as run:
        train(train_idx, test_idx, params)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[0]	train-logloss:0.49849	eval-logloss:0.58610
[1]	train-logloss:0.37901	eval-logloss:0.50565
[2]	train-logloss:0.29266	eval-logloss:0.44708
[3]	train-logloss:0.23140	eval-logloss:0.41384
[4]	train-logloss:0.18511	eval-logloss:0.38660
[5]	train-logloss:0.15363	eval-logloss:0.36962
[6]	train-logloss:0.12667	eval-logloss:0.34499
[7]	train-logloss:0.10608	eval-logloss:0.33883
[8]	train-logloss:0.09053	eval-logloss:0.32773
[9]	train-logloss:0.07714	eval-logloss:0.32147
[10]	train-logloss:0.06679	eval-logloss:0.31234
[11]	train-logloss:0.05845	eval-logloss:0.30866
[12]	train-logloss:0.05152	eval-logloss:0.30468
[13]	train-logloss:0.04586	eval-logloss:0.29732
[14]	train-logloss:0.04089	eval-logloss:0.29311
[15]	train-logloss:0.03713	eval-logloss:0.28604
[16]	train-logloss:0.03365	eval-logloss:0.28250
[17]	train-logloss:0.03086	eval-logloss:0.27879
[18]	train-logloss:0.02857	eval-logloss:0.27749
[19]	train-logloss:0.02652	eval-logloss:0.27341
[20]	train-logloss:0.02481	eval-logloss:0.27075
[2

VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.992643…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.89623
epoch,99.0
f1,0.91406
precision,0.89313
recall,0.936


[34m[1mwandb[0m: Currently logged in as: [33msaloniteam[0m (use `wandb login --relogin` to force relogin)


[0]	train-logloss:0.50387	eval-logloss:0.59078
[1]	train-logloss:0.37930	eval-logloss:0.50672
[2]	train-logloss:0.29503	eval-logloss:0.46410
[3]	train-logloss:0.23637	eval-logloss:0.42757
[4]	train-logloss:0.18915	eval-logloss:0.39747
[5]	train-logloss:0.15388	eval-logloss:0.37551
[6]	train-logloss:0.12860	eval-logloss:0.36201
[7]	train-logloss:0.10971	eval-logloss:0.34939
[8]	train-logloss:0.09309	eval-logloss:0.33636
[9]	train-logloss:0.07964	eval-logloss:0.32171
[10]	train-logloss:0.06920	eval-logloss:0.31029
[11]	train-logloss:0.06062	eval-logloss:0.30451
[12]	train-logloss:0.05357	eval-logloss:0.30012
[13]	train-logloss:0.04794	eval-logloss:0.30056
[14]	train-logloss:0.04266	eval-logloss:0.29977
[15]	train-logloss:0.03883	eval-logloss:0.29412
[16]	train-logloss:0.03549	eval-logloss:0.29326
[17]	train-logloss:0.03260	eval-logloss:0.29213
[18]	train-logloss:0.03022	eval-logloss:0.29230
[19]	train-logloss:0.02813	eval-logloss:0.28887
[20]	train-logloss:0.02636	eval-logloss:0.28893
[2

VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.992875…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.88152
epoch,99.0
f1,0.90347
precision,0.86667
recall,0.94355


[0]	train-logloss:0.49856	eval-logloss:0.58805
[1]	train-logloss:0.38498	eval-logloss:0.51283
[2]	train-logloss:0.29712	eval-logloss:0.47288
[3]	train-logloss:0.23533	eval-logloss:0.42939
[4]	train-logloss:0.18514	eval-logloss:0.41292
[5]	train-logloss:0.15231	eval-logloss:0.38415
[6]	train-logloss:0.12431	eval-logloss:0.36477
[7]	train-logloss:0.10405	eval-logloss:0.34777
[8]	train-logloss:0.08856	eval-logloss:0.33520
[9]	train-logloss:0.07586	eval-logloss:0.32718
[10]	train-logloss:0.06628	eval-logloss:0.32368
[11]	train-logloss:0.05771	eval-logloss:0.31283
[12]	train-logloss:0.05134	eval-logloss:0.30400
[13]	train-logloss:0.04597	eval-logloss:0.29380
[14]	train-logloss:0.04122	eval-logloss:0.29468
[15]	train-logloss:0.03714	eval-logloss:0.28512
[16]	train-logloss:0.03398	eval-logloss:0.28407
[17]	train-logloss:0.03116	eval-logloss:0.28228
[18]	train-logloss:0.02861	eval-logloss:0.28015
[19]	train-logloss:0.02652	eval-logloss:0.27806
[20]	train-logloss:0.02476	eval-logloss:0.27954
[2

VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▆▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.90047
epoch,99.0
f1,0.91566
precision,0.91935
recall,0.912


[0]	train-logloss:0.49186	eval-logloss:0.57930
[1]	train-logloss:0.36852	eval-logloss:0.51609
[2]	train-logloss:0.28510	eval-logloss:0.46525
[3]	train-logloss:0.22635	eval-logloss:0.43531
[4]	train-logloss:0.18224	eval-logloss:0.40688
[5]	train-logloss:0.14910	eval-logloss:0.39505
[6]	train-logloss:0.12409	eval-logloss:0.39153
[7]	train-logloss:0.10269	eval-logloss:0.37595
[8]	train-logloss:0.08609	eval-logloss:0.37879
[9]	train-logloss:0.07327	eval-logloss:0.36113
[10]	train-logloss:0.06339	eval-logloss:0.35517
[11]	train-logloss:0.05553	eval-logloss:0.35670
[12]	train-logloss:0.04906	eval-logloss:0.35280
[13]	train-logloss:0.04363	eval-logloss:0.34793
[14]	train-logloss:0.03904	eval-logloss:0.34494
[15]	train-logloss:0.03527	eval-logloss:0.34186
[16]	train-logloss:0.03224	eval-logloss:0.33630
[17]	train-logloss:0.02958	eval-logloss:0.33333
[18]	train-logloss:0.02727	eval-logloss:0.33390
[19]	train-logloss:0.02539	eval-logloss:0.33458
[20]	train-logloss:0.02361	eval-logloss:0.33476
[2

VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.992710…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.87678
epoch,99.0
f1,0.89344
precision,0.91597
recall,0.872


[0]	train-logloss:0.50642	eval-logloss:0.58410
[1]	train-logloss:0.38076	eval-logloss:0.52984
[2]	train-logloss:0.29592	eval-logloss:0.48506
[3]	train-logloss:0.23673	eval-logloss:0.45727
[4]	train-logloss:0.18913	eval-logloss:0.42034
[5]	train-logloss:0.15454	eval-logloss:0.39430
[6]	train-logloss:0.12735	eval-logloss:0.38000
[7]	train-logloss:0.10566	eval-logloss:0.37681
[8]	train-logloss:0.08931	eval-logloss:0.38139
[9]	train-logloss:0.07614	eval-logloss:0.38026
[10]	train-logloss:0.06559	eval-logloss:0.37055
[11]	train-logloss:0.05741	eval-logloss:0.36313
[12]	train-logloss:0.05115	eval-logloss:0.35830
[13]	train-logloss:0.04567	eval-logloss:0.35464
[14]	train-logloss:0.04086	eval-logloss:0.34977
[15]	train-logloss:0.03711	eval-logloss:0.34289
[16]	train-logloss:0.03367	eval-logloss:0.34921
[17]	train-logloss:0.03085	eval-logloss:0.35037
[18]	train-logloss:0.02842	eval-logloss:0.35486
[19]	train-logloss:0.02637	eval-logloss:0.35609
[20]	train-logloss:0.02472	eval-logloss:0.35815
[2

VBox(children=(Label(value='0.021 MB of 0.021 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.992867…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▂▂▂▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▂▂▂▂▂▂▃▂▂▃▃
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.88152
epoch,99.0
f1,0.90119
precision,0.89062
recall,0.912
