<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/06_25_22_GosSbert_embeddings_extra_xgboost_politifact.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -U xgboost sentence-transformers wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost
  Downloading xgboost-1.6.1-py3-none-manylinux2014_x86_64.whl (192.9 MB)
[K     |████████████████████████████████| 192.9 MB 65 kB/s 
[?25hCollecting sentence-transformers
  Downloading sentence-transformers-2.2.1.tar.gz (84 kB)
[K     |████████████████████████████████| 84 kB 3.3 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.12.19-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 47.9 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 50.0 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 58.8 MB/s 
[?25hCollecting huggingface-hub>=0.8.1
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import re

from sentence_transformers import SentenceTransformer 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import xgboost as xgb

import wandb
from wandb.xgboost import WandbCallback

In [3]:
# Scoring functions evaluated on each fold's predictions.
metrics = [accuracy_score, f1_score, precision_score, recall_score]

def get_name(score_func):
    """Return the part of ``score_func.__name__`` before the first underscore.

    For example, a function named ``accuracy_score`` yields ``"accuracy"``.
    """
    prefix, _, _ = score_func.__name__.partition("_")
    return prefix


Load data

In [4]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Project folder on the mounted Drive; the dataset CSV files are read from here.
base_dir = Path("/content/drive/MyDrive/Thesis")
# Fail early if the folder is missing (e.g. Drive is not mounted).
assert base_dir.exists()

In [7]:
# Read both CSV files and tag every row with its class: 0 = fake, 1 = real.
df_fake = pd.read_csv(base_dir / 'gossipcop_fake.csv').assign(label=0)
df_real = pd.read_csv(base_dir / 'gossipcop_real.csv').assign(label=1)
# Non-null count per column of the fake split.
df_fake.count()

id           5323
news_url     5067
title        5323
tweet_ids    5135
label        5323
dtype: int64

In [8]:
# Stack the fake and real rows into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in pandas 2.0.
df = pd.concat([df_fake, df_real], ignore_index=True)

In [9]:
df.head()

Unnamed: 0,id,news_url,title,tweet_ids,label
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,284329075902926848\t284332744559968256\t284335...,0
1,gossipcop-4580247171,hollywoodlife.com/2018/05/05/paris-jackson-car...,Paris Jackson & Cara Delevingne Enjoy Night Ou...,992895508267130880\t992897935418503169\t992899...,0
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,853359353532829696\t853359576543920128\t853359...,0
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,988821905196158981\t988824206556172288\t988825...,0
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,955792793632432131\t955795063925301249\t955798...,0


Compute embeddings

In [10]:
# Cache of the sentence embeddings on Drive, so the encoder only has to run once.
embedding_file = Path("/content/drive/MyDrive/sbert_embeddings.npy")

if embedding_file.exists():
    embeddings = np.load(embedding_file)
    # The cache file name does not identify the dataset it was built from, so
    # verify that the cached rows line up one-to-one with the rows of `df`.
    if embeddings.shape[0] != len(df):
        raise ValueError(
            f"{embedding_file} holds {embeddings.shape[0]} rows but df has {len(df)}; "
            "delete the stale cache file to recompute the embeddings."
        )
else:
    model_id = "all-mpnet-base-v2"
    model = SentenceTransformer(model_id)

    # One embedding per news title, in the same order as the rows of `df`.
    sentences = df.title.to_list()
    embeddings = model.encode(sentences, show_progress_bar=True)
    np.save(embedding_file, embeddings)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/692 [00:00<?, ?it/s]

Extra features: add more features here, if all looks good. 

Source

In [11]:
# Strip a leading Wayback Machine prefix (web.archive.org/web/<digits>/) so the archived URL remains.
# Raw string: "\d" inside a plain string literal is an invalid escape sequence.
# Dots are escaped to match literally, and both http:// and https:// prefixes are accepted.
source = df.news_url.str.replace(r"^(https?://)?web\.archive\.org/web/\d+/", "", regex=True)

In [12]:
# Drop the URL scheme and a leading "www." so the string starts at the site name.
# The dot is escaped so only a literal "www." is removed (an unescaped "." matches any character).
source = source.str.replace(r"^(https?://)?(www\.)?", "", regex=True)

In [13]:
# Cut everything from the first listed domain suffix to the end of the string, leaving the site name.
# Raw string: "\." inside a plain string literal is an invalid escape sequence; "co\.uk" matches a literal dot.
source = source.str.replace(r"\.(com|info|org|gov|tv|us|news|me|co\.uk|net|club|co|live|edu|xyz|site|life|ru|online|tk|website|pw|one|world|mil).*$", "", regex=True)

In [14]:
# Number of rows versus number of distinct source values (missing values included in the count).
source.shape[0], source.nunique(dropna=False)


(22140, 2180)

Number of retweets

In [15]:
# Count the tab-separated tweet IDs attached to each row; a missing value counts as 0.
df["num_retweets"] = (
    df["tweet_ids"]
    .fillna("")
    .map(lambda ids: 0 if not len(ids) else len(str(ids).split("\t")))
)
# log(1 + count), so that a count of 0 maps to 0.
df["log_num_retweets"] = np.log(1 + df["num_retweets"].to_numpy())

In [16]:
df.describe()

Unnamed: 0,label,num_retweets,log_num_retweets
count,22140.0,22140.0,22140.0
mean,0.759575,66.843993,3.252656
std,0.427351,154.985377,1.472402
min,0.0,0.0,0.0
25%,1.0,11.0,2.484907
50%,1.0,38.0,3.663562
75%,1.0,64.0,4.174387
max,1.0,2568.0,7.851272


More to come here..

Training

In [17]:
# Feature matrix: the sentence embeddings with the log tweet count appended as one extra column.
X = np.concatenate(
    [embeddings, df["log_num_retweets"].to_numpy().reshape(-1, 1)],
    axis=1,
)
# Target vector taken from the label column (0 = fake, 1 = real).
y = df["label"].to_numpy()

X.shape, y.shape

((22140, 769), (22140,))

In [18]:
# Shuffled, stratified 5-fold splitter (5 is scikit-learn's default, written out here); fixed seed for repeatable folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=124)

In [19]:
def train(train_idx, test_idx, params, num_boost_round=100, threshold=0.5):
    """Fit an XGBoost model on one cross-validation fold and log its metrics to W&B.

    Reads the notebook-level feature matrix ``X`` and label vector ``y``, and
    calls ``wandb.log``, so it is meant to be called inside an active wandb run.

    Args:
        train_idx: Row indices of ``X``/``y`` used for fitting.
        test_idx: Row indices of ``X``/``y`` held out for evaluation.
        params: Booster parameters passed to ``xgb.train``.
        num_boost_round: Number of boosting rounds (default 100).
        threshold: Predicted score above which a sample is assigned class 1
            (default 0.5).

    Returns:
        dict mapping each metric name (see ``get_name``) to its value on the
        held-out fold, so callers can aggregate results across folds.
    """
    # training
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # Both splits are tracked per boosting round; with early stopping disabled
    # the held-out fold is only monitored and never influences the fit.
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    clf = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=None,
        evals=watchlist,
        callbacks=[WandbCallback()],
    )

    # evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > threshold).astype(int)
    eval_results = {get_name(f): f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat": wandb.plot.confusion_matrix(probs=None,
                           y_true=y_test, preds=y_pred,
                           class_names=["Fake", "Real"])})
    return eval_results

In [20]:
# Booster settings passed to xgb.train: binary logistic objective and a fixed seed.
params = dict(objective='binary:logistic', seed=124)

In [21]:
# One W&B run per fold; every fold is logged under the same group name.
for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    with wandb.init(
        entity="saloniteam",
        project="fnd",
        group="sbert-mpnet-v2+lnt-xgb",
        name=f"sbert-mpnet-v2+xgb-fold-{fold_id}",
    ) as run:
        train(train_idx, test_idx, params)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[0]	train-logloss:0.54266	eval-logloss:0.55334
[1]	train-logloss:0.45509	eval-logloss:0.47920
[2]	train-logloss:0.39666	eval-logloss:0.43092
[3]	train-logloss:0.35623	eval-logloss:0.40040
[4]	train-logloss:0.32454	eval-logloss:0.37695
[5]	train-logloss:0.29856	eval-logloss:0.36278
[6]	train-logloss:0.27811	eval-logloss:0.34943
[7]	train-logloss:0.26149	eval-logloss:0.34247
[8]	train-logloss:0.24679	eval-logloss:0.33421
[9]	train-logloss:0.23384	eval-logloss:0.32811
[10]	train-logloss:0.22202	eval-logloss:0.32551
[11]	train-logloss:0.21092	eval-logloss:0.32147
[12]	train-logloss:0.20267	eval-logloss:0.31862
[13]	train-logloss:0.19268	eval-logloss:0.31612
[14]	train-logloss:0.18606	eval-logloss:0.31357
[15]	train-logloss:0.17749	eval-logloss:0.31115
[16]	train-logloss:0.17115	eval-logloss:0.31050
[17]	train-logloss:0.16289	eval-logloss:0.30793
[18]	train-logloss:0.15735	eval-logloss:0.30587
[19]	train-logloss:0.15114	eval-logloss:0.30385
[20]	train-logloss:0.14688	eval-logloss:0.30256
[2

VBox(children=(Label(value='0.044 MB of 0.044 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁
precision,▁
recall,▁
train-logloss,█▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.88392
epoch,99.0
f1,0.92559
precision,0.90183
recall,0.95064


[34m[1mwandb[0m: Currently logged in as: [33msaloni[0m ([33msaloniteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


[0]	train-logloss:0.54448	eval-logloss:0.55448
[1]	train-logloss:0.45757	eval-logloss:0.47497
[2]	train-logloss:0.39968	eval-logloss:0.42307
[3]	train-logloss:0.35925	eval-logloss:0.38974
[4]	train-logloss:0.32764	eval-logloss:0.36730
[5]	train-logloss:0.30264	eval-logloss:0.35034
[6]	train-logloss:0.28194	eval-logloss:0.33818
[7]	train-logloss:0.26463	eval-logloss:0.32697
[8]	train-logloss:0.25015	eval-logloss:0.32083
[9]	train-logloss:0.23667	eval-logloss:0.31583
[10]	train-logloss:0.22404	eval-logloss:0.31134
[11]	train-logloss:0.21198	eval-logloss:0.30749
[12]	train-logloss:0.20211	eval-logloss:0.30541
[13]	train-logloss:0.19382	eval-logloss:0.30318
[14]	train-logloss:0.18484	eval-logloss:0.29834
[15]	train-logloss:0.17896	eval-logloss:0.29740
[16]	train-logloss:0.17170	eval-logloss:0.29401
[17]	train-logloss:0.16766	eval-logloss:0.29467
[18]	train-logloss:0.16135	eval-logloss:0.29337
[19]	train-logloss:0.15671	eval-logloss:0.29186
[20]	train-logloss:0.15054	eval-logloss:0.29109
[2

VBox(children=(Label(value='0.044 MB of 0.044 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁
precision,▁
recall,▁
train-logloss,█▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.88663
epoch,99.0
f1,0.92741
precision,0.90262
recall,0.95361


[0]	train-logloss:0.54421	eval-logloss:0.55252
[1]	train-logloss:0.45746	eval-logloss:0.47210
[2]	train-logloss:0.39927	eval-logloss:0.42025
[3]	train-logloss:0.35819	eval-logloss:0.38720
[4]	train-logloss:0.32693	eval-logloss:0.36190
[5]	train-logloss:0.30135	eval-logloss:0.34506
[6]	train-logloss:0.28045	eval-logloss:0.33142
[7]	train-logloss:0.26266	eval-logloss:0.31874
[8]	train-logloss:0.24808	eval-logloss:0.31073
[9]	train-logloss:0.23524	eval-logloss:0.30524
[10]	train-logloss:0.22388	eval-logloss:0.29966
[11]	train-logloss:0.21436	eval-logloss:0.29573
[12]	train-logloss:0.20432	eval-logloss:0.29372
[13]	train-logloss:0.19614	eval-logloss:0.29151
[14]	train-logloss:0.18748	eval-logloss:0.28846
[15]	train-logloss:0.18056	eval-logloss:0.28598
[16]	train-logloss:0.17393	eval-logloss:0.28389
[17]	train-logloss:0.16753	eval-logloss:0.28287
[18]	train-logloss:0.16083	eval-logloss:0.28183
[19]	train-logloss:0.15603	eval-logloss:0.28089
[20]	train-logloss:0.15267	eval-logloss:0.28133
[2

VBox(children=(Label(value='0.044 MB of 0.044 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁
precision,▁
recall,▁
train-logloss,█▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.89589
epoch,99.0
f1,0.93343
precision,0.90736
recall,0.96105


[0]	train-logloss:0.54201	eval-logloss:0.55649
[1]	train-logloss:0.45472	eval-logloss:0.47993
[2]	train-logloss:0.39663	eval-logloss:0.43334
[3]	train-logloss:0.35440	eval-logloss:0.39989
[4]	train-logloss:0.32278	eval-logloss:0.37760
[5]	train-logloss:0.29825	eval-logloss:0.35960
[6]	train-logloss:0.27776	eval-logloss:0.34637
[7]	train-logloss:0.26082	eval-logloss:0.33608
[8]	train-logloss:0.24651	eval-logloss:0.32878
[9]	train-logloss:0.23309	eval-logloss:0.32296
[10]	train-logloss:0.22095	eval-logloss:0.31821
[11]	train-logloss:0.21114	eval-logloss:0.31447
[12]	train-logloss:0.20103	eval-logloss:0.31042
[13]	train-logloss:0.19158	eval-logloss:0.30714
[14]	train-logloss:0.18357	eval-logloss:0.30564
[15]	train-logloss:0.17623	eval-logloss:0.30310
[16]	train-logloss:0.16907	eval-logloss:0.30084
[17]	train-logloss:0.16320	eval-logloss:0.29844
[18]	train-logloss:0.15717	eval-logloss:0.29718
[19]	train-logloss:0.15117	eval-logloss:0.29497
[20]	train-logloss:0.14525	eval-logloss:0.29392
[2

VBox(children=(Label(value='0.044 MB of 0.044 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.996487…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁
precision,▁
recall,▁
train-logloss,█▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.89273
epoch,99.0
f1,0.93145
precision,0.90519
recall,0.95927


[0]	train-logloss:0.54326	eval-logloss:0.55538
[1]	train-logloss:0.45502	eval-logloss:0.48010
[2]	train-logloss:0.39636	eval-logloss:0.43012
[3]	train-logloss:0.35510	eval-logloss:0.39935
[4]	train-logloss:0.32276	eval-logloss:0.37654
[5]	train-logloss:0.29786	eval-logloss:0.36196
[6]	train-logloss:0.27885	eval-logloss:0.35022
[7]	train-logloss:0.26126	eval-logloss:0.34043
[8]	train-logloss:0.24678	eval-logloss:0.33238
[9]	train-logloss:0.23430	eval-logloss:0.32667
[10]	train-logloss:0.22302	eval-logloss:0.32097
[11]	train-logloss:0.21349	eval-logloss:0.31703
[12]	train-logloss:0.20327	eval-logloss:0.31416
[13]	train-logloss:0.19524	eval-logloss:0.31131
[14]	train-logloss:0.18850	eval-logloss:0.30961
[15]	train-logloss:0.17959	eval-logloss:0.30742
[16]	train-logloss:0.17270	eval-logloss:0.30552
[17]	train-logloss:0.16663	eval-logloss:0.30394
[18]	train-logloss:0.16074	eval-logloss:0.30283
[19]	train-logloss:0.15505	eval-logloss:0.30116
[20]	train-logloss:0.14959	eval-logloss:0.29961
[2

VBox(children=(Label(value='0.044 MB of 0.044 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
f1,▁
precision,▁
recall,▁
train-logloss,█▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.89521
epoch,99.0
f1,0.93322
precision,0.90458
recall,0.96373
