<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/06_25_22_Fasttext_Embeddings_XGboost_Politifact_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
#%%capture
!pip install -U xgboost wandb fasttext pyfasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [11]:
import pandas as pd
import numpy as np
import nltk
import gensim
from gensim.models import Word2Vec, FastText
import fasttext as ft
import string
nltk.download('stopwords')
import re
import spacy
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import xgboost as xgb
from pathlib import Path

import wandb
from wandb.xgboost import WandbCallback
from IPython.display import clear_output


# Scoring functions evaluated on each fold's hard (0/1) predictions.
metrics = [
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
]

def get_name(score_func):
    """Return the part of ``score_func.__name__`` before its first underscore.

    For a function named ``accuracy_score`` this yields ``"accuracy"``; a name
    without any underscore is returned unchanged.
    """
    prefix, _, _ = score_func.__name__.partition("_")
    return prefix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Mount Google Drive at /content/drive; the data directory used below lives
# under its MyDrive folder.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Project folder on the mounted Drive; the dataset CSV and the cached embedding
# files are read from / written to this directory.
base_dir = Path("/content/drive/MyDrive/ResearchFND")
# Fail fast with an actionable message instead of a bare AssertionError.
assert base_dir.exists(), f"Data directory not found: {base_dir} (is Google Drive mounted?)"

In [14]:
# Load the aggregated dataset; the CSV's first column is used as the index.
df = pd.read_csv(base_dir/"politifact_agg.csv", index_col=0)

In [15]:
# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,title,text,tweets,retweets,label,url,tweet_ids,num_retweets,log_num_retweets,num_tweets,log_num_tweets
0,Actress Emma Stone ‘For the first time in his...,,[],"['1020554564334964741', '1020817527046197248',...",fake,,[],2911,7.976595,0,0.0
1,Breaking President Trump makes English the of...,,[],[],fake,,[],0,0.0,0,0.0
2,Friendly Fire … Charlottesville Car attacker ...,,[],"['3265439004', '3250621593', '3253922920', '32...",fake,,[],24,3.218876,0,0.0
3,If You Are Using This Toothpaste… Throw It Aw...,MightyLiving Blog\n\nHelpful inspiration from ...,[],"['911971426571255810', '1036749614853103616', ...",fake,mightynest.com/learn/research-by-concern/dange...,[],2569,7.851661,0,0.0
4,"""Face the Nation"" transcripts, August 26, 2012...","""Face the Nation"" transcripts, August 26, 2012...",[],[],real,https://web.archive.org/web/20120827001956/htt...,[],0,0.0,0,0.0


In [16]:
# NLP preprocessing
from gensim.utils import simple_preprocess

# gensim.utils.simple_preprocess(doc, deacc=False, min_len=2, max_len=15)
# converts a document into a list of lowercased tokens (de-accenting is optional
# and off by default). The tokens are re-joined with single spaces so each title
# becomes one clean line of text.
# Missing titles (NaN) are replaced by "" first, because simple_preprocess
# expects a string and cannot tokenize a float NaN.
df["title"] = df.title.fillna("").map(lambda x: ' '.join(simple_preprocess(x)))

In [17]:
#%%capture
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gunzip cc.en.300.bin.gz

--2022-06-25 08:44:41--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2022-06-25 08:47:51 (22.6 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [18]:
# Load the pretrained English fastText model from the unpacked binary file.
# It is only needed to compute sentence embeddings in the cells below.
modelog = ft.load_model('cc.en.300.bin') 



In [19]:
# Sentence embeddings of the preprocessed titles, cached as a .npy file in the
# data directory so the fastText model only has to run when the cache is missing.
ft_title_emb_file = base_dir/"ft_title_embeddings.npy"

if not ft_title_emb_file.exists():
    # Cache miss: embed every title, stack into one 2-D array, and persist it.
    title_embeddings = np.stack(
        [modelog.get_sentence_vector(title) for title in df['title']]
    )
    np.save(ft_title_emb_file, title_embeddings)
else:
    title_embeddings = np.load(ft_title_emb_file)

In [20]:
# Sentence embeddings of title + article body, cached as a .npy file in the data
# directory so the fastText model only has to run when the cache is missing.
ft_text_emb_file = base_dir/"ft_text_embeddings.npy"

if ft_text_emb_file.exists():
    text_embeddings = np.load(ft_text_emb_file)
else:
    # Replace missing article bodies with "" via explicit assignment. The former
    # `df.text.fillna("", inplace=True)` is a chained in-place call on a column
    # selection: under pandas Copy-on-Write it leaves `df` untouched, and the
    # NaN values then break simple_preprocess.
    df["text"] = df["text"].fillna("")
    body_clean = df["text"].map(lambda x: " ".join(simple_preprocess(x)))
    # df.title is already preprocessed at this point, so only the body needs it.
    texts = df.title + " " + body_clean
    text_embeddings = np.stack([modelog.get_sentence_vector(line) for line in texts])
    np.save(ft_text_emb_file, text_embeddings)

Next, we train XGBoost classifiers on the embeddings computed above.

In [21]:
import gc

# Both embedding arrays are available by now, so drop the fastText model and
# force a garbage-collection pass before training.
del modelog
gc.collect()

318

### Title embeddings

In [22]:
# Features: the title embeddings. Labels: 1 where label == "real", else 0.
X = title_embeddings
y = df.label.eq("real").to_numpy().astype(int)

# Display both shapes; the row counts must agree.
X.shape, y.shape

((894, 300), (894,))

In [23]:
# Shuffled, stratified CV splitter with a fixed seed for reproducible folds;
# the number of splits is left at the scikit-learn default.
skf = StratifiedKFold(shuffle=True, random_state=124)

In [24]:
def train(train_idx, test_idx, params, num_boost_round=100, threshold=0.5):
    """Train an XGBoost booster on one CV fold and log the results to W&B.

    The feature matrix ``X`` and label vector ``y`` are read from the notebook's
    global scope at call time, and an active ``wandb`` run is expected (both
    ``WandbCallback`` and ``wandb.log`` write to it).

    Args:
        train_idx: Row indices of the training split.
        test_idx: Row indices of the held-out split.
        params: Booster parameters forwarded to ``xgb.train``.
        num_boost_round: Number of boosting rounds (default 100, the value that
            used to be hard-coded).
        threshold: Predictions strictly above this value are labelled 1
            ("Real"), all others 0 ("Fake") (default 0.5, as before).

    Returns:
        dict: metric name (as produced by ``get_name``) -> score on the
        held-out split, so callers can aggregate results across folds.

    Note:
        ``train`` is defined in several cells of this notebook; the most
        recently executed definition is the one in effect.
    """
    # training
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # The held-out split is watched only to record its loss curve; early
    # stopping is disabled, so it does not influence the fitted model.
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    clf = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=None,
        evals=watchlist,
        callbacks=[WandbCallback()],
    )

    # evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > threshold).astype(int)
    eval_results = {get_name(f): f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})
    return eval_results

In [25]:
# Booster settings passed to xgb.train: logistic objective for the binary
# label and a fixed seed; no other parameters are overridden here.
params = {
    "objective":'binary:logistic',
    "seed":124
}

In [26]:
# One W&B run per CV fold, all filed under the same group name.
GROUP = "fasttext-title-xgb"

for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # Clear the previous fold's cell output so only the latest fold stays visible.
    clear_output()
    run_name = f"{GROUP}-fold-{fold_id}"
    with wandb.init(entity="saloniteam", project="fnd", group=GROUP, name=run_name) as run:
        train(train_idx, test_idx, params)

[0]	train-logloss:0.51851	eval-logloss:0.61572
[1]	train-logloss:0.40812	eval-logloss:0.58951
[2]	train-logloss:0.32666	eval-logloss:0.55907
[3]	train-logloss:0.26414	eval-logloss:0.53379
[4]	train-logloss:0.21567	eval-logloss:0.50541
[5]	train-logloss:0.18021	eval-logloss:0.49854
[6]	train-logloss:0.15367	eval-logloss:0.49576
[7]	train-logloss:0.13064	eval-logloss:0.48193
[8]	train-logloss:0.11177	eval-logloss:0.47289
[9]	train-logloss:0.09766	eval-logloss:0.47519
[10]	train-logloss:0.08434	eval-logloss:0.46956
[11]	train-logloss:0.07409	eval-logloss:0.46545
[12]	train-logloss:0.06489	eval-logloss:0.46792
[13]	train-logloss:0.05881	eval-logloss:0.46983
[14]	train-logloss:0.05286	eval-logloss:0.47294
[15]	train-logloss:0.04839	eval-logloss:0.47244
[16]	train-logloss:0.04411	eval-logloss:0.47571
[17]	train-logloss:0.04038	eval-logloss:0.47395
[18]	train-logloss:0.03704	eval-logloss:0.47463
[19]	train-logloss:0.03435	eval-logloss:0.48215
[20]	train-logloss:0.03209	eval-logloss:0.48404
[2

VBox(children=(Label(value='0.015 MB of 0.015 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.990189…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▂▂▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▅▅
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.76966
epoch,99.0
f1,0.78534
precision,0.78947
recall,0.78125


### Text embeddings

In [27]:
# Features: the title+body embeddings. Labels: 1 where label == "real", else 0.
X = text_embeddings
y = df.label.eq("real").to_numpy().astype(int)

# Display both shapes; the row counts must agree.
X.shape, y.shape

((894, 300), (894,))

In [28]:
# Shuffled, stratified CV splitter with a fixed seed for reproducible folds;
# the number of splits is left at the scikit-learn default.
skf = StratifiedKFold(shuffle=True, random_state=124)

In [29]:
def train(train_idx, test_idx, params, num_boost_round=100, threshold=0.5):
    """Train an XGBoost booster on one CV fold and log the results to W&B.

    The feature matrix ``X`` and label vector ``y`` are read from the notebook's
    global scope at call time, and an active ``wandb`` run is expected (both
    ``WandbCallback`` and ``wandb.log`` write to it).

    Args:
        train_idx: Row indices of the training split.
        test_idx: Row indices of the held-out split.
        params: Booster parameters forwarded to ``xgb.train``.
        num_boost_round: Number of boosting rounds (default 100, the value that
            used to be hard-coded).
        threshold: Predictions strictly above this value are labelled 1
            ("Real"), all others 0 ("Fake") (default 0.5, as before).

    Returns:
        dict: metric name (as produced by ``get_name``) -> score on the
        held-out split, so callers can aggregate results across folds.

    Note:
        ``train`` is defined in several cells of this notebook; the most
        recently executed definition is the one in effect.
    """
    # training
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # The held-out split is watched only to record its loss curve; early
    # stopping is disabled, so it does not influence the fitted model.
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    clf = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=None,
        evals=watchlist,
        callbacks=[WandbCallback()],
    )

    # evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > threshold).astype(int)
    eval_results = {get_name(f): f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})
    return eval_results

In [30]:
# Booster settings passed to xgb.train: logistic objective for the binary
# label and a fixed seed; no other parameters are overridden here.
params = {
    "objective":'binary:logistic',
    "seed":124
}

In [31]:
# One W&B run per CV fold, all filed under the same group name.
# NOTE(review): the group is spelled "fastext" (single "t"), unlike the
# "fasttext-title-xgb" group; left as-is so runs keep landing in the same group.
GROUP = "fastext-fulltext-xgb"

for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # Clear the previous fold's cell output so only the latest fold stays visible.
    clear_output()
    run_name = f"{GROUP}-fold-{fold_id}"
    with wandb.init(entity="saloniteam", project="fnd", group=GROUP, name=run_name) as run:
        train(train_idx, test_idx, params)

[0]	train-logloss:0.50363	eval-logloss:0.63250
[1]	train-logloss:0.38825	eval-logloss:0.59764
[2]	train-logloss:0.30897	eval-logloss:0.56504
[3]	train-logloss:0.24852	eval-logloss:0.55615
[4]	train-logloss:0.20233	eval-logloss:0.54239
[5]	train-logloss:0.16809	eval-logloss:0.51051
[6]	train-logloss:0.14092	eval-logloss:0.50119
[7]	train-logloss:0.11870	eval-logloss:0.49726
[8]	train-logloss:0.10371	eval-logloss:0.48790
[9]	train-logloss:0.08863	eval-logloss:0.48295
[10]	train-logloss:0.07685	eval-logloss:0.47724
[11]	train-logloss:0.06773	eval-logloss:0.47921
[12]	train-logloss:0.06026	eval-logloss:0.47675
[13]	train-logloss:0.05284	eval-logloss:0.46977
[14]	train-logloss:0.04717	eval-logloss:0.46967
[15]	train-logloss:0.04296	eval-logloss:0.46174
[16]	train-logloss:0.03926	eval-logloss:0.46636
[17]	train-logloss:0.03591	eval-logloss:0.46721
[18]	train-logloss:0.03309	eval-logloss:0.47220
[19]	train-logloss:0.03086	eval-logloss:0.47122
[20]	train-logloss:0.02891	eval-logloss:0.48094
[2

VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▅▃▂▂▂▁▁▂▂▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.82584
epoch,99.0
f1,0.84103
precision,0.82828
recall,0.85417


### Text embeddings + retweet and tweet counts

In [32]:
# Features: the title+body embeddings followed by two extra columns, the raw
# retweet count and the raw tweet count. Labels: 1 where label == "real", else 0.
X = np.column_stack([
    text_embeddings,
    df.num_retweets.to_numpy(),
    df.num_tweets.to_numpy(),
])
y = df.label.eq("real").to_numpy().astype(int)

# Display both shapes; the row counts must agree.
X.shape, y.shape

((894, 302), (894,))

In [33]:
# Shuffled, stratified CV splitter with a fixed seed for reproducible folds;
# the number of splits is left at the scikit-learn default.
skf = StratifiedKFold(shuffle=True, random_state=124)

In [34]:
def train(train_idx, test_idx, params, num_boost_round=100, threshold=0.5):
    """Train an XGBoost booster on one CV fold and log the results to W&B.

    The feature matrix ``X`` and label vector ``y`` are read from the notebook's
    global scope at call time, and an active ``wandb`` run is expected (both
    ``WandbCallback`` and ``wandb.log`` write to it).

    Args:
        train_idx: Row indices of the training split.
        test_idx: Row indices of the held-out split.
        params: Booster parameters forwarded to ``xgb.train``.
        num_boost_round: Number of boosting rounds (default 100, the value that
            used to be hard-coded).
        threshold: Predictions strictly above this value are labelled 1
            ("Real"), all others 0 ("Fake") (default 0.5, as before).

    Returns:
        dict: metric name (as produced by ``get_name``) -> score on the
        held-out split, so callers can aggregate results across folds.

    Note:
        ``train`` is defined in several cells of this notebook; the most
        recently executed definition is the one in effect.
    """
    # training
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # The held-out split is watched only to record its loss curve; early
    # stopping is disabled, so it does not influence the fitted model.
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    clf = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=None,
        evals=watchlist,
        callbacks=[WandbCallback()],
    )

    # evaluation
    probs = clf.predict(dtest)
    y_pred = (probs > threshold).astype(int)
    eval_results = {get_name(f): f(y_pred=y_pred, y_true=y_test) for f in metrics}
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=y_test, preds=y_pred,
                            class_names=["Fake", "Real"])})
    return eval_results

In [35]:
# Booster settings passed to xgb.train: logistic objective for the binary
# label and a fixed seed; no other parameters are overridden here.
params = {
    "objective":'binary:logistic',
    "seed":124
}

In [36]:
# One W&B run per CV fold, all filed under the same group name.
# NOTE(review): the group is spelled "fastext" (single "t"), unlike the
# "fasttext-title-xgb" group; left as-is so runs keep landing in the same group.
GROUP = "fastext-fulltext-tw-xgb"

for fold_id, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    # Clear the previous fold's cell output so only the latest fold stays visible.
    clear_output()
    run_name = f"{GROUP}-fold-{fold_id}"
    with wandb.init(entity="saloniteam", project="fnd", group=GROUP, name=run_name) as run:
        train(train_idx, test_idx, params)

[0]	train-logloss:0.50303	eval-logloss:0.57219
[1]	train-logloss:0.38417	eval-logloss:0.52484
[2]	train-logloss:0.30601	eval-logloss:0.46918
[3]	train-logloss:0.24088	eval-logloss:0.44068
[4]	train-logloss:0.19891	eval-logloss:0.42567
[5]	train-logloss:0.16382	eval-logloss:0.41925
[6]	train-logloss:0.13401	eval-logloss:0.40957
[7]	train-logloss:0.11277	eval-logloss:0.39576
[8]	train-logloss:0.09621	eval-logloss:0.39569
[9]	train-logloss:0.08286	eval-logloss:0.39460
[10]	train-logloss:0.07324	eval-logloss:0.38750
[11]	train-logloss:0.06384	eval-logloss:0.38200
[12]	train-logloss:0.05702	eval-logloss:0.37776
[13]	train-logloss:0.05094	eval-logloss:0.37774
[14]	train-logloss:0.04508	eval-logloss:0.37484
[15]	train-logloss:0.04098	eval-logloss:0.37026
[16]	train-logloss:0.03701	eval-logloss:0.37019
[17]	train-logloss:0.03392	eval-logloss:0.37597
[18]	train-logloss:0.03129	eval-logloss:0.37094
[19]	train-logloss:0.02889	eval-logloss:0.37717
[20]	train-logloss:0.02737	eval-logloss:0.37872
[2

VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.989127…

0,1
accuracy,▁
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
eval-logloss,█▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▂▂▃▂▂▃▃▃▃▃▃▃
f1,▁
precision,▁
recall,▁
train-logloss,█▅▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.85393
epoch,99.0
f1,0.86316
precision,0.87234
recall,0.85417
