<a href="https://colab.research.google.com/github/verma-saloni/Thesis-Work/blob/main/1_11_22_gossipcop_dgl_GCN_text_embs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -qq install jsonlines

In [2]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
from pathlib import Path
base_dir = Path("/gdrive/MyDrive/ResearchFND")
assert base_dir.exists()

## Data

In [4]:
import pandas as pd
import ast
import json

In [5]:
dataset_id = 'gossipcop'
text_embeddings = 'sbert' # options: sbert, ft

In [6]:
df = pd.read_csv(base_dir/f'{dataset_id}_agg.csv')
df.head(2)

Unnamed: 0,title,text,tweets,retweets,label,url,num_retweets,log_num_retweets,num_tweets,log_num_tweets
0,Kendall Kylie Jenner Jenner NOT Upset Up...,,[],"['995423424741888001', '995461685166202880', '...",fake,,3,1.386294,0,0.0
1,Kim Kardashian Dethroned Dethroned By Khlo...,,[],"['848843565027516416', '849030801970868224', '...",fake,,3,1.386294,0,0.0


In [7]:
df['tweets'] = df.tweets.map(ast.literal_eval)

In [8]:
users_tweeted = df.tweets.map(lambda x: [int(e['user_id']) for e in x])

In [9]:
len(users_tweeted), sum(users_tweeted.map(len) > 0)

(19968, 2117)

## GNN

### Data

In [10]:
#%%capture
!pip install dgl wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dgl
  Downloading dgl-0.9.1-cp37-cp37m-manylinux1_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 23.0 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.13.4-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 55.9 MB/s 
Collecting psutil>=5.8.0
  Downloading psutil-5.9.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (291 kB)
[K     |████████████████████████████████| 291 kB 31.5 MB/s 
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.9-py3-none-any.whl (9.4 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.29-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 45.9 MB/s 
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp37-cp37m-manylinux_2_5_x86_64.many

In [11]:
%env DGLBACKEND=pytorch

env: DGLBACKEND=pytorch


In [12]:
import os
import json
import jsonlines
import numpy as np
import torch
import dgl

import wandb
import IPython.display as ipd

In [13]:
u2i = {}

follow_src = []
follow_dst = []
with jsonlines.open(base_dir/"followers.jsonl") as reader:
    for line in reader:
        v = line["user_id"]
        if v not in u2i:
            u2i[v] = len(u2i)
        for u in line["followers"]:
            if u not in u2i:
                u2i[u] = len(u2i)
            follow_src.append(u2i[u])
            follow_dst.append(u2i[v])

In [14]:
with jsonlines.open(base_dir/"following.jsonl") as reader:
    for line in reader:
        u = line["user_id"]
        if u not in u2i:
            u2i[u] = len(u2i)
        for v in line["following"]:
            if v not in u2i:
                u2i[v] = len(u2i)
            follow_src.append(u2i[u])
            follow_dst.append(u2i[v])

In [15]:
tweet_src = []
tweet_dst = []

for v, l in users_tweeted.iteritems():
    if not len(l):
        continue
    for u in l:
        u = int(u)
        if u in u2i:
            tweet_src.append(u2i[u])
            tweet_dst.append(v)

In [16]:
text_embs = np.load(base_dir/f'{dataset_id}_{text_embeddings}_fulltext_embeddings.npy')
text_embs.shape

(19968, 768)

In [17]:
num_users = len(set(follow_src+follow_dst+tweet_src))

In [18]:
follow_src = torch.tensor(follow_src)
follow_dst = torch.tensor(follow_dst)
tweet_src = torch.tensor(tweet_src)
tweet_dst = torch.tensor(tweet_dst)

graph = dgl.heterograph({
    ('user', 'follow', 'user'): (follow_src, follow_dst),
    ('user', 'followed-by', 'user'): (follow_dst, follow_src),
    ('user', 'tweet', 'article'): (tweet_src, tweet_dst),
    ('article', 'tweeted-by', 'user'): (tweet_dst, tweet_src)},
    {'article':len(df), 'user':num_users}
)

graph.nodes['user'].data['feat'] = torch.arange(graph.num_nodes('user'))
graph.nodes['article'].data['feat'] = torch.tensor(text_embs)
graph.nodes['article'].data['label'] = torch.tensor((df.label=="real").to_numpy()).long()

In [19]:
graph

Graph(num_nodes={'article': 19968, 'user': 31792},
      num_edges={('article', 'tweeted-by', 'user'): 588, ('user', 'follow', 'user'): 48409, ('user', 'followed-by', 'user'): 48409, ('user', 'tweet', 'article'): 588},
      metagraph=[('article', 'user', 'tweeted-by'), ('user', 'user', 'follow'), ('user', 'user', 'followed-by'), ('user', 'article', 'tweet')])

In [20]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(shuffle=True, random_state=124)

In [21]:
labels = graph.ndata['label']['article']

train_idx, valid_idx = next(skf.split(labels, labels))

In [22]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
n_layers = 3
n_neighbors = 10

In [24]:
sampler = dgl.dataloading.NeighborSampler([n_neighbors]*n_layers)
train_loader = dgl.dataloading.DataLoader(
    graph,
    {'article':train_idx},
    sampler,
    device=device,
    batch_size=64,
    shuffle=True,
    drop_last=False,
    num_workers=0
)

In [25]:
eval_sampler = dgl.dataloading.NeighborSampler([-1]*n_layers)
eval_loader = dgl.dataloading.DataLoader(
    graph,
    {'article':valid_idx},
    eval_sampler,
    device=device,
    batch_size=64,
    shuffle=False,
    drop_last=False,
    num_workers=0
)

In [26]:
batch = next(iter(train_loader))



### Model

In [27]:
from collections import defaultdict
import torch.nn as nn
import torch.nn.functional as F

d_emb_dict = defaultdict(lambda: 64)

def flatten_dict(d):
    for k, v in d.items():
        d[k] = v.flatten(1)
    return d

In [28]:
class NodeEmbedding(nn.Module):
    """
    Node embedding layer. Handles input node features of varying dimentionality.
    
    Parameters:
    - n_nodes: dict[str,int] - a dictionary containing number of nodes per type
    for id only nodes (without precmputed features)
    - d_in: dict[str,int] - a dictionary mapping node type to the node feature dim
    for nodes with precomputed vector features
    - d_emb: int - size of output feature vector for all node types
    - proj_nodes: list[str] - optional list of node types with vector features to be
    processed with linear projection. If None keys of `d_in` are used
    - embed_nodes: list[str] - optional list of node types without vector features to be
    processed with embedding layer. If None keys of `n_nnodes` are used

    Inputs:
    - nx - dict[str,Tensor] - dictionary containing input node features per node type

    Outputs:
    - out - dict[str, Tensor] - dictionary of node embedding tensors of shape [bs, d_emb]
    """
    def __init__(self, n_nodes:dict, d_in:dict, d_emb:int, proj_nodes:list=None, embed_nodes:list=None):
        super().__init__()
        self.proj_nodes = proj_nodes if proj_nodes is not None else list(d_in.keys())
        self.embed_nodes = embed_nodes if embed_nodes is not None else list(n_nodes.keys())
        self.emb = nn.ModuleDict({k:nn.Embedding(n_nodes[k], d_emb) for k in self.embed_nodes})
        self.proj = nn.ModuleDict({k:nn.Linear(d_in[k], d_emb, bias=False) for k in self.proj_nodes})
        self.init()

    def forward(self, nx):
        out = {}
        for k, m  in self.emb.items():
            out[k] = m(nx[k])
        for k, m  in self.proj.items():
            out[k] = m(nx[k])
        return out

    def init(self):
        for _, m in self.emb.items():
            torch.nn.init.xavier_uniform_(m.weight)
        for _, m in self.proj.items():
            torch.nn.init.xavier_uniform_(m.weight)

In [29]:
class Residual(nn.Module):
    """
    Residual connection. Computes output node features as:
    x_out_dst = GraphConv(graph, x_in_all) + x_in_dst
    """
    def __init__(self, conv):
        super().__init__()
        self.conv = conv

    def forward(self, graph, x):
        h = self.conv(graph, x)
        res = x[1]
        return h + res

In [30]:
in_proj = NodeEmbedding({k:graph.num_nodes(k) for k in ["user"]}, {"article":text_embs.shape[1]}, 64)
conv = dgl.nn.HeteroGraphConv({rel:Residual(dgl.nn.GraphConv(64, 64, allow_zero_in_degree=True)) for rel in graph.etypes})

In [31]:
blocks = batch[-1]
block = blocks[0]
x = block.ndata['feat']

with torch.no_grad():
    h = in_proj(x)
    res = conv(block, h)

In [32]:
res['article'].shape, res['user'].shape

(torch.Size([64, 64]), torch.Size([2, 64]))

In [33]:
class Encoder(torch.nn.Module):
    """
    Encoder for heterogenous graph using GCN layer.
    """
    def __init__(self, d_in, d_h, etypes, n_layers=2, dropout=0.0):
        super().__init__()
        ds = [d_in] + [d_h] * n_layers
        self.layers = nn.ModuleList([
            dgl.nn.HeteroGraphConv({
                rel : Residual(dgl.nn.GraphConv(ds[i], ds[i+1], allow_zero_in_degree=True)) for rel in etypes
            }) for i in range(n_layers)
        ])

    def forward(self, blocks, x):
        
        for layer, block in zip(self.layers, blocks):
            x = layer(block, x)
        return x

In [34]:
graph.ndata['feat']['article'].shape

torch.Size([19968, 768])

In [35]:
class GNN(nn.Module):
    
    def __init__(self, g, d_h:int, n_layers:int, tgt_ntype:str, emb_nodes:list=['user'], proj_nodes:list=['article']):
        super().__init__()
        self.tgt_ntype = tgt_ntype
        self.in_proj = NodeEmbedding(
            {k:g.num_nodes(k) for k in emb_nodes}, 
            {k:graph.ndata['feat'][k].shape[1] for k in proj_nodes},
            d_h
        )
        self.encoder = Encoder(d_h, d_h, g.etypes)
        self.head = nn.Linear(d_h, 2)

    def forward(self, blocks, x):
        h = self.in_proj(x)
        h = self.encoder(blocks, h)
        return self.head(h[self.tgt_ntype])

    @torch.no_grad()
    def get_embeddings(self, graph, x):
        h = self.emb(x)
        h = self.encoder(graph, h)
        return h[self.tgt_ntype]

In [36]:
model = GNN(graph, 128, n_layers, 'article')

with torch.no_grad():
    logits = model(blocks, x)

In [37]:
logits.shape

torch.Size([64, 2])

In [38]:
def accuracy(logits, labels):
    return (logits.argmax(-1) == labels).float().mean()

In [39]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(shuffle=True, random_state=124)

In [40]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

metrics = [accuracy_score, f1_score, precision_score, recall_score]
def get_name(score_func):
    return score_func.__name__.split("_")[0]

In [41]:
class AverageMeter:

    def __init__(self, store_vals=False, store_avgs=False):
        self.store_vals = store_vals
        self.store_avgs = store_avgs
        if store_vals: self.values = []
        if store_avgs: self.avgs = []
        self.tot, self.n = 0, 0

    def update(self, v, n=1):
        if self.store_vals: self.values.append(v)
        self.n += n
        self.tot += v*n

    @property
    def avg(self):
        if self.n == 0:
            return
        return self.tot / self.n

    def reset(self):
        if self.store_avgs and self.avg: self.avgs.append(self.avg)
        self.tot, self.n = 0, 0

In [42]:
class_weights = torch.tensor([1/(labels==0).sum(), 1/(labels==1).sum()])*len(labels)

In [43]:
def train(fold, train_idx, valid_idx, params):

    model = GNN(graph, 128, n_layers, 'article')
    opt = torch.optim.Adam(model.parameters(), params['lr'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        opt, factor=params["lr_sched_factor"], patience=params["patience"], verbose=True
    )

    train_loss = AverageMeter(store_avgs=True)
    train_acc = AverageMeter(store_avgs=True)
    valid_loss = AverageMeter(store_avgs=True)
    valid_acc = AverageMeter(store_avgs=True)

    best_acc = 0
    for epoch in range(params['n_epochs']):
        model.train()
        for batch in train_loader:
            blocks = batch[-1]
            x = blocks[0].ndata['feat']
            logits = model(blocks, x)
            
            labels = blocks[-1].dstdata['label']['article']
            loss = F.cross_entropy(logits, labels, weight=class_weights)
            acc = accuracy(logits, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()

            train_loss.update(loss.item(), len(labels))
            train_acc.update(acc, len(labels))

        model.eval()
        for i, batch in enumerate(eval_loader):
            blocks = batch[-1]
            x = blocks[0].ndata['feat']
            with torch.no_grad():
                logits = model(blocks, x)

                labels = blocks[-1].dstdata['label']['article']
                val_loss = F.cross_entropy(logits, labels, weight=class_weights)
                val_acc = accuracy(logits, labels)

                valid_loss.update(val_loss.item(), len(labels))
                valid_acc.update(val_acc, len(labels))
        
        scheduler.step(valid_loss.avg)
        wandb.log({'train_loss':loss.item(), 'train_acc':acc, 'valid_loss':val_loss.item(), 'valid_acc':val_acc}, step=epoch)
        print(f"{epoch+1:>3}: Train loss {train_loss.avg:.4f}, acc {train_acc.avg:.4f}%; validation loss {valid_loss.avg:.4f}, acc {valid_acc.avg:.4f}%")
        
        if valid_acc.avg >= best_acc:
            best_acc = valid_acc.avg
            torch.save(model.state_dict(), f'models/model-{fold}.pt')
        
        train_loss.reset()
        train_acc.reset()
        valid_loss.reset()
        valid_acc.reset()

        

    # load best model and evaluate
    model.load_state_dict(torch.load(f'models/model-{fold}.pt'))
    model.eval()
    preds = []
    targs = []
    for i, batch in enumerate(eval_loader):
        blocks = batch[-1]
        x = blocks[0].ndata['feat']
        with torch.no_grad():
            logits = model(blocks, x)

            labels = blocks[-1].dstdata['label']['article']
            
        preds.append(logits.argmax(-1).cpu().numpy())
        targs.append(labels.cpu().numpy())
    preds = np.concatenate(preds)
    targs = np.concatenate(targs)
    eval_results = {get_name(f):f(y_pred=preds, y_true=targs) for f in metrics}
    print("Final evaluation results:")
    for k,v in eval_results.items():
        print(f"{k:<16}{v:.4f}")
    
    wandb.log(eval_results)
    wandb.log({"conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=targs, preds=preds,
                            class_names=["Fake", "Real"])})

    return {
        'train_loss':train_loss,
        'train_acc':train_acc,
        'valid_loss':valid_loss,
        'valid_acc':valid_acc
    }

In [44]:
params = {
    "n_epochs":200,
    'bs': 16,
    'lr':4e-3,
    "seed":124,
    "lr_sched_factor":0.5,
    "patience":20
}

labels = graph.ndata['label']['article']

In [45]:
if not os.path.exists('models'):
    os.mkdir('models')

GROUP = "sbert-fulltext-gcn-P20updatedE200"
for fold_id, (train_idx, valid_idx) in enumerate(skf.split(labels, labels)):
    ipd.clear_output()
    with wandb.init(entity="saloniteam", project="nofolds", group=GROUP, name=f"{GROUP}-fold-{fold_id}") as run:
        log = train(fold_id, train_idx, valid_idx, params)
    break

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




  1: Train loss 0.4992, acc 0.7662%; validation loss 0.4783, acc 0.7909%
  2: Train loss 0.4702, acc 0.7969%; validation loss 0.4884, acc 0.7569%
  3: Train loss 0.4512, acc 0.8017%; validation loss 0.4775, acc 0.8095%
  4: Train loss 0.4464, acc 0.8066%; validation loss 0.4966, acc 0.7969%
  5: Train loss 0.4612, acc 0.8049%; validation loss 0.8241, acc 0.7967%
  6: Train loss 0.4758, acc 0.8059%; validation loss 0.5272, acc 0.8007%
  7: Train loss 0.4460, acc 0.8088%; validation loss 0.9647, acc 0.7872%
  8: Train loss 0.4358, acc 0.8114%; validation loss 0.5084, acc 0.7939%
  9: Train loss 0.4320, acc 0.8143%; validation loss 0.4942, acc 0.8012%
 10: Train loss 0.4258, acc 0.8128%; validation loss 0.4999, acc 0.7849%
 11: Train loss 0.4239, acc 0.8135%; validation loss 0.5420, acc 0.7977%
 12: Train loss 0.4428, acc 0.8146%; validation loss 0.5681, acc 0.7902%
 13: Train loss 0.4208, acc 0.8155%; validation loss 0.4976, acc 0.7887%
 14: Train loss 0.4215, acc 0.8179%; validation los

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.862447…

0,1
accuracy,▁
f1,▁
precision,▁
recall,▁
train_acc,▂▄▅▅▅▄▇▃▂▂▆▅▂▅▃▅▅▄▆▁▂▅▇▆▅▅▄▅▇▆█▅▅▄▄▂▆▄▄▅
train_loss,▄▄▄▃▃▅▃▄█▄▃▂▇▃▆▂▂▃▁▇▄▂▂▄▃▃▄▂▁▃▁▄▆▄▅▄▂▄▄▃
valid_acc,█▁█▁███████▁▁███▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_loss,▁▂▄█▂▁▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂

0,1
accuracy,0.82874
f1,0.88702
precision,0.90465
recall,0.87006
train_acc,0.84211
train_loss,0.36229
valid_acc,0.88462
valid_loss,1.24198


In [46]:
model = GNN(graph, 128, n_layers, "article")

model.load_state_dict(torch.load(f'models/model-{fold_id}.pt'))

<All keys matched successfully>