In [31]:
%matplotlib inline

from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
import torchmetrics
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

In [32]:
RANDOM_SEED = 42
MAX_TOKEN_COUNT = 512
BERT_MODEL_NAME = 'bert-base-cased'
TEXT_COLUMN_NAME = 'text'
LABEL_COLUMN_NAME = 'class'

In [33]:
fake = pd.read_csv('../data/kaggle/Fake.csv')
real = pd.read_csv('../data/kaggle/True.csv')
fake = fake[fake['text'] != ' ']
fake = fake[fake['text'] != '  ']
real = real[real['text'] != ' ']
real = real[real['text'] != '  ']

publisher = [] #storing information for publisher (new column will be created after)
tmp_text = [] #temporal text

unknown_publishers = []
for index, row in enumerate(real.text.values):
    try:
        record = row.split('-', maxsplit=1) #there is a dash dividing the text (at the text column)
        record[1] #second part of the text
        
        assert(len(record[0])<120) #checking if #chars is less tham 120
    except:
        unknown_publishers.append(index)

for index, row in enumerate(real.text.values): #Including unknown publishers
    if index in unknown_publishers:
        tmp_text.append(row)
        publisher.append('Unknown')
        continue
    
    else:
        record = row.split('-', maxsplit = 1)
        publisher.append(record[0].strip())
        tmp_text.append(record[1].strip())
real['publisher'] = publisher #replacing columns by new content
real['text'] = tmp_text

In [34]:
empty_real_index = [index for index, text in enumerate(real.text.tolist()) if str(text).strip()==""] 
real.iloc[empty_real_index]

Unnamed: 0,title,text,subject,date,publisher


In [35]:
empty_fake_index = [index for index, text in enumerate(fake.text.tolist()) if str(text).strip()==""] 
fake.iloc[empty_fake_index]

Unnamed: 0,title,text,subject,date


In [36]:
real['text'] = real['title'] + " " + real['text'] #Putting title and text in the same column
fake['text'] = fake['title'] + " " + fake['text']
real['text'] = real['text'].apply(lambda x: str(x).lower()) #converting to lower case
fake['text'] = fake['text'].apply(lambda x: str(x).lower())
real['class'] = 1.0 #Adding a new column, indicating it is true (1) or false (0) 
fake['class'] = 0.0
real = real[['text', 'class']] #We will just keep text and class (label)
fake = fake[['text', 'class']] #We will just keep text and class (label)
df = real.append(fake, ignore_index=True) #Merging

In [37]:
df

Unnamed: 0,text,class
0,"as u.s. budget fight looms, republicans flip t...",1.0
1,u.s. military to accept transgender recruits o...,1.0
2,senior u.s. republican senator: 'let mr. muell...,1.0
3,fbi russia probe helped by australian diplomat...,1.0
4,trump wants postal service to charge 'much mor...,1.0
...,...,...
44262,mcpain: john mccain furious that iran treated ...,0.0
44263,justice? yahoo settles e-mail privacy class-ac...,0.0
44264,sunnistan: us and allied ‘safe zone’ plan to t...,0.0
44265,how to blow $700 million: al jazeera america f...,0.0


In [38]:
df['class'].value_counts()

0.0    22851
1.0    21416
Name: class, dtype: int64

In [39]:
df['text'].map(
    lambda text: len(str(text).split(" "))
).describe()

count    44267.000000
mean       432.100052
std        361.228903
min          2.000000
25%        229.000000
50%        385.000000
75%        541.000000
max       8449.000000
Name: text, dtype: float64

In [40]:
train_df, val_df = train_test_split(df, test_size=0.05)
train_df.shape, val_df.shape

((42053, 2), (2214, 2))

In [41]:
BERT_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

In [42]:
sample_row = df.iloc[0]
sample_comment = sample_row['text']
sample_labels = sample_row['class']

print(sample_comment)

as u.s. budget fight looms, republicans flip their fiscal script the head of a conservative republican faction in the u.s. congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on sunday and urged budget restraint in 2018. in keeping with a sharp pivot under way among republicans, u.s. representative mark meadows, speaking on cbs’ “face the nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in january. when they return from the holidays on wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the november congressional election campaigns approach in which republicans will seek to keep control of congress. president donald trump and his republicans want a big budget increase in military spending, while democrats also want proportional increases for non-defense “discretionary”

In [43]:
class FakeNewsDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: BertTokenizer,
        max_token_len: int = MAX_TOKEN_COUNT
    ):
    
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        text = data_row[TEXT_COLUMN_NAME]
        labels = data_row[LABEL_COLUMN_NAME]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return dict(
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=labels
        )   

In [44]:
train_dataset = FakeNewsDataset(
  train_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

data_loader1 = DataLoader(train_dataset)

In [45]:
train_dataset = FakeNewsDataset(
  train_df,
  tokenizer,
  max_token_len=MAX_TOKEN_COUNT
)

sample_item = train_dataset[0]
# sample_item.keys()

In [46]:
# sample_item["text"]

In [47]:
# sample_item["labels"]

In [48]:
# sample_item["input_ids"]

In [49]:
# sample_item["attention_mask"]

In [50]:
# bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
# output = bert(sample_item["input_ids"], attention_mask=sample_item["attention_mask"])

In [51]:
class FakeNewsDataModule(pl.LightningDataModule):
    def __init__(self, train_df, test_df, tokenizer, batch_size=8, max_token_len=128):
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        
    def setup(self, stage=None):
        self.train_dataset = FakeNewsDataset(
            self.train_df,
            self.tokenizer,
            self.max_token_len
        )

        self.test_dataset = FakeNewsDataset(
            self.test_df,
            self.tokenizer,
            self.max_token_len
        )
        
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True
        )

    def val_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size
        )

    # def train_dataloader(self):
    #     return DataLoader(
    #         self.train_dataset,
    #         batch_size=self.batch_size,
    #         shuffle=True,
    #         num_workers=1
    #     )

    # def val_dataloader(self):
    #     return DataLoader(
    #         self.test_dataset,
    #         batch_size=self.batch_size,
    #         num_workers=1
    #     )

    # def test_dataloader(self):
    #     return DataLoader(
    #         self.test_dataset,
    #         batch_size=self.batch_size,
    #         num_workers=1
    #     )

In [52]:
N_EPOCHS = 10
BATCH_SIZE = 16

data_module = FakeNewsDataModule(
  train_df,
  val_df,
  tokenizer,
  batch_size=BATCH_SIZE,
  max_token_len=MAX_TOKEN_COUNT
)

In [53]:
import pytorch_lightning as pl
import torchmetrics

class FakeNewsTagger(pl.LightningModule):
    def __init__(self, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.i2h = nn.Linear(self.bert.config.hidden_size, 128)
        self.h2o = nn.Linear(128, 1)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()
        self.auroc = torchmetrics.AUROC()
        self.accuracy = torchmetrics.Accuracy()

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.i2h(output.pooler_output)
        output = self.h2o(output)
        output = torch.sigmoid(output).flatten()
        loss = 0
        if labels is not None:
            loss = self.criterion(output.to(dtype=float), labels.to(dtype=float))
        return loss, output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        # self.log("train_AUROC", self.auroc(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        self.log("train_acc", self.calculate_accuracy(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        return {"loss": loss, "predictions": outputs, "labels": labels}
        
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        # self.log("val_AUROC", self.auroc(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        self.log("val_acc", self.calculate_accuracy(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        # self.log("test_AUROC", self.auroc(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        self.log("test_acc", self.calculate_accuracy(outputs, labels), prog_bar=True, logger=True, batch_size=BATCH_SIZE)
        return loss

    def training_epoch_end(self, outputs):
        labels = []
        predictions = []
        for output in outputs:
            for out_labels in output["labels"].detach().cpu():
                labels.append(out_labels)
            for out_predictions in output["predictions"].detach().cpu():
                predictions.append(out_predictions)
        labels = torch.stack(labels).int()
        predictions = torch.stack(predictions)
        roc_auc = self.auroc(predictions, labels)
        self.logger.experiment.add_scalar("roc_auc/Train", roc_auc, self.current_epoch)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.n_warmup_steps,
            num_training_steps=self.n_training_steps
        )
        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )
    
    def calculate_auroc(self, outputs, labels):
        return self.auroc(torch.round(outputs).to(int), labels.to(int))

    def calculate_accuracy(self, outputs, labels):
        return self.accuracy(torch.round(outputs).to(int), labels.to(int))

In [54]:
model = FakeNewsTagger(
  n_warmup_steps=20,
  n_training_steps=100
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [55]:
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

In [56]:
logger = TensorBoardLogger("lightning_logs", name="fake-news-ff")

In [57]:
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3)

In [58]:
trainer = pl.Trainer(
  logger=logger,
  checkpoint_callback=checkpoint_callback,
  callbacks=[early_stopping_callback],
  max_epochs=10,
  gpus=1,
  progress_bar_refresh_rate=30
)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [59]:
trainer.fit(model, data_module)
# trainer.fit(model, data_loader1)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type      | Params
----------------------------------------
0 | bert      | BertModel | 108 M 
1 | i2h       | Linear    | 98.4 K
2 | h2o       | Linear    | 129   
3 | criterion | BCELoss   | 0     
4 | auroc     | AUROC     | 0     
5 | accuracy  | Accuracy  | 0     
----------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.635   Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


                                                              

  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Epoch 0:   0%|          | 0/2768 [00:00<?, ?it/s] 

  f"One of the returned values {set(extra.keys())} has a `grad_fn`. We will detach it automatically"


Epoch 3: 100%|██████████| 2768/2768 [18:10<00:00,  2.54it/s, loss=0.136, v_num=51, train_loss=0.0583, train_acc=1.000, val_loss=0.165, val_acc=0.949]


In [60]:
# checkpoint = torch.load('lightning_logs/fake-news-snopes/version_40/checkpoints/epoch=2-step=7886.ckpt')
# model.load_from_checkpoint('lightning_logs/fake-news-snopes/version_40/checkpoints/epoch=2-step=7886.ckpt')
trainer.test(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  f"The dataloader, {name}, does not have many workers which may be a bottleneck."


Testing: 100%|██████████| 139/139 [00:20<00:00,  6.72it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.9487410187721252, 'test_loss': 0.16516122221946716}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 139/139 [00:20<00:00,  6.73it/s]


[{'test_loss': 0.16516122221946716, 'test_acc': 0.9487410187721252}]