In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim

# for convolution functions
import torch.nn.functional as F

In [2]:
news_df = pd.read_csv('data/fake-and-real-news-dataset/combined.csv')
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,WATCH: Six Minutes Of Conservative Media’s Se...,It s no secret that conservatives and Republic...,News,"August 2, 2016",fake
1,Sanders: Firms must take 'haircut' in Puerto R...,WASHINGTON (Reuters) - Wall Street investment ...,politicsNews,"April 1, 2016",real
2,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"November 29, 2016",real
3,CNBC EDITOR: Media Must Remember Readers Are N...,A CNBC editor said members of the press need t...,left-news,"Jun 29, 2017",fake
4,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,Remember when these Turkish thugs beat up (see...,politics,"Sep 22, 2017",fake


In [3]:
news_df["text"] = news_df["title"] + ": " + news_df["text"] 
news_df["fake"] = news_df["label"].apply(lambda x: True if x == 'real' else False)
news_df.head()

Unnamed: 0,title,text,subject,date,label,full_text,fake
0,WATCH: Six Minutes Of Conservative Media’s Se...,It s no secret that conservatives and Republic...,News,"August 2, 2016",fake,WATCH: Six Minutes Of Conservative Media’s Se...,False
1,Sanders: Firms must take 'haircut' in Puerto R...,WASHINGTON (Reuters) - Wall Street investment ...,politicsNews,"April 1, 2016",real,Sanders: Firms must take 'haircut' in Puerto R...,True
2,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"November 29, 2016",real,Factbox: Trump fills top jobs for his administ...,True
3,CNBC EDITOR: Media Must Remember Readers Are N...,A CNBC editor said members of the press need t...,left-news,"Jun 29, 2017",fake,CNBC EDITOR: Media Must Remember Readers Are N...,False
4,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,Remember when these Turkish thugs beat up (see...,politics,"Sep 22, 2017",fake,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,False


In [87]:
class BERTSequenceClassifier():
    
    def __init__(self, pretrained_name = "bert-base-uncased"):
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.criterion = None
        self.optimizer = None
        
        # https://huggingface.co/transformers/model_doc/bert.html#berttokenizer
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_name)
        
        # https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification
        self.model = BertForSequenceClassification.from_pretrained(pretrained_name)
        self.model.config.num_labels = 1
        
        # Freeze the pre trained parameters
        for param in self.model.parameters():
            param.requires_grad = False
            
    def defineLossAndOptimizer(self):
        self.criterion = nn.MSELoss().to(self.device)
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01)
        
    def addLayers(self, layers):
        modules = []
        
        for layer in layers:
            modules.append(layer)
        
        self.model.classifier = nn.Sequential(*modules)
        self.model = self.model.to(self.device)
        
    def preprocess_text_samples(self, samples, max_seq_length = 300):
    
        '''
        Adapted from https://www.kaggle.com/clmentbisaillon/classifying-fake-news-with-bert/notebook
        '''

        encoded_samples = []
        
        for idx, sample in tqdm(samples.iterrows(), total = samples.shape[0]):
            encoded_text = []
            words = sample.text.strip().split(' ')
            nb_seqs = int(len(words)/max_seq_length)

            for i in range(nb_seqs+1):
                words_part = ' '.join(words[i*max_seq_length : (i+1)*max_seq_length])

                try:
                    # https://huggingface.co/transformers/main_classes/tokenizer.html#pretrainedtokenizer
                    # encoding using BERT pretrained tokeinizer and converts to pytorch tensors
                    encoded_text.append(self.tokenizer.encode(words_part, return_tensors="pt", 
                                                         max_length = 500, device = self.device))
                except:
                    print("Issue at: " +str(idx))
                    raise

            encoded_samples.append(encoded_text)

        return encoded_samples

In [88]:
bert = BERTSequenceClassifier()

In [89]:
layers = [nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, 64),
    nn.ReLU(),
    nn.Linear(64, 2),
    nn.Softmax(dim=1)]

bert.addLayers(layers)

# Still in progress / testing

In [90]:
nb_samples = 100

tensor_list = bert.preprocess_text_samples(news_df[:nb_samples])
tensor_labels = news_df[:100].fake.values

100%|██████████| 100/100 [00:01<00:00, 85.62it/s]


In [91]:
torch.save(tensor_list, 'tensor.pt')

In [92]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(tensor_list, tensor_labels, test_size=0.4, 
                                                    random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=1)

In [93]:
#tensors = torch.load('tensor.pt')

In [94]:
X_train[0][1].reshape(-1)[:512].reshape(1, -1)

tensor([[  101,  8112,  7327, 21197,  3550,  2032,  1010,  2096, 18520,  7207,
          6369,  1996, 27128,  1997,  1996,  2225,  3448,  7672,  5205,  2728,
         17845,  2016,  2170,  2014, 10779,  1012,  2076,  2010,  7327,  6483,
          1010,  2343, 13857,  8112,  5228,  2010, 17005,  2005,  1996,  1047,
         19658,  2882,  5202,  1998,  2101,  1010,  1996,  2149,  5205,  2728,
         17845,  1024,  1998,  2004,  1045,  8339,  2006,  1996,  2440, 11740,
          1997,  2010,  6227,  2086,  1010,  2009,  3849,  2000,  2033,  2008,
          2010,  2166,  6260,  2875,  3425,  1010,  8112,  2056,  1012,  2066,
          1996,  4552,  2002,  9332,  1999,  2010,  4979,  1010,  2066,  2256,
          3842,  2993,  1010,  2728, 17845,  8679,  2008, 21864, 17340,  5054,
         20925,  2137,  3737,  1010,  1998,  2008,  2003,  1037,  3977,  2000,
          2689,  1010,  1037,  3977,  2000,  4553,  1010,  1037,  3977,  2000,
          4952,  1010,  1037,  3977,  2000,  2022,  

In [100]:
nb_epochs = 1

total_loss = 0
loss_history = []

model.train()

for epoch in tqdm(range(nb_epochs)):
    
    # iterate through the datapoints
    for idx, text_tensor in enumerate(X_train):
        # set gradients of all optimizers to zero -> avoid accumulation
        model.zero_grad()
        
        # define a tensor for the output 
        output = torch.zeros((1, 2)).float().to(device)
        
        # iterate through each part of the text (each part is represented by a tensor)
        # and sum the output
        for i in len(text_tensor[idx]):
            input = text_tensor[idx].reshape(-1)[:512].reshape(1, -1)
            output += model(input, labels = y_train[idx])[1].float().to(device)

NameError: name 'model' is not defined

In [None]:
print_every = 300

total_loss = 0
all_losses = []

CUDA_LAUNCH_BLOCKING=1

model.train()

for idx, row in train_data.iterrows():
    text_parts = preprocess_text(str(row['text']))
    label = torch.tensor([row['is_fake']]).long().to(device)

    optimizer.zero_grad()

    overall_output = torch.zeros((1, 2)).float().to(device)
    for part in text_parts:
        if len(part) > 0:
            try:
                input = part.reshape(-1)[:512].reshape(1, -1)
                # print(input.shape)
                overall_output += model(input, labels=label)[1].float().to(device)
            except Exception as e:
                print(str(e))

#     overall_output /= len(text_parts)
    overall_output = F.softmax(overall_output[0], dim=-1)

    if label == 0:
        label = torch.tensor([1.0, 0.0]).float().to(device)
    elif label == 1:
        label = torch.tensor([0.0, 1.0]).float().to(device)

    # print(overall_output, label)

    loss = criterion(overall_output, label)
    total_loss += loss.item()
    
    loss.backward()
    optimizer.step()

    if idx % print_every == 0 and idx > 0:
        average_loss = total_loss / print_every
        print("{}/{}. Average loss: {}".format(idx, len(train_data), average_loss))
        all_losses.append(average_loss)
        total_loss = 0