In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim

# for convolution functions
import torch.nn.functional as F

In [7]:
news_df = pd.read_csv('data/fake-and-real-news-dataset/combined.csv')
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,WATCH: Six Minutes Of Conservative Media’s Se...,It s no secret that conservatives and Republic...,News,"August 2, 2016",fake
1,Sanders: Firms must take 'haircut' in Puerto R...,WASHINGTON (Reuters) - Wall Street investment ...,politicsNews,"April 1, 2016",real
2,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"November 29, 2016",real
3,CNBC EDITOR: Media Must Remember Readers Are N...,A CNBC editor said members of the press need t...,left-news,"Jun 29, 2017",fake
4,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,Remember when these Turkish thugs beat up (see...,politics,"Sep 22, 2017",fake


In [8]:
news_df["text"] = news_df["title"] + ": " + news_df["text"] 
news_df["fake"] = news_df["label"].apply(lambda x: True if x == 'real' else False)
news_df.head()

Unnamed: 0,title,text,subject,date,label,fake
0,WATCH: Six Minutes Of Conservative Media’s Se...,WATCH: Six Minutes Of Conservative Media’s Se...,News,"August 2, 2016",fake,False
1,Sanders: Firms must take 'haircut' in Puerto R...,Sanders: Firms must take 'haircut' in Puerto R...,politicsNews,"April 1, 2016",real,True
2,Factbox: Trump fills top jobs for his administ...,Factbox: Trump fills top jobs for his administ...,politicsNews,"November 29, 2016",real,True
3,CNBC EDITOR: Media Must Remember Readers Are N...,CNBC EDITOR: Media Must Remember Readers Are N...,left-news,"Jun 29, 2017",fake,False
4,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,politics,"Sep 22, 2017",fake,False


In [131]:
class BERTSequenceClassifier():
    
    def __init__(self, pretrained_name = "bert-base-uncased"):
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.criterion = None
        self.optimizer = None
        
        # https://huggingface.co/transformers/model_doc/bert.html#berttokenizer
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_name)
        
        # https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification
        self.model = BertForSequenceClassification.from_pretrained(pretrained_name)
        self.model.config.num_labels = 1
        
        # Freeze the pre trained parameters
        for param in self.model.parameters():
            param.requires_grad = False
            
    def defineLossAndOptimizer(self):
        self.criterion = nn.MSELoss().to(self.device)
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)
        
    def addLayers(self, layers):
        modules = []
        
        for layer in layers:
            modules.append(layer)
        
        self.model.classifier = nn.Sequential(*modules)
        self.model = self.model.to(self.device)
        
    def preprocess_text_samples(self, samples, max_seq_length = 300):
    
        '''
        Adapted from https://www.kaggle.com/clmentbisaillon/classifying-fake-news-with-bert/notebook
        '''

        encoded_samples = []
        
        for idx, sample in tqdm(samples.iterrows(), total = samples.shape[0]):
            encoded_text = []
            words = sample.text.strip().split(' ')
            nb_seqs = int(len(words)/max_seq_length)

            for i in range(nb_seqs+1):
                words_part = ' '.join(words[i*max_seq_length : (i+1)*max_seq_length])

                try:
                    # https://huggingface.co/transformers/main_classes/tokenizer.html#pretrainedtokenizer
                    # encoding using BERT pretrained tokeinizer and converts to pytorch tensors
                    encoded_text.append(self.tokenizer.encode(words_part, return_tensors="pt", 
                                                         max_length = 500, device = self.device))
                except:
                    print("Issue at: " +str(idx))
                    raise

            encoded_samples.append(encoded_text)

        return encoded_samples
    
    def train_model():
        pass
    
    def test_model():
        pass
    
    def predict():
        pass

In [132]:
bert = BERTSequenceClassifier()

In [133]:
layers = [nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, 64),
    nn.ReLU(),
    nn.Linear(64, 2),
    nn.Softmax(dim=1)]

bert.addLayers(layers)

# Still in progress / testing

In [186]:
nb_samples = 100

tensor_list = bert.preprocess_text_samples(news_df[:nb_samples])
tensor_labels = news_df.fake[:nb_samples].apply(lambda x: torch.tensor([x]).long().to(device)).to_list()
#tensor_labels = news_df.fake[:nb_samples].apply(lambda x: torch.tensor([1.0, 0.0]).float().to(device) if x == 0 else torch.tensor([0.0, 1.0]).float().to(device))

100%|██████████| 100/100 [00:01<00:00, 91.57it/s]


In [187]:
torch.save(tensor_list, 'data/tensor.pt')
torch.save(tensor_labels, 'data/tensor_labels.pt')

In [188]:
#tensor_list = torch.load('data/tensor.pt')
#tensor_labels = torch.load('data/tensor_labels.pt')

In [189]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(tensor_list, tensor_labels, test_size=0.4, 
                                                    random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=1)

In [194]:
nb_epochs = 1
loss_history = []
log_freq = 100

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
criterion = nn.MSELoss().to(device)
optimizer = optim.SGD(bert.model.parameters(), lr=0.01, momentum=0.9)

bert.model.train()

for epoch in tqdm(range(nb_epochs)):
    
    total_loss = 0
    
    # iterate through the datapoints
    for idx, text_tensor in enumerate(X_train):
        # set gradients of all optimizers to zero -> avoid accumulation
        bert.model.zero_grad()
        
        # define a tensor for the output 
        output = torch.zeros((1, 2)).float().to(device)
        
        # iterate through each part of the text (each part is represented by a tensor)
        # and obtain the average of the outputs
        for i in range(len(text_tensor)):
            input = text_tensor[i]
            output += bert.model(input, labels = y_train[idx])[1].float().to(device)/len(text_tensor)
        
        label = torch.tensor([1.0, 0.0]).float().to(device) if y_train[idx] == 0 else torch.tensor([0.0, 1.0]).float().to(device)
        loss = criterion(output[0], label)
        total_loss += loss.item()
        
        # backpropagate
        loss.backward()
        optimizer.step()
        
    print("Epoch ({}/{}): loss = {}".format(epoch, nb_epochs, total_loss))
    #print("\t Training accuracy: ", )
    #print("\t Validation accuracy: ", )
    #loss_history.append(total_loss)    
            
# save weights after training
torch.save(bert.state_dict(), "model_after_train.pt")



  0%|          | 0/1 [00:00<?, ?it/s][A[A

100%|██████████| 1/1 [01:46<00:00, 106.29s/it][A[A


NameError: name 'model' is not defined

In [197]:
# save weights after training
torch.save(bert.model.state_dict(), "weights/model_after_train.pt")