In [16]:
import pandas as pd
import numpy as np

from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
import torch.optim as optim

# for convolutoin functions
import torch.nn.functional as F

In [3]:
news_df = pd.read_csv('data/fake-and-real-news-dataset/combined.csv')
news_df.head()

Unnamed: 0,title,text,subject,date,label
0,WATCH: Six Minutes Of Conservative Media’s Se...,It s no secret that conservatives and Republic...,News,"August 2, 2016",fake
1,Sanders: Firms must take 'haircut' in Puerto R...,WASHINGTON (Reuters) - Wall Street investment ...,politicsNews,"April 1, 2016",real
2,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"November 29, 2016",real
3,CNBC EDITOR: Media Must Remember Readers Are N...,A CNBC editor said members of the press need t...,left-news,"Jun 29, 2017",fake
4,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,Remember when these Turkish thugs beat up (see...,politics,"Sep 22, 2017",fake


In [10]:
news_df["full_text"] = news_df["title"] + ": " + news_df["text"] 
news_df["fake"] = news_df["label"].apply(lambda x: True if x == 'real' else False)
news_df.head()

Unnamed: 0,title,text,subject,date,label,full_text,fake
0,WATCH: Six Minutes Of Conservative Media’s Se...,It s no secret that conservatives and Republic...,News,"August 2, 2016",fake,WATCH: Six Minutes Of Conservative Media’s Se...,False
1,Sanders: Firms must take 'haircut' in Puerto R...,WASHINGTON (Reuters) - Wall Street investment ...,politicsNews,"April 1, 2016",real,Sanders: Firms must take 'haircut' in Puerto R...,True
2,Factbox: Trump fills top jobs for his administ...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"November 29, 2016",real,Factbox: Trump fills top jobs for his administ...,True
3,CNBC EDITOR: Media Must Remember Readers Are N...,A CNBC editor said members of the press need t...,left-news,"Jun 29, 2017",fake,CNBC EDITOR: Media Must Remember Readers Are N...,False
4,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,Remember when these Turkish thugs beat up (see...,politics,"Sep 22, 2017",fake,NYC: Turkish Thugs Beat Up Protesters…Deny Fre...,False


In [14]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(news_df.full_text.values, news_df.fake.values, test_size=0.4, 
                                                    random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=1)

In [21]:
class BERTClassifier():
    
    def __init__(self):
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.tokenizer = None
        self.model = None
        self.criterion = None
        self.optimizer = None
        
    def loadModel(self, pretrained_name = "bert-base-uncased"):
        
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_name)
        self.model = BertForSequenceClassification.from_pretrained(pretrained_name)
        self.model.config.num_labels = 1
        
        # Freeze the pre trained parameters
        for param in self.model.parameters():
            param.requires_grad = False
            
    def defineLossAndOptimizer(self):
        self.criterion = nn.MSELoss().to(self.device)
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01)
        
    def addLayers(self, layers):
        modules = []
        
        for layer in layers:
            modules.append(layer)
        
        self.model.classifier = nn.Sequential(*modules)
        self.model = self.model.to(self.device)

In [22]:
bert = BERTClassifier()

In [23]:
bert.loadModel()

layers = [nn.Linear(768, 256),
    nn.ReLU(),
    nn.Linear(256, 64),
    nn.ReLU(),
    nn.Linear(64, 2),
    nn.Softmax(dim=1)]

bert.addLayers(layers)

In [24]:
bert.model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [37]:
def preprocess_text(text):
    parts = []

    text_len = len(text.split(' '))
    delta = 300
    max_parts = 5
    nb_cuts = int(text_len / delta)
    nb_cuts = min(nb_cuts, max_parts)
    
    print(nb_cuts)
    
    for i in range(nb_cuts + 1):
        text_part = ' '.join(text.split(' ')[i * delta: (i + 1) * delta])
        print(text_part)
        parts.append(tokenizer.encode(text_part, return_tensors="pt", max_length=500).to(device))

    return parts

In [38]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [39]:
test = "want to know what this does"
preprocess_text(test)

0
want to know what this does


[tensor([[ 101, 2215, 2000, 2113, 2054, 2023, 2515,  102]])]