In [51]:
import time
import gzip
import shutil

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import gensim
import torch
from torch import nn, optim
from torch.utils.data import Dataset,DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**Mount drive**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Read dataset

In [3]:
dataset = pd.read_csv("/content/drive/MyDrive/Datasets/dataset.csv")
dataset.sample(5)

Unnamed: 0,text,fake,clean_text,clean_text_without_freq_words
1363,VIDEO: Whole Troop of EMPD Officers Abuse & Th...,True,video whole troop empd officer abuse threaten ...,video whole troop empd officer abuse threaten ...
200,"Ramaphosa, ANC & Abdool Karim Want To Flatten ...",True,ramaphosa anc abdool karim want flatten econom...,ramaphosa anc abdool karim want flatten econom...
938,Sanef condemns robbery of Newzroom Afrika crew...,False,sanef condemns robbery newzroom afrika crew ea...,sanef condemns robbery newzroom afrika crew ea...
868,Former Chiefs midfielder Lucky Maselesele murd...,False,former chief midfielder lucky maselesele murde...,former chief midfielder lucky maselesele murde...
1254,'I lent him R5 000' - alleged victim of rape-a...,False,lent r5 000 alleged victim rapeaccused mpumala...,lent r5 000 alleged victim rapeaccused mpumala...


# Split the dataset

In [4]:
X_train,X_test,y_train,y_test = train_test_split(dataset['clean_text'],dataset['fake'],
                                                test_size=0.2, random_state=1)
print(f'X: Training set {X_train.shape}, Testing set {X_test.shape}')
print(f'y: Training set {y_train.shape}, Testing set {y_test.shape}')

X: Training set (1168,), Testing set (292,)
y: Training set (1168,), Testing set (292,)


#Vectorization: Word2Vec

## 1 Create preprocessed corpus(list of lists of n-grams)

In [28]:
# Create a list of lists of unigrams
def create_unigrams_list(corpus):
  corpus_lst = []

  for article in corpus:
    words_lst = article.split()
    grams_lst = [" ".join(words_lst[i:i+1]) for i in range(0, len(words_lst), 1)]
    corpus_lst.append(grams_lst)
  return corpus_lst

In [29]:
corpus_lst_train = create_unigrams_list(X_train.copy())
corpus_lst_test =  create_unigrams_list(X_test.copy())

## 2 Embedding model

In [30]:
# Embedding model
word2vec = gensim.models.word2vec.Word2Vec(corpus_lst_train, size=100, window=5, min_count=1, sg=1)

## 3 Vocabulary

In [31]:
# Vocab size
print('Vocab size:',len(word2vec.wv.vocab))

Vocab size: 26220


In [32]:
# Example
word2vec.wv.most_similar('police')

[('sap', 0.8775656223297119),
 ('station', 0.863582968711853),
 ('officer', 0.8578199148178101),
 ('brigadier', 0.8316795229911804),
 ('colonel', 0.8313194513320923),
 ('mbele', 0.8295994997024536),
 ('investigating', 0.8286892771720886),
 ('thembeka', 0.824602484703064),
 ('naidu', 0.8243770599365234),
 ('naidoo', 0.8192735314369202)]

# 1. Feature Engineering

## 1.1 Create datasets

In [5]:
# Custom Dataset
class NewsDataset(Dataset):
  def __init__(self, df_text, df_label):
    self.text = df_text
    self.label = df_label

  def __len__(self):
    return len(self.text)

  def __getitem__(self, idx):
    return  self.label.iloc[idx],self.text.iloc[idx]

In [6]:
train_dataset = NewsDataset(X_train, y_train)
test_dataset = NewsDataset(X_test, y_test)

## 1.2 Create Vocabulary

In [7]:
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
  for text in data_iter:
    yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(X_train.to_list()), specials=["<unk>"])
vocab.set_default_index(vocab['<unk>'])

In [8]:
words_index = vocab.get_stoi() 
for i, (word, num) in enumerate(words_index.items()):
  print(f'{word} => {num}')
  if i == 10:
    break

zwide => 26219
zuziwe => 26213
zungulain => 26212
zumazuma => 26211
zululand => 26209
zuckerburg => 26208
zoomed => 26206
ziphora => 26201
zindzi => 26196
zimlive => 26195
zimkhitha => 26194


## 1.3 Data processing pipelines

In [9]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) 

## 1.4 Create dataloaders

In [10]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_label, _text) in batch:
         label_list.append(label_pipeline(_label))
         processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(DEVICE), text_list.to(DEVICE), offsets.to(DEVICE)

In [11]:
train_dataloader = DataLoader(train_dataset, batch_size=5, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=5, shuffle=False, collate_fn=collate_batch)

## 1.5 Ensure things make sense

In [12]:
idx = 0

# Text
label,text = train_dataset[idx]
print(f'from: {text},\n len:{len(text.split())}')

# Tokens
labels, tokens,offsets = next(iter(train_dataloader))
print(f'to: {tokens[offsets[0].item():offsets[1].item()].cpu().tolist()}')

# Check
print(f'Check: {text.split()[0]} => idx in vocab: {vocab[text.split()[0]]}')

from: expropriate land western cape malema tell supporter eff leader julius malema told resident kayamandi stellenbosch party would expropriate land western cape come power malema kicked party election campaign province thursday 10 day resident expected cast voteskayamandi resident came number rallythe area plagued year problem unemployment lack housingresidents told malema issue faced daily basis saying toilet access residential waste collection service proper housing malema wasted time criticising anc da governance nationally provincially stellenbosch racist town want change scared eff change place better eff want everyone stand together one said malema added running water flushing toilet area lashed anc da supplying adequate toilet housing lived informal settlement animal dont running water flushing toilet da anc treat u like animal need electricity wait housing electricity people need electricity malema said 2016 municipal election da received majority 30 seat councilthese white fa

# Deep Learning Model

## 1 Create News classification models

In [88]:
class NewsClassificationModel(nn.Module):

  def __init__(self, vocab_size, embed_dim, num_class,weight):
    super(NewsClassificationModel, self).__init__()

    # self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
    self.embedding_bag = nn.EmbeddingBag.from_pretrained(weight, freeze=True)
    self.fc1 = nn.Linear(embed_dim,32)
    self.fc2 = nn.Linear(32,num_class)
    self.sig = nn.Sigmoid()
    self.relu = nn.ReLU()

    self.__init_weights()

  def __init_weights(self):
    initrange = 0.5
    # self.embedding.weight.data.uniform_(-initrange, initrange)
    self.fc1.weight.data.uniform_(-initrange, initrange)
    self.fc1.bias.data.zero_()
    self.fc2.weight.data.uniform_(-initrange, initrange)
    self.fc2.bias.data.zero_()

  def forward(self, text, offsets):
    # embedded = self.embedding(text, offsets)
    embedded = self.embedding_bag(text, offsets)

    # Last linear layers
    out = self.fc1(embedded)
    out = self.relu(out)
    out = self.fc2(out)

    return self.sig(out)

## 2 Training loop

In [14]:
def train_model(model,dataloader,curr_epoch,criterion,optimizer,print_freq = 2):
  model.train()
  total_acc, total_count = 0,0

  for idx, (label, text, offsets) in enumerate(dataloader):
    optimizer.zero_grad()
    probs = model(text, offsets)

    preds = torch.round(probs)
    label = torch.unsqueeze(label, 1)
    label = label.to(torch.float)

    loss = criterion(preds, label)

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), 0.1)
    optimizer.step()

    total_acc += (preds == label).sum().item()
    total_count += label.size(0)

    if idx % print_freq == 0:
      print(f'epoch {curr_epoch} | {idx}/{len(dataloader)} batches | accuracy {total_acc/total_count}')
      total_acc, total_count = 0,0


## 3 Evaluation Loop

In [15]:
def eval_model(model, dataloader, criterion):
  model.eval()

  total_acc, total_count = 0, 0

  with torch.no_grad():
    for idx, (label, text, offsets) in enumerate(dataloader):
      probs = model(text, offsets)

      preds = torch.round(probs)
      label = torch.unsqueeze(label, 1)
      label = label.to(torch.float)

      loss = criterion(preds, label)
      total_acc += (preds == label).sum().item()
      total_count += label.size(0)

  return total_acc/total_count

## 4 Run the model

In [16]:
def run(model, num_epochs, dataloader, criterion, optimizer, scheduler):
  tot_start_time = time.time()
  total_acc = None

  for epoch in range(1, num_epochs+1):
    epoch_start_time = time.time()
    
    # Training
    train_model(model, dataloader['train'], epoch, criterion, optimizer )

    # Validation
    acc_val = eval_model(model, dataloader['valid'], criterion)

    if total_acc is not None and total_acc > acc_val:
      scheduler.step()
    else:
      total_acc = acc_val

    print('-'*60)
    print(f'end of epoch {epoch} | time:{time.time() - epoch_start_time}s | valid accuracy {acc_val}')
    print('-'*60)
  
  print(f'Elapsed time {time.time() - tot_start_time}')

## 5 Training

### 5.1 Initialise

In [89]:
hyper_params = {
    'epochs': 15,
    'batch-size': 32,
    'learning-rate':5,
    'num-classes':1,
    'embedding-size': 100
}

# word2vec weight matrix
word2vec_weights = torch.FloatTensor(word2vec.wv.vectors)

# Model 
vocab_size = len(vocab)
news_model = NewsClassificationModel(vocab_size, hyper_params['embedding-size'], hyper_params['num-classes'], word2vec_weights ).to(DEVICE)
print(news_model)

# criterion
criterion = nn.BCELoss()

# Optimizer 
optimizer = optim.SGD(news_model.parameters(), lr=hyper_params['learning-rate'])

# Scheduler
scheduler = optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

NewsClassificationModel(
  (embedding_bag): EmbeddingBag(26220, 100, mode=mean)
  (fc1): Linear(in_features=100, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=1, bias=True)
  (sig): Sigmoid()
  (relu): ReLU()
)


### 5.2 Datasets

In [94]:
num_train = int(len(train_dataset) * 0.75)
split_train_, split_valid_ = random_split(train_dataset, [num_train, len(train_dataset)-num_train])
print(f'Training dataset {len(split_train_)}')
print(f'Validation dataset {len(split_valid_)}')
print(f'Testing dataset {len(test_dataset)}')

train_dataloader = DataLoader(split_train_, batch_size=hyper_params['batch-size'], shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=hyper_params['batch-size'], shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=hyper_params['batch-size'], shuffle=False, collate_fn=collate_batch)

dataloader = {
    'train': train_dataloader,
    'valid': valid_dataloader
}

Training dataset 876
Validation dataset 292
Testing dataset 292


### 5.3 Train the model

In [49]:
run(news_model, hyper_params['epochs'], dataloader, criterion, optimizer, scheduler)

epoch 1 | 0/14 batches | accuracy 0.484375
epoch 1 | 2/14 batches | accuracy 0.4921875
epoch 1 | 4/14 batches | accuracy 0.4453125
epoch 1 | 6/14 batches | accuracy 0.4375
epoch 1 | 8/14 batches | accuracy 0.515625
epoch 1 | 10/14 batches | accuracy 0.53125
epoch 1 | 12/14 batches | accuracy 0.5
------------------------------------------------------------
end of epoch 1 | time:0.2717909812927246s | valid accuracy 0.4965753424657534
------------------------------------------------------------
epoch 2 | 0/14 batches | accuracy 0.53125
epoch 2 | 2/14 batches | accuracy 0.515625
epoch 2 | 4/14 batches | accuracy 0.5234375
epoch 2 | 6/14 batches | accuracy 0.5
epoch 2 | 8/14 batches | accuracy 0.4609375
epoch 2 | 10/14 batches | accuracy 0.5234375
epoch 2 | 12/14 batches | accuracy 0.4140625
------------------------------------------------------------
end of epoch 2 | time:0.2524247169494629s | valid accuracy 0.4965753424657534
------------------------------------------------------------
ep

## 6 Inference

In [95]:
print('Check results of the test dataset')
acc_test = eval_model(news_model,test_dataloader, criterion)
print(f'Test accuracy {acc_test}')

Check results of the test dataset
Test accuracy 0.4863013698630137


# Deep Learning Model(pretrained)

## 1 Pretrained Word2Vec vectors

In [79]:
from gensim.test.utils import datapath

pretrained_word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Datasets/GoogleNews-vectors-negative300.bin', binary=True)

## 2 Training

### 2.1 Initialise

In [91]:
pre_hyper_params = {
    'epochs': 10,
    'batch-size': 32,
    'learning-rate':5,
    'num-classes':1,
    'embedding-size': 300
}

# Vocabulary size
pre_vocab_size = len(pretrained_word2vec_model.vocab)

# word2vec weight matrix
pre_word2vec_weights = torch.FloatTensor(pretrained_word2vec_model.vectors)

# Classifier Model
pre_news_model = NewsClassificationModel(pre_vocab_size, pre_hyper_params['embedding-size'], pre_hyper_params['num-classes'], pre_word2vec_weights ).to(DEVICE)
print(pre_news_model)

# Optimizer
pre_optimizer = optim.SGD(pre_news_model.parameters(), lr=pre_hyper_params['learning-rate'])

# Scheduler
pre_scheduler = optim.lr_scheduler.StepLR(pre_optimizer, 1.0, gamma=0.1)

NewsClassificationModel(
  (embedding_bag): EmbeddingBag(3000000, 300, mode=mean)
  (fc1): Linear(in_features=300, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=1, bias=True)
  (sig): Sigmoid()
  (relu): ReLU()
)


### 2.2 Datasets

In [96]:
train_dataloader = DataLoader(split_train_, batch_size=pre_hyper_params['batch-size'], shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=pre_hyper_params['batch-size'], shuffle=True, collate_fn=collate_batch)
pre_test_dataloader = DataLoader(test_dataset, batch_size=pre_hyper_params['batch-size'], shuffle=False, collate_fn=collate_batch)

pre_dataloader = {
    'train': train_dataloader,
    'valid': valid_dataloader
}

### 2.3 Train the model

In [93]:
run(pre_news_model, hyper_params['epochs'], pre_dataloader, criterion, pre_optimizer, pre_scheduler)

epoch 1 | 0/28 batches | accuracy 0.4375
epoch 1 | 2/28 batches | accuracy 0.390625
epoch 1 | 4/28 batches | accuracy 0.5
epoch 1 | 6/28 batches | accuracy 0.546875
epoch 1 | 8/28 batches | accuracy 0.5
epoch 1 | 10/28 batches | accuracy 0.359375
epoch 1 | 12/28 batches | accuracy 0.5625
epoch 1 | 14/28 batches | accuracy 0.46875
epoch 1 | 16/28 batches | accuracy 0.390625
epoch 1 | 18/28 batches | accuracy 0.53125
epoch 1 | 20/28 batches | accuracy 0.453125
epoch 1 | 22/28 batches | accuracy 0.46875
epoch 1 | 24/28 batches | accuracy 0.4375
epoch 1 | 26/28 batches | accuracy 0.53125
------------------------------------------------------------
end of epoch 1 | time:0.29987168312072754s | valid accuracy 0.4589041095890411
------------------------------------------------------------
epoch 2 | 0/28 batches | accuracy 0.4375
epoch 2 | 2/28 batches | accuracy 0.515625
epoch 2 | 4/28 batches | accuracy 0.46875
epoch 2 | 6/28 batches | accuracy 0.53125
epoch 2 | 8/28 batches | accuracy 0.4843

## 3 Inference

In [97]:
print('Check results of the test dataset')
pre_acc_test = eval_model(pre_news_model,pre_test_dataloader, criterion)
print(f'Test accuracy {pre_acc_test}')

Check results of the test dataset
Test accuracy 0.4589041095890411
