<a href="https://colab.research.google.com/github/tanat1994/Fake-Real-news-classification/blob/main/fake_real_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
! pip install -q kaggle

## Kaggle download dataset

In [None]:
from google.colab import files
files.upload()

In [3]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets list

ref                                                             title                                                size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  --------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
meirnizri/covid19-dataset                                       COVID-19 Dataset                                      5MB  2022-11-13 15:47:17           5469        165  1.0              
madhurpant/world-deaths-and-causes-1990-2019                    World Deaths and Causes (1990 - 2019)               442KB  2022-11-29 07:09:27            842         23  1.0              
thedevastator/jobs-dataset-from-glassdoor                       Salary Prediction                                     3MB  2022-11-16 13:52:31           3616         84  1.0              
thedevastator/how-much-sleep-do-americans-really-get        

In [4]:
# ! kaggle datasets list
!kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset

Downloading fake-and-real-news-dataset.zip to /content
 95% 39.0M/41.0M [00:02<00:00, 22.1MB/s]
100% 41.0M/41.0M [00:03<00:00, 14.2MB/s]


In [5]:
!unzip fake-and-real-news-dataset.zip

Archive:  fake-and-real-news-dataset.zip
  inflating: Fake.csv                
  inflating: True.csv                


## Implementation

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

In [7]:
fake_df = pd.read_csv('./Fake.csv')
true_df = pd.read_csv('./True.csv')

In [8]:
fake_df['label'] = 1
true_df['label'] = 0

In [9]:
fake_df.shape, true_df.shape

((23481, 5), (21417, 5))

In [10]:
news_df = pd.concat([fake_df, true_df], axis=0)

In [11]:
news_df['label'].value_counts()

1    23481
0    21417
Name: label, dtype: int64

In [12]:
news_df = news_df.drop(columns=['subject', 'date'], axis=1)

In [13]:
import transformers
from tqdm.notebook import tqdm
from transformers import BertTokenizer, TFBertModel

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertModel.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [16]:
def bert_encode(data, max_length):
  input_ids = []
  attention_masks = []

  for i, text in tqdm(enumerate(data['title'])):
    encoded = tokenizer(text,
                        add_special_tokens=True,
                        max_length=max_length,
                        padding='max_length',
                        return_attention_mask=True,
                        return_tensors='tf')
    input_ids.append(encoded['input_ids'][0])
    attention_masks.append(encoded['attention_mask'][0])
  return np.asarray(input_ids, dtype='int32'), np.asarray(attention_masks, dtype='int32')

In [17]:
news_df.sample()

Unnamed: 0,title,text,label
7758,Trump's 'bad hombres' and 'nasty woman' remark...,NEW YORK (Reuters) - U.S. Republican president...,0


In [18]:
from sklearn.model_selection import train_test_split
X = news_df.drop(columns=['text', 'label'], axis=1).reset_index()
y = news_df['label']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [20]:
max_length = 128
input_ids, attention_masks = bert_encode(X_train, max_length)

0it [00:00, ?it/s]

In [21]:
print(input_ids.shape)
print(attention_masks.shape)

(35918, 128)
(35918, 128)


In [22]:
def create_model(bert_model):
  input_ids = Input(shape=(128,), dtype='int32', name='input_ids')
  attention_masks = Input(shape=(128,), dtype='int32', name='attention_masks')

  output = bert_model([input_ids, attention_masks])
  output = output[1] # pooled output
  
  output = Dense(32, activation='relu')(output)
  output = tf.keras.layers.Dropout(0.2, name='dropout')(output)

  output = Dense(1, activation='sigmoid')(output)
  model = Model(inputs=[input_ids, attention_masks], outputs=output)
  model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [23]:
bert_model = create_model(model)
bert_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_masks (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_masks[0][0]']        
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                           

In [24]:
history = bert_model.fit([input_ids, attention_masks], y_train, validation_split=0.2, epochs=2, batch_size=10)

Epoch 1/2
Epoch 2/2


In [33]:
# process test data
t_input_ids, t_attention_masks = bert_encode(X_test, 128)

0it [00:00, ?it/s]

In [35]:
result = bert_model.predict([t_input_ids, t_attention_masks])

[[9.9999928e-01]
 [1.0583095e-06]
 [7.9637135e-07]
 ...
 [9.9999940e-01]
 [9.9999928e-01]
 [7.7052289e-07]]


In [37]:
result = np.round(result).astype(int)

(8980, 1)

In [58]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
score = accuracy_score(y_test, result.ravel().reshape(-1, 1))

In [54]:
y_test[:20]

2349     1
4958     0
17343    0
15166    0
36       1
8767     0
11871    0
8767     1
1361     0
19530    0
3721     1
3345     1
15587    0
5855     0
5450     0
1592     1
10938    0
19106    1
7957     0
4026     1
Name: label, dtype: int64

In [51]:
result[:20].ravel().reshape(-1)

array([1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1])

In [56]:
classification_report(y_test, result)

'              precision    recall  f1-score   support\n\n           0       1.00      1.00      1.00      4284\n           1       1.00      1.00      1.00      4696\n\n    accuracy                           1.00      8980\n   macro avg       1.00      1.00      1.00      8980\nweighted avg       1.00      1.00      1.00      8980\n'

In [59]:
tn, fp, fn, tp = confusion_matrix(y_test, result).ravel()

In [60]:
print(f'tp = {tp}, fp = {fp}')
print(f'fn = {fn}, tn = {tn}')

tp = 4695, fp = 1
fn = 1, tn = 4283
