## Importing the relevant libraries

In [None]:
!pip install wget --quiet


In [None]:
!pip install tensorflow==2.10.0 --quiet

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings, wget
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Input, Dropout, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler, ModelCheckpoint
from tensorflow.keras.utils import plot_model
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from wordcloud import WordCloud
wget.download("https://raw.githubusercontent.com/yogawicaksana/helper_prabowo/main/helper_prabowo_ml.py",out="helper_prabowo_ml.py")
from helper_prabowo_ml import clean_html, punct, remove_digits, remove_links, remove_special_characters, remove_, removeStopWords, lower, email_address, non_ascii
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

2025-04-02 09:44:51.029912: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-02 09:44:51.267501: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-04-02 09:44:51.267527: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2025-04-02 09:44:51.308626: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-02 09:44:52.392945: W tensorflow/stream_executor/platform/de

## Loading the train, test and evaluation datasets

In [None]:
train_df = pd.read_csv("data/train (2).csv",delimiter=';')
train_df.drop('Unnamed: 0',axis=1,inplace=True)
train_df.head(5)

In [None]:
train_df.shape

In [None]:
train_df.info()

In [None]:
train_df.describe()

In [None]:
train_df.isna().sum()

In [None]:
train_df.duplicated().sum()

In [None]:
train_df = train_df.drop_duplicates()
train_df.shape

In [None]:
test_df = pd.read_csv('data/test (1).csv',delimiter=';')
test_df.drop('Unnamed: 0',axis=1,inplace=True)
test_df.head(3)

In [None]:
test_df.shape

In [None]:
test_df.isnull().sum()

In [None]:
test_df.duplicated().sum()

In [None]:
evaluation_df = pd.read_csv("data/evaluation.csv",delimiter=';')
evaluation_df.drop('Unnamed: 0',axis=1,inplace=True)
evaluation_df.head()

In [None]:
evaluation_df.shape

In [None]:
evaluation_df.isna().sum()

In [None]:
evaluation_df.duplicated().sum()

## Data Exploration

In [None]:
train_df['num_words'] = train_df.text.apply(len)
train_df.num_words.describe()

In [None]:
plt.figure(figsize=(10,5))
sns.distplot(train_df['num_words']);

In [5]:
max_len = 120

In [None]:
plt.figure(figsize=(12,6))
wc = WordCloud(width=600,height=300,random_state=101).generate(' '.join(train_df.text))
plt.imshow(wc)
plt.title('Word Cloud of News Text',fontsize=25,pad=20,fontweight='bold',color='sienna')
plt.show();

In [None]:
plt.figure(figsize=(10,5))
sns.countplot(x='label',data=train_df);

The target fake news label is relatively balanced.

## Text Preprocessing

In [None]:
def preprocess_text(data,col):
    data[col] = data[col].apply(func=lower)
    data[col] = data[col].apply(func=clean_html)
    data[col] = data[col].apply(func=non_ascii)
    data[col] = data[col].apply(func=email_address)
    data[col] = data[col].apply(func=punct)
    data[col] = data[col].apply(func=removeStopWords)
    data[col] = data[col].apply(func=remove_)
    data[col] = data[col].apply(func=remove_digits)
    data[col] = data[col].apply(func=remove_links)
    data[col] = data[col].apply(func=remove_special_characters)
    return data
    

In [None]:
preprocessed_train = preprocess_text(train_df,'text')
preprocessed_test = preprocess_text(test_df,'text')
preprocessed_eval = preprocess_text(evaluation_df,'text')

## Downloading pretrained Tokenizer and BERT model from Hugging Face

In [5]:
model_name = 'mrm8488/bert-tiny-finetuned-fake-news-detection'
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert = TFAutoModelForSequenceClassification.from_pretrained(model_name,from_pt=True)

2025-04-02 08:05:52.362831: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2025-04-02 08:05:52.364488: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2025-04-02 08:05:52.364552: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (artefact-de-vm-wissem0072): /proc/driver/nvidia/version does not exist
2025-04-02 08:05:52.367576: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSeque

## Performing text tokenization

In [None]:
X_train = tokenizer(text=preprocessed_train.text.tolist(),
                   max_length=max_len,
                   padding=True,
                   truncation=True,
                   add_special_tokens=True,
                   return_token_type_ids=False,
                   return_attention_mask=True,
                   return_tensors='tf',
                   verbose=1)

X_test = tokenizer(text=preprocessed_test.text.tolist(),
                  max_length=max_len,
                  padding=True,
                  truncation=True,
                  add_special_tokens=True,
                  return_token_type_ids=False,
                  return_attention_mask=True,
                  return_tensors='tf',
                  verbose=1)

X_eval = tokenizer(text=preprocessed_eval.text.tolist(),
                  max_length=max_len,
                  padding=True,
                  truncation=True,
                  add_special_tokens=True,
                  return_token_type_ids=False,
                  return_attention_mask=True,
                  return_tensors='tf',
                  verbose=1)



In [None]:
print(X_train)


## Encoding the target label classes

In [None]:
scaler = LabelEncoder()
train_df.label = scaler.fit_transform(train_df.label)
test_df.label = scaler.transform(test_df.label)
evaluation_df.label = scaler.transform(evaluation_df.label)

## Model Training & Evaluation

In [None]:
input_ids = Input(shape=(max_len,),dtype=tf.int32,name='input_ids')
attention_mask = Input(shape=(max_len,),dtype=tf.int32,name='attention_mask')

In [None]:
embeddings = bert(input_ids,attention_mask=attention_mask)[0] # 0 -> final hidden state, 1 -> pooling output
output = Flatten()(embeddings)
output = Dense(units=512,activation='relu')(output)
output = BatchNormalization()(output)
output = Dropout(0.3)(output)
output = Dense(units=256,activation='relu')(output)
output = BatchNormalization()(output)
output = Dropout(0.2)(output)
output = Dense(units=128,activation='relu')(output)
output = BatchNormalization()(output)
output = Dropout(0.15)(output)
output = Dense(units=1,activation='sigmoid')(output)
model = Model(inputs=[input_ids,attention_mask],outputs=output)
model.layers[2].trainable = False
model.summary()

In [None]:
plot_model(model,to_file='model.png',show_shapes=True,dpi=100)

In [None]:
optimizer = Adam(learning_rate=4e-5,epsilon=1e-7,decay=1e-2,clipnorm=1.0)
model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])

In [None]:
es = EarlyStopping(monitor='val_accuracy',mode='max',patience=10,restore_best_weights=True,verbose=1)
lrs = LearningRateScheduler(step_decay,verbose=1)
mc = ModelCheckpoint(filepath='fake_news_classifier.keras',monitor='val_accuracy',save_best_only=True,mode='max',save_freq='epoch',verbose=1)

r = model.fit(x={'input_ids': X_train['input_ids'], 'attention_mask': X_train['attention_mask']},
             y=train_df.label,
             epochs=20,
             batch_size=256,
             callbacks=[es,lrs,mc],
             validation_data=({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']},test_df.label)
             )

In [None]:
def step_decay(epoch, lr):
    drop_rate = 0.5
    step_size = 10
    return lr * drop_rate ** (epoch // step_size)

## Visualizing model performance

In [None]:
plt.plot(r.history['loss'],'r',label='train loss')
plt.plot(r.history['val_loss'],'b',label='test loss')
plt.xlabel('Number of Epochs')
plt.ylabel('Binary Crossentropy Loss')
plt.title('Loss Graph')
plt.legend()
plt.show();

In [None]:
plt.plot(r.history['accuracy'],'r',label='train accuracy')
plt.plot(r.history['val_accuracy'],'b',label='test accuracy')
plt.xlabel('Number of Epochs')
plt.ylabel('Accuracy')
plt.title('Accuracy Graph')
plt.legend()
plt.show();

## Evaluating model on the test dataset

In [None]:
loss, acc = model.evaluate({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']},test_df.label)
print("Test Binary Crossentropy Loss:", round(loss,2))
print("Test Accuracy:", round(acc*100,2))

In [None]:
test_predictions = model.predict({'input_ids': X_test['input_ids'], 'attention_mask': X_test['attention_mask']})
test_predictions = (test_predictions > 0.5).astype('float')
print("Confusion Matrix:")
print(confusion_matrix(test_df.label,test_predictions))
print("Classification Report:")
print(classification_report(test_df.label,test_predictions))
ConfusionMatrixDisplay.from_predictions(test_df.label,test_predictions)
plt.show()
PrecisionRecallDisplay.from_predictions(test_df.label,test_predictions)
plt.show()
RocCurveDisplay.from_predictions(test_df.label,test_predictions)
plt.show()

## Model Inference

In [None]:
test_predictions = model.predict({'input_ids': X_eval['input_ids'], 'attention_mask': X_eval['attention_mask']})
test_predictions = (test_predictions > 0.5).astype('float')
print("Confusion Matrix:")
print(confusion_matrix(evaluation_df.label,test_predictions))
print("Classification Report:")
print(classification_report(evaluation_df.label,test_predictions))
ConfusionMatrixDisplay.from_predictions(evaluation_df.label,test_predictions)
plt.show()
PrecisionRecallDisplay.from_predictions(evaluation_df.label,test_predictions)
plt.show()
RocCurveDisplay.from_predictions(evaluation_df.label,test_predictions)
plt.show()

Bravo! The model has performed incredibly well by achieving an astonishing accuracy of more than 97% on the evaluation holdout set.

Don't forget to upvote my notebook if you like it! If you have any feedback, kindly share it in the comments section below.

In [None]:
tokenizer.save_pretrained(save_directory='tokenizer/') # Saving the pretrained tokenizer

In [None]:
The error occurs because the directory `tokenizer/` does not contain the necessary configuration files (`config.json`) or the correct model files required to load a model using `TFAutoModelForSequenceClassification`. The `tokenizer.save_pretrained()` method only saves the tokenizer, not the model itself.

To load the tokenizer, you can use the `from_pretrained` method of the `AutoTokenizer` class. Here's how you can do it:

To load the tokenizer:
1. If you have saved the tokenizer in a directory (e.g., `tokenizer/`), you can load it as follows:
    ```python
    tokenizer = AutoTokenizer.from_pretrained('tokenizer/')
    ```

2. If you want to load it directly from the model name used earlier:
    ```python
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    ```


In [None]:
bert.save_pretrained(save_directory='model/')

In [16]:
loaded_model = TFAutoModelForSequenceClassification.from_pretrained('model/')


Some layers from the model checkpoint at model/ were not used when initializing TFBertForSequenceClassification: ['dropout_7']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
def predict_text(text):
    tokenized_input = tokenizer(text=text,
                                max_length=max_len,
                                padding=True,
                                truncation=True,
                                add_special_tokens=True,
                                return_token_type_ids=False,
                                return_attention_mask=True,
                                return_tensors='tf',
                                verbose=1)
    prediction = loaded_model.predict({'input_ids': tokenized_input['input_ids'], 'attention_mask': tokenized_input['attention_mask']})
    return (prediction.logits > 0.5).astype('float')
prediction_final = predict_text("SACRAMENTO, Calif. (Reuters) - California Governor Jerry Brown on Thursday nominated U.S. Congressman Xavier Becerra to be attorney general, a high-powered pick that signals the state‚Äôs readiness to defend its progressive policies as Republicans consolidate power in Washington.")
print(prediction_final)   

[[1. 0.]]


In [38]:
import numpy as np
import tensorflow as tf

def predict_text(text, threshold=0.5):
    """
    Prédiction sur un texte donné avec le modèle NLP chargé.
    :param text: Texte à analyser
    :param threshold: Seuil de classification (0.5 par défaut)
    :return: Score de probabilité et classe (Fake News ou Non)
    """
    # Tokenisation
    tokenized_input = tokenizer(text=text,
                                max_length=max_len,
                                padding="max_length",
                                truncation=True,
                                return_tensors='tf')

    # Prédiction
    prediction = loaded_model(tokenized_input)

    # Application de la fonction sigmoid (si nécessaire)
    scores = tf.nn.sigmoid(prediction.logits).numpy()

    # Conversion en classe binaire
    predicted_class = (scores > threshold).astype(int)

    return {"score": scores[0][0], "prediction": "FAKE NEWS" if predicted_class[0][0] == 0 else "REAL NEWS"}

# Exemple d'utilisation
text_example = "WASHINGTON (Reuters) - U.S. Director of National Intelligence Dan Coats declined to say whether President Donald Trump asked him to help deny any collusion between his campaign team and Russia, as reported by the Washington Post, saying his talks with Trump were private. ‚ÄúOn this topic, as well as other topics, I don‚Äôt feel it‚Äôs appropriate to characterize discussions and conversations with the president,‚Äù Coats said about the alleged request. He was speaking at a hearing before the Senate Armed Services Committee."
result = predict_text(text_example)
print(result)


{'score': 0.9673226, 'prediction': 'REAL NEWS'}


In [13]:
# Chargement du modèle et du tokenizer
tokenizer = AutoTokenizer.from_pretrained("tokenizer")
loaded_model = TFAutoModelForSequenceClassification.from_pretrained("model")
# Chargement du tokenizer
def predict_text(text, threshold=0.5):
    """
    Prédiction sur un texte donné avec le modèle NLP chargé.
    :param text: Texte à analyser
    :param threshold: Seuil de classification (0.5 par défaut)
    :return: Score de probabilité et classe (Fake News ou Non)
    """
    # Tokenisation
    tokenized_input = tokenizer(text=text,
                                max_length=max_len,
                                padding="max_length",
                                truncation=True,
                                return_tensors='tf')

    # Prédiction
    prediction = loaded_model(tokenized_input)

    # Application de la fonction sigmoid (si nécessaire)
    scores = tf.nn.sigmoid(prediction.logits).numpy()

    # Conversion en classe binaire
    predicted_class = (scores > threshold).astype(int)

    return {"score": scores[0][0], "prediction": "FAKE NEWS" if predicted_class[0][0] == 0 else "REAL NEWS"}

# Exemple d'utilisation
text_example = "WASHINGTON (Reuters) - U.S. Director of National Intelligence Dan Coats declined to say whether President Donald Trump asked him to help deny any collusion between his campaign team and Russia, as reported by the Washington Post, saying his talks with Trump were private. ‚ÄúOn this topic, as well as other topics, I don‚Äôt feel it‚Äôs appropriate to characterize discussions and conversations with the president,‚Äù Coats said about the alleged request. He was speaking at a hearing before the Senate Armed Services Committee."
result = predict_text(text_example)
print(result)


Some layers from the model checkpoint at model were not used when initializing TFBertForSequenceClassification: ['dropout_7']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


{'score': 0.9673226, 'prediction': 'REAL NEWS'}
