In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re, string, unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from transformers import BertModel, BertTokenizer
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification
from torch.optim import AdamW
from datasets import Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
df = pd.read_csv('WELFake_Dataset.csv')

In [12]:
df = df.head(500)
df.describe

<bound method NDFrame.describe of      Unnamed: 0                                              title  \
0             0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1             1                                                NaN   
2             2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3             3  Bobby Jindal, raised Hindu, uses story of Chri...   
4             4  SATAN 2: Russia unvelis an image of its terrif...   
..          ...                                                ...   
495         495  Trump ordered to give deposition in Washington...   
496         496  Obama’s Race War Makes Its Way To His Hometown...   
497         497  FACTBOX: About 6.1 million without power in U....   
498         498  OOPS! ABSOLUTELY NO ONE SHOWED UP For NYC Debu...   
499         499  Russia warns Iraq, Kurds not to destabilize Mi...   

                                                  text  label  
0    No comment is expected from Barack Obama Membe...      1

In [13]:
print("Before Preprocessing:")
print(df.head(), "\n")

Before Preprocessing:
   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  No comment is expected from Barack Obama Membe...      1  
1     Did they post their votes for Hillary already?      1  
2   Now, most of the demonstrators gathered last ...      1  
3  A dozen politically active pastors came here f...      0  
4  The RS-28 Sarmat missile, dubbed Satan 2, will...      1   



In [14]:
df.dropna()

df['text'] = df['text'].astype(str)
df['text'] = df['text'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())

df['text'] = df['text'].apply(lambda x: re.sub(r"http\S+", "", x))

df['text'] = df['text'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s]', '', x))
df['text'] = df['text'].apply(lambda x: x.lower())

df['text'] = df['text'].apply(lambda x: re.sub(r'\d+', '', x))

df['text'] = df['text'].apply(lambda x: " ".join(x.split()))

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: " ".join([word for word in word_tokenize(x) if word.lower() not in stop_words]))

stemmer = PorterStemmer()
df['text'] = df['text'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))

df['text'] = df['text'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', '', x))

  df['text'] = df['text'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
  df['text'] = df['text'].apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
df["text"] = df["text"].astype(str)  # Convert all values to string (force conversion)
df = df[df["text"] != "nan"]  # Remove "nan" stored as a string
df = df.dropna(subset=["text"]).reset_index(drop=True)  # Drop real NaNs

In [17]:
print("After Preprocessing:")
print(df.head())

After Preprocessing:
   Unnamed: 0                                              title  \
0           0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1           1                                                NaN   
2           2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3           3  Bobby Jindal, raised Hindu, uses story of Chri...   
4           4  SATAN 2: Russia unvelis an image of its terrif...   

                                                text  label  
0  comment expect barack obama member fyf fukyofl...      1  
1                          post vote hillari alreadi      1  
2  demonstr gather last night exercis constitut p...      1  
3  dozen polit activ pastor came privat dinner fr...      0  
4  rs sarmat missil dub satan replac ss fli mile ...      1  


In [77]:
# print(type(df.loc[325, "text"])) 

In [79]:
df.to_csv("preprocessed_data_news.csv", index=False)

In [80]:
df = pd.read_csv('preprocessed_data_news.csv')

In [83]:
print(df[df["text"].isnull()]) 

Empty DataFrame
Columns: [Unnamed: 0, title, text, label]
Index: []


In [84]:
dataset = Dataset.from_pandas(df[['text', 'label']])

# Split into train (80%) and test (20%)
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Further split the training set into train (90%) and validation (10%)
train_val_split = train_test_split["train"].train_test_split(test_size=0.1, seed=42)

datasets = DatasetDict({
    "train": train_val_split["train"],
    "validation": train_val_split["test"],
    "test": train_test_split["test"],
})

In [85]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 355
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 40
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 99
    })
})

In [86]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_datasets = datasets.map(tokenize_function, batched=True)

tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map: 100%|████████████████████████████████████████████████████████████████████| 355/355 [00:06<00:00, 53.78 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 67.02 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████| 99/99 [00:02<00:00, 44.71 examples/s]


In [93]:
from transformers import Trainer, TrainingArguments
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_dir="./logs",  
    logging_steps=10,  
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    learning_rate=3e-5 
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)

trainer.train()
trainer.evaluate()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.667117
2,0.682500,0.575006
3,0.682500,0.667272
4,0.530500,0.492273
5,0.426300,0.497556


{'eval_loss': 0.4975564479827881,
 'eval_runtime': 5.6824,
 'eval_samples_per_second': 7.039,
 'eval_steps_per_second': 0.176,
 'epoch': 5.0}

In [10]:
# pip install accelerate>=0.26.0

In [94]:
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")

('./final_model\\tokenizer_config.json',
 './final_model\\special_tokens_map.json',
 './final_model\\vocab.txt',
 './final_model\\added_tokens.json')

In [4]:
#dimulai dari sini

from transformers import BertForSequenceClassification, BertTokenizer

model = BertForSequenceClassification.from_pretrained("./final_model")
tokenizer = BertTokenizer.from_pretrained("./final_model")

In [8]:
input_text = "Breaking: Scientists discover a cure for aging"

inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)

with torch.no_grad():  
    outputs = model(**inputs)

logits = outputs.logits
predicted_class = torch.argmax(logits, dim=-1).item()

print(f"Predicted class: {predicted_class}")

Predicted class: 0
