##a. LIBRARY INSTALL AND IMPORT

In [None]:
pip cache purge

Files removed: 200


In [None]:
!pip install pandas numpy scikit-learn torch transformers tqdm checklist

Collecting checklist
  Downloading checklist-0.0.11.tar.gz (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m114.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata

In [None]:
!pip install ipywidgets



In [None]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Import all necessary libraries
import re
import nltk
import os
import json
import pandas as pd
import torch
import numpy as np
from sklearn.utils import resample
from transformers import BertTokenizer, BertModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# check some numpy version
print(f"NumPy version: {np.__version__}")

NumPy version: 2.0.2


#b.DATA

###b1.DATA LOADING

In [None]:
import json
import pandas as pd

# Function to read a JSON file
def load_json_file(file_path):
    """Parse the UTF-8 encoded JSON document at ``file_path`` and return the result."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_document = handle.read()
    return json.loads(raw_document)

# Function to process FakeNewsNet data
def process_FakeNewsNet_data(file_paths):
    """Build one DataFrame from several FakeNewsNet-style JSON files.

    ``file_paths`` maps a category key (e.g. ``"gossipcop_HF"``) to the path of
    a JSON file whose top level is ``{news_id: {"text": ..., "title": ...}}``.
    The category key alone decides each row's ``source`` and ``label``; entries
    without a ``text`` field are dropped, and a missing title becomes ``""``.

    Returns a DataFrame with columns id, text, title, source, label.
    """
    records = []

    for category, json_path in file_paths.items():
        articles = load_json_file(json_path)

        # Source and label depend only on the category key, so resolve them
        # once per file instead of once per article.
        source = next(
            (name for name in ("gossipcop", "politifact") if name in category),
            "unknown",
        )
        if "R" in category:
            # "HR" = human-written real, any other "R" key = AI-generated real
            label = "human_real" if "HR" in category else "gpt_real"
        elif "F" in category:
            # "MF" = AI-generated fake, any other "F" key = human-written fake
            label = "gpt_fake" if "MF" in category else "human_fake"
        else:
            label = "unknown"

        records.extend(
            {
                "id": news_id,
                "text": article["text"],
                "title": article.get("title", ""),
                "source": source,
                "label": label,
            }
            for news_id, article in articles.items()
            if 'text' in article
        )

    return pd.DataFrame(records)

# Every dataset file lives in the same Drive folder; keeping the folder in one
# constant means relocating the data is a one-line change.
DATA_DIR = "/content/drive/MyDrive/DS266"

# Category key -> JSON file. The key carries the source ("gossipcop") and the
# class code that process_FakeNewsNet_data turns into a label:
# HF/HR = human-written fake/real, MF/MR = AI-generated fake/real.
fake_news_paths = {
    f"gossipcop_{code}": f"{DATA_DIR}/{code}.json"
    for code in ("HF", "HR", "MF", "MR")
}

# Load and process the data
FakeNewsNet_df = process_FakeNewsNet_data(fake_news_paths)

# Display the first 5 rows
FakeNewsNet_df.head()


Unnamed: 0,id,text,title,source,label
0,0,✕ Close Meghan Markle and Prince Harry have an...,As it happened: Prince Harry and Meghan Markle...,gossipcop,human_fake
1,1,Kim Kardashian and Kanye West are pulling out ...,Kim & Kanye Install At-Home Panic Room After P...,gossipcop,human_fake
2,2,Prince Harry and Meghan currently live at Kens...,£1.4million spent renovating Prince Harry and ...,gossipcop,human_fake
3,3,They can't get enough of the Biebs on this sho...,Photos from Dancing With the Stars: Special Gu...,gossipcop,human_fake
4,4,Ben Affleck is keeping life with his three kid...,Jennifer Garner ‘Doesn’t Want’ Her Kids Around...,gossipcop,human_fake


# c. Model Optimization
Directions:
1. GPT-generated fake and real articles have shorter text, so balance the lengths
  1.  reduce the current length
  2.  data augmentation
2. Humanize the GPT text by adding typos to gpt_real and gpt_fake
3. Two-stage model



In [None]:

!pip install -q transformers
!pip install -q torchinfo
!pip install -q datasets
!pip install -q evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.2 kB[0m [31m18.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:

import numpy as np

import transformers
import evaluate

from datasets import load_dataset
from torchinfo import summary

from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')
nltk.download('punkt')

from datasets import Dataset
import pandas as pd

from sklearn.metrics import confusion_matrix, classification_report
import numpy as np


from transformers import Trainer, TrainingArguments
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


#### C1: gpt fake & gpt real have shorter text, balance the length

In [None]:
# Combine body text and title into a single training string.
# The separator needs a trailing space: clean_text() deletes punctuation
# outright, so "...end.Title..." would collapse into the bogus token "endtitle".
# fillna('') keeps a missing text/title from turning the whole row into NaN.
FakeNewsNet_df['combined_text'] = (
    FakeNewsNet_df['text'].fillna('') + '. ' + FakeNewsNet_df['title'].fillna('')
)

FakeNewsNet_df["word_count"] = FakeNewsNet_df["combined_text"].str.split().str.len()

# Keep only articles whose length is strictly between the two bounds.
MIN_WORDS = 80
MAX_WORDS = 500
length_ok = (FakeNewsNet_df['word_count'] > MIN_WORDS) & (FakeNewsNet_df['word_count'] < MAX_WORDS)

MR = FakeNewsNet_df[(FakeNewsNet_df['label'] == "gpt_real") & length_ok]
MF = FakeNewsNet_df[(FakeNewsNet_df['label'] == "gpt_fake") & length_ok]
HR = FakeNewsNet_df[(FakeNewsNet_df['label'] == "human_real") & length_ok]
HF = FakeNewsNet_df[(FakeNewsNet_df['label'] == "human_fake") & length_ok]

new_FNN_df = pd.concat([MR, MF, HR, HF])

# The smallest class count printed here is the per-class size used for balancing.
print(new_FNN_df['label'].value_counts())



label
human_real    4853
gpt_fake      4080
gpt_real      3527
human_fake    2756
Name: count, dtype: int64


In [None]:
# Downsample every class to the size of the smallest one.
# Sampling must be WITHOUT replacement: drawing with replacement duplicates
# rows, and a duplicated article can then land on both sides of the
# train/val/test split, leaking training examples into evaluation.
n_per_class = min(len(MR), len(MF), len(HR), len(HF))

MR = resample(MR, replace=False, n_samples=n_per_class, random_state=42)
MF = resample(MF, replace=False, n_samples=n_per_class, random_state=42)
HR = resample(HR, replace=False, n_samples=n_per_class, random_state=42)
HF = resample(HF, replace=False, n_samples=n_per_class, random_state=42)

Bal_FNN_df = pd.concat([MR, MF, HR, HF])

print(Bal_FNN_df['label'].value_counts())



label
gpt_real      2756
gpt_fake      2756
human_real    2756
human_fake    2756
Name: count, dtype: int64


In [None]:
# Encode the four classes as integer ids. The ids must stay in the same order
# as the target_names list that compute_metrics passes to classification_report.
# All values are ints so the column is int64 rather than float64.
Bal_FNN_df['label_new'] = Bal_FNN_df['label'].map(
    {'human_fake': 0, 'human_real': 1, 'gpt_fake': 2, 'gpt_real': 3}
)

# Word-count distribution of the combined text, per class
Bal_FNN_df["word_count"] = Bal_FNN_df["combined_text"].apply(lambda x: len(x.split()))
Bal_FNN_df.groupby("label")["word_count"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt_fake,2756.0,235.175617,52.481711,85.0,201.0,233.0,267.0,496.0
gpt_real,2756.0,327.410377,83.272611,84.0,270.0,327.0,386.0,499.0
human_fake,2756.0,295.28955,119.02955,81.0,192.0,316.0,393.0,499.0
human_real,2756.0,301.905298,107.708082,81.0,220.0,308.0,387.0,499.0


In [None]:
import string

def simple_tokenizer(text):
    """Lowercase ``text`` and return its word-character runs as a list of tokens."""
    lowered = text.lower()
    word_pattern = r'\b\w+\b'
    return re.findall(word_pattern, lowered)

text = "Hello! This is a test."
print(simple_tokenizer(text))

# Stop-word set, filled on first use. The previous version rebuilt it from
# stopwords.words('english') on every clean_text call, which is pure overhead
# when the function is applied to thousands of rows.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, building it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


# Remove unwanted characters, links, and HTML tags
def clean_text(text):
    """Normalize raw article text into a space-joined string of content words.

    Steps, in order: strip HTML tags, strip URLs, delete every character that
    is not an ASCII letter or whitespace, collapse whitespace, lowercase,
    tokenize with ``simple_tokenizer`` and drop English stop words.

    Note that punctuation is deleted rather than replaced by a space, so
    "isn't" becomes "isnt" and hyphenated words are fused.
    """
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove special characters and numbers, keeping only letters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Convert to lowercase
    text = text.lower()

    stop_words = _english_stop_words()
    kept_words = [word for word in simple_tokenizer(text) if word not in stop_words]
    return ' '.join(kept_words)



['hello', 'this', 'is', 'a', 'test']


In [None]:
# Apply cleaning to datasets
Bal_FNN_df['cleaned_combined_text'] = Bal_FNN_df['combined_text'].apply(clean_text)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data while keeping the DataFrame structure ( for hugging face dataset use)

# 60% train, then the remaining 40% is halved into validation and test.
# Both splits are stratified on the class id so each part keeps the class mix.
train_df, tmp_df = train_test_split(
    Bal_FNN_df[['cleaned_combined_text', 'label_new']],
    test_size=0.4,
    random_state=42,
    stratify=Bal_FNN_df['label_new']
)

val_df, test_df = train_test_split(
    tmp_df,
    test_size=0.5,
    random_state=42,
    stratify=tmp_df['label_new']
)
# Reset the index of all three DataFrames
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
# Report the validation frame's own size (this line used to print len(test_df)).
print(f"Val samples: {len(val_df)}")
print("\nTrain DataFrame:")
print(train_df.head())
print("\nTest DataFrame:")
print(test_df.head())
print("\nVal DataFrame:")
print(val_df.head())

Training samples: 6614
Test samples: 2205
Val samples: 2205

Train DataFrame:
                               cleaned_combined_text  label_new
0  chrissy teigen john legend fair share ups down...        3.0
1  kourtney kardashians boyfriend younes bendjima...        1.0
2  former nfl star terrell owens announced good m...        1.0
3  katharine mcphee david foster engaged hollywoo...        1.0
4  halloween might still weeks away counting anyt...        1.0

Test DataFrame:
                               cleaned_combined_text  label_new
0  urassic world star chris pratt spotted coffee ...        2.0
1  brad pitt angelina jolie ever going get differ...        0.0
2  comes certain television shows sometimes choos...        1.0
3  mila kunis renowned hollywood actress finally ...        2.0
4  woman hanging drake split jennifer lopez claim...        0.0

Val DataFrame:
                               cleaned_combined_text  label_new
0  kardashians celebrated americas independence d...     

In [None]:
from checklist.perturb import Perturb

def safe_add_typos(text, min_length=3, max_typos=2):
    """Return ``text`` with random typos injected by checklist's ``Perturb.add_typos``.

    Args:
        text: String to perturb. Missing values (``None``/NaN) pass through.
        min_length: Strings shorter than this are returned unchanged.
        max_typos: Upper bound on the number of typos; the actual count is
            additionally capped at one typo per two characters.

    Returns:
        The perturbed string, or ``text`` itself when it is missing, too short,
        the typo budget is zero, or the perturbation raises ``ValueError``.
    """
    if pd.isna(text) or len(text) < min_length:
        return text

    # Max 1 typo per 2 characters, never more than max_typos.
    n_typos = min(max_typos, len(text) // 2)
    if n_typos <= 0:
        return text

    try:
        # Pass the budget through: it used to be computed and then dropped, so
        # exactly one typo was applied whatever max_typos said.
        return Perturb.add_typos(text, typos=n_typos)
    except ValueError:
        return text  # Fallback for edge cases

def conditional_add_typos(row):
    """Row-wise adapter: perturb the row's ``cleaned_combined_text`` with ``safe_add_typos``.

    NOTE(review): despite the name, no condition on the row's label is applied;
    every row is perturbed. Confirm that is intended.
    """
    source_text = row['cleaned_combined_text']
    return safe_add_typos(source_text)




train_df['text_add_typos'] = train_df.apply(conditional_add_typos, axis=1)

# test_df['text_add_typos'] = test_df.apply(conditional_add_typos, axis=1)

# val_df['text_add_typos'] = val_df.apply(conditional_add_typos, axis=1)


# train_df['text_add_typos'] = train_df['cleaned_combined_text'].apply(safe_add_typos)

# # Verify
# empty_count = train_df['cleaned_combined_text'].apply(lambda x: len(x) < 3).sum()
# print(f"Fixed {empty_count} short/empty texts")

In [None]:
train_df.drop(columns=['cleaned_combined_text'], inplace=True)
# test_df.drop(columns=['cleaned_combined_text'], inplace=True)
# val_df.drop(columns=['cleaned_combined_text'], inplace=True)

In [None]:
# Build one Hugging Face Dataset per split. Only the training frame carries the
# typo-perturbed column; test and validation still hold the cleaned text.
split_sources = {
    'train': (train_df, 'text_add_typos'),
    'test': (test_df, 'cleaned_combined_text'),
    'val': (val_df, 'cleaned_combined_text'),
}

hf_datasets = {}
for split_name, (split_frame, text_column) in split_sources.items():
    # Every split ends up with the same two columns: 'text' and 'label'.
    formatted_df = split_frame.rename(columns={text_column: 'text', 'label_new': 'label'})
    hf_datasets[split_name] = Dataset.from_pandas(formatted_df)

fake_news_dataset_train = hf_datasets['train']
fake_news_dataset_test = hf_datasets['test']
fake_news_dataset_val = hf_datasets['val']

print("Test: \n",fake_news_dataset_test)
print("Train: \n",fake_news_dataset_train)
print("Val: \n",fake_news_dataset_val)

Test: 
 Dataset({
    features: ['text', 'label'],
    num_rows: 2205
})
Train: 
 Dataset({
    features: ['label', 'text'],
    num_rows: 6614
})
Val: 
 Dataset({
    features: ['text', 'label'],
    num_rows: 2205
})


In [None]:
MAX_SEQUENCE_LENGTH = 496

In [None]:
# Encode data

def preprocess_data(data, tokenizer):
    """Tokenize the ``text`` field of ``data`` with ``tokenizer.batch_encode_plus``.

    Every sequence is padded or truncated to ``MAX_SEQUENCE_LENGTH``; the
    encoding is requested as PyTorch tensors with attention masks and
    token type ids included. The encoding object is returned as-is.
    """
    encoding_options = dict(
        max_length=MAX_SEQUENCE_LENGTH,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors="pt",
    )
    return tokenizer.batch_encode_plus(data['text'], **encoding_options)

In [None]:
#Creating  compute metrics function

metric = evaluate.load('accuracy')

# FakeNewsNet_df['label_new'] = FakeNewsNet_df['label'].map({'human_fake': 0, 'human_real': 1.0,'gpt_fake': 2,'gpt_real': 3})
def compute_metrics(eval_pred):
  """Compute weighted precision/recall/F1 and accuracy for a Trainer evaluation.

  Args:
      eval_pred: ``(logits, labels)`` pair; the predicted class is the argmax
          of the logits along axis 1.

  Returns:
      Dict with keys 'accuracy', 'f1', 'precision' and 'recall'.

  Side effect: prints a per-class classification report.
  """
  # Class ids 0..3 in the same order as the label_new mapping.
  class_names = ['human_fake', 'human_real', 'gpt_fake', 'gpt_real']
  class_ids = list(range(len(class_names)))

  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)
  precision, recall, f1, _ = precision_recall_fscore_support(
      labels, predictions, average='weighted', zero_division=0
  )
  acc = accuracy_score(labels, predictions)

  # labels= pins the report to all four classes. Without it the report infers
  # the classes from the data and raises when one is absent (e.g. the
  # three-class augmented dataset, which has no gpt_real rows), because the
  # number of target_names no longer matches.
  class_report = classification_report(
        labels, predictions,
        labels=class_ids,
        target_names=class_names,
        digits=4,
        zero_division=0
    )
  print("\nClassification Report:")
  print(class_report)

  return {
      'accuracy': acc,
      'f1': f1,
      'precision': precision,
      'recall': recall
  }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
## Fine tuning the "bert-base-cased" model

def fine_tune_classification_model(classification_model,
                                   tokenizer,
                                   train_data,
                                   val_data,
                                   batch_size = 16,
                                   num_epochs = 3):
    """Fine-tune ``classification_model`` with the Hugging Face ``Trainer``.

    Args:
        classification_model: Sequence-classification model to train in place.
        tokenizer: Tokenizer handed to ``preprocess_data``.
        train_data: Dataset with 'text' and 'label' columns used for training.
        val_data: Dataset with the same columns, evaluated after every epoch.
        batch_size: Per-device batch size for both training and evaluation.
        num_epochs: Number of training epochs.

    Returns:
        The ``Trainer`` after training, so callers can reuse it (for example
        to evaluate or predict on the held-out test split). Earlier versions
        returned ``None``; callers that ignore the result are unaffected.
    """
    def _tokenize_and_label(dataset):
        # Tokenize in batches, then add an integer 'labels' column.
        tokenized = dataset.map(preprocess_data, batched=True, fn_kwargs={'tokenizer': tokenizer})
        return tokenized.map(
            lambda x: {'labels': torch.tensor(x['label'], dtype=torch.long)}  # Ensure labels are int64
        )

    preprocessed_train_data = _tokenize_and_label(train_data)
    preprocessed_val_data = _tokenize_and_label(val_data)

    # Checkpoints and logs go to Drive; evaluation and saving happen once per epoch.
    training_args = TrainingArguments(
        output_dir="/content/drive/MyDrive/DS266/model_output",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        logging_dir="/content/drive/MyDrive/DS266/logs",
        eval_strategy="epoch",
        save_strategy="epoch",
        report_to='none'
    )
    trainer = Trainer(
        model=classification_model,
        args=training_args,
        train_dataset=preprocessed_train_data,
        eval_dataset=preprocessed_val_data,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer



### bert-base model

In [None]:
model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
# bert_classification_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint_name)

bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=4  # number of classes = 4
)

# fine_tune_classification_model(bert_classification_model, bert_tokenizer, fake_news_dataset_train, fake_news_dataset_val)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
fine_tune_classification_model(bert_classification_model, bert_tokenizer, fake_news_dataset_train, fake_news_dataset_val)

Map:   0%|          | 0/6614 [00:00<?, ? examples/s]

Map:   0%|          | 0/2205 [00:00<?, ? examples/s]

Map:   0%|          | 0/6614 [00:00<?, ? examples/s]

Map:   0%|          | 0/2205 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.465321,0.825397,0.828333,0.835719,0.825397
2,0.692400,0.442193,0.849887,0.848316,0.847936,0.849887
3,0.319100,0.449787,0.864853,0.864325,0.865339,0.864853



Classification Report:
              precision    recall  f1-score   support

  human_fake     0.7173    0.7967    0.7549       551
  human_real     0.7113    0.7423    0.7265       551
    gpt_fake     0.9313    0.9330    0.9321       552
    gpt_real     0.9828    0.8294    0.8996       551

    accuracy                         0.8254      2205
   macro avg     0.8357    0.8253    0.8283      2205
weighted avg     0.8357    0.8254    0.8283      2205


Classification Report:
              precision    recall  f1-score   support

  human_fake     0.7948    0.7804    0.7875       551
  human_real     0.7820    0.7423    0.7616       551
    gpt_fake     0.9018    0.9819    0.9402       552
    gpt_real     0.9130    0.8947    0.9038       551

    accuracy                         0.8499      2205
   macro avg     0.8479    0.8498    0.8483      2205
weighted avg     0.8479    0.8499    0.8483      2205


Classification Report:
              precision    recall  f1-score   support

  h

### roberta-base model

In [None]:
model_checkpoint_name = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
roberta_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=4  # number of classes = 4
)



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
fine_tune_classification_model(roberta_classification_model, roberta_tokenizer, fake_news_dataset_train, fake_news_dataset_val)

Map:   0%|          | 0/6614 [00:00<?, ? examples/s]

Map:   0%|          | 0/2205 [00:00<?, ? examples/s]

Map:   0%|          | 0/6614 [00:00<?, ? examples/s]

Map:   0%|          | 0/2205 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.583557,0.806803,0.805487,0.818023,0.806803
2,0.620900,0.425656,0.860317,0.859476,0.861842,0.860317
3,0.311200,0.489063,0.866213,0.86522,0.866768,0.866213



Classification Report:
              precision    recall  f1-score   support

  human_fake     0.7291    0.8058    0.7655       551
  human_real     0.7342    0.7169    0.7254       551
    gpt_fake     0.8359    0.9873    0.9053       552
    gpt_real     0.9729    0.7169    0.8255       551

    accuracy                         0.8068      2205
   macro avg     0.8180    0.8067    0.8054      2205
weighted avg     0.8180    0.8068    0.8055      2205


Classification Report:
              precision    recall  f1-score   support

  human_fake     0.8264    0.7604    0.7921       551
  human_real     0.7743    0.8094    0.7915       551
    gpt_fake     0.8882    0.9928    0.9376       552
    gpt_real     0.9584    0.8784    0.9167       551

    accuracy                         0.8603      2205
   macro avg     0.8618    0.8603    0.8594      2205
weighted avg     0.8618    0.8603    0.8595      2205


Classification Report:
              precision    recall  f1-score   support

  h

### c2.Data Augment

In [None]:
# Remove extremely long articles.
# Combine body text and title into a single training string. The separator
# needs a trailing space: clean_text() deletes punctuation outright, so
# "...end.Title..." would collapse into the bogus token "endtitle".
# fillna('') keeps a missing text/title from turning the whole row into NaN.
FakeNewsNet_df['combined_text'] = (
    FakeNewsNet_df['text'].fillna('') + '. ' + FakeNewsNet_df['title'].fillna('')
)

FakeNewsNet_df["word_count"] = FakeNewsNet_df["combined_text"].str.split().str.len()

# Keep only articles strictly shorter than this many words.
MAX_WORDS_AUGMENT = 1000
short_enough = FakeNewsNet_df['word_count'] < MAX_WORDS_AUGMENT

MR = FakeNewsNet_df[(FakeNewsNet_df['label'] == "gpt_real") & short_enough]
MF = FakeNewsNet_df[(FakeNewsNet_df['label'] == "gpt_fake") & short_enough]
HR = FakeNewsNet_df[(FakeNewsNet_df['label'] == "human_real") & short_enough]
HF = FakeNewsNet_df[(FakeNewsNet_df['label'] == "human_fake") & short_enough]

new_FNN_df = pd.concat([MR, MF, HR, HF])

# The smallest class count printed here is the per-class size used for balancing.
print(new_FNN_df['label'].value_counts())

label
human_real    7310
gpt_fake      4084
gpt_real      4063
human_fake    3777
Name: count, dtype: int64


In [None]:
new_FNN_df["word_count"] = new_FNN_df["combined_text"].apply(lambda x: len(x.split()))
new_FNN_df.groupby("label")["word_count"].describe()

# For here, we would like to match gpt fake with human fake
# and match gpt real with human real.

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt_fake,4084.0,234.705191,52.024997,14.0,202.0,232.0,265.0,527.0
gpt_real,4063.0,361.5016,140.310961,6.0,277.0,340.0,415.5,998.0
human_fake,3777.0,338.941223,204.162242,7.0,170.0,334.0,449.0,993.0
human_real,7310.0,358.080575,220.508056,8.0,200.0,330.0,478.0,998.0


In [None]:
# Word-count statistics of the two human-written classes, used as the sampling
# targets when GPT text is lengthened (see augment_gpt_fake / augment_gpt_real).
# mean/std/max are the describe() values shown above, truncated to integers.
# NOTE(review): the min values here (5 and 4) differ from that table (7 and 8);
# confirm which is intended.
target_stats = {
    'human_fake': {'mean': 338, 'std': 204, 'min': 5, 'max': 993},
    'human_real': {'mean': 358, 'std': 220, 'min': 4, 'max': 998}
}

In [None]:
target_length_gpt_fake = int(np.clip(
    np.random.normal(loc=target_stats['human_fake']['mean'],
                    scale=target_stats['human_fake']['std']),
    target_stats['human_fake']['min'],
    target_stats['human_fake']['max']
))

print(target_length_gpt_fake)


target_length_gpt_real = int(np.clip(
    np.random.normal(loc=target_stats['human_real']['mean'],
                    scale=target_stats['human_real']['std']),
    target_stats['human_real']['min'],
    target_stats['human_real']['max']
))

print(target_length_gpt_real)

248
539


In [None]:
from transformers import pipeline
import numpy as np

# Initialize text generation pipeline
generator = pipeline('text-generation', model='gpt2-medium',batch_size=8,pad_token_id=50256)

def _extend_to_sampled_length(text, stats_key):
    """Lengthen ``text`` with the GPT-2 ``generator`` up to a sampled word count.

    The target word count is drawn from a normal distribution described by
    ``target_stats[stats_key]`` and clipped to that entry's min/max. Text that
    already has at least that many words is returned unchanged; otherwise the
    generated continuation is cut to at most the target number of words.
    """
    stats = target_stats[stats_key]
    current_length = len(text.split())
    target_length = int(np.clip(
        np.random.normal(loc=stats['mean'], scale=stats['std']),
        stats['min'],
        stats['max'],
    ))
    if current_length >= target_length:
        return text

    # Generate a continuation to reach the target length.
    # NOTE(review): the shortfall is counted in whitespace-separated words but
    # max_new_tokens is counted in model tokens, so the continuation may come
    # out shorter than requested. Verify against the resulting distribution.
    additional_tokens = target_length - current_length
    augmented = generator(
        text + " ",
        max_new_tokens=additional_tokens+10,
        num_return_sequences=1,
        truncation=True
    )[0]['generated_text']

    return ' '.join(augmented.split()[:target_length])


def augment_gpt_fake(text):
    """Lengthen a gpt_fake article toward the human_fake word-count distribution."""
    return _extend_to_sampled_length(text, 'human_fake')


def augment_gpt_real(text):
    """Lengthen a gpt_real article toward the human_real word-count distribution."""
    return _extend_to_sampled_length(text, 'human_real')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Run the GPT-2 length augmentation over every gpt_fake article, one row at a
# time, recording the word count before and after.
augmented_samples = []

# NOTE(review): this augments row['text'] (body only), whereas the human
# classes are later taken from 'combined_text' (body + title) — confirm the
# title is meant to be left out for gpt_fake.
for idx, row in new_FNN_df[new_FNN_df['label'] == 'gpt_fake'].iterrows():
    augmented = augment_gpt_fake(row['text'])
    augmented_samples.append({
        'text': augmented,
        'label': row['label'],
        'original_length': len(row['text'].split()),
        'new_length': len(augmented.split())
    })

# Convert to DataFrame (only the augmented gpt_fake rows; nothing is merged
# with the original frame in this cell)
balanced_df = pd.DataFrame(augmented_samples)


# Verify new length distribution
print(balanced_df.groupby('label')['text'].apply(
    lambda x: x.str.split().str.len().describe()
))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


label          
gpt_fake  count    4084.000000
          mean      253.833497
          std        68.943873
          min        85.000000
          25%       210.000000
          50%       244.000000
          75%       285.000000
          max       704.000000
Name: text, dtype: float64


In [None]:
# NOTE(review): this overwrites the balanced_df built by the augmentation loop
# with a CSV from Drive. No visible cell writes balgptfake.csv — presumably it
# was saved from an earlier run; confirm, or the notebook will not reproduce
# from a fresh kernel.
balanced_df = pd.read_csv('/content/drive/MyDrive/DS266/balgptfake.csv')

In [None]:
#concat gpt fake with cleaned human real and human fake

HR =  new_FNN_df[(new_FNN_df['label'] == "human_real")][["combined_text","label"]]
HF =  new_FNN_df[(new_FNN_df['label'] == "human_fake") ][["combined_text","label"]]
MF =  balanced_df[["text","label"]].rename(columns={'text': 'combined_text'})
newBal_FNN_df = pd.concat([MF,HR,HF])

print(newBal_FNN_df['label'].value_counts())

label
human_real    7310
gpt_fake      4084
human_fake    3777
Name: count, dtype: int64


In [None]:
# Downsample every class to the size of the smallest one.
# Sampling must be WITHOUT replacement: drawing with replacement duplicates
# rows, and a duplicated article can then land on both sides of the
# train/val/test split, leaking training examples into evaluation.
n_per_class = min(len(MF), len(HR), len(HF))

MF = resample(MF, replace=False, n_samples=n_per_class, random_state=42)
HR = resample(HR, replace=False, n_samples=n_per_class, random_state=42)
HF = resample(HF, replace=False, n_samples=n_per_class, random_state=42)

newBal_FNN_df = pd.concat([MF, HR, HF])

print(newBal_FNN_df['label'].value_counts())

label
gpt_fake      3777
human_real    3777
human_fake    3777
Name: count, dtype: int64


In [None]:
# calcalate word count distributon of news text
newBal_FNN_df["word_count"] = newBal_FNN_df["combined_text"].apply(lambda x: len(x.split()))
newBal_FNN_df.groupby("label")["word_count"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
gpt_fake,3777.0,253.222928,68.361937,92.0,209.0,243.0,284.0,621.0
human_fake,3777.0,338.941223,204.162242,7.0,170.0,334.0,449.0,993.0
human_real,3777.0,357.951019,217.844484,14.0,205.0,331.0,477.0,995.0


In [None]:
# Clean the combined text with the same pipeline used for the first experiment.
newBal_FNN_df['cleaned_combined_text'] = newBal_FNN_df['combined_text'].apply(clean_text)

# Same class-id mapping as before so the ids line up with compute_metrics.
# All values are ints so the column is int64 rather than float64.
newBal_FNN_df['label_new'] = newBal_FNN_df['label'].map(
    {'human_fake': 0, 'human_real': 1, 'gpt_fake': 2, 'gpt_real': 3}
)



In [None]:
from sklearn.model_selection import train_test_split

# Stratified 60/20/20 train/validation/test split. DataFrames (not arrays) are
# kept so each split can be converted to a Hugging Face Dataset afterwards.

# Step 1: hold out 40% of the rows, preserving the class proportions.
train_df, tmp_df = train_test_split(
    newBal_FNN_df[['cleaned_combined_text', 'label_new']],
    test_size=0.4,
    random_state=42,
    stratify=newBal_FNN_df['label_new']
)

# Step 2: cut the hold-out in half -> validation and test.
val_df, test_df = train_test_split(
    tmp_df,
    test_size=0.5,
    random_state=42,
    stratify=tmp_df['label_new']
)
# Reset the index of all three DataFrames
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print(f"Training samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
# Report the validation frame's own size (this line used to print len(test_df)).
print(f"Val samples: {len(val_df)}")
print("\nTrain DataFrame:")
print(train_df.head())
print("\nTest DataFrame:")
print(test_df.head())
print("\nVal DataFrame:")
print(val_df.head())

Training samples: 6798
Test samples: 2267
Val samples: 2267

Train DataFrame:
                               cleaned_combined_text  label_new
0  movie report says latin lover actor currently ...        0.0
1  going awards show getting gussied isnt enough ...        1.0
2  like many young stars zendaya got start disney...        1.0
3  new book revealed details secret meeting jfk j...        2.0
4  hollywood couple jennifer garner ben affleck r...        2.0

Test DataFrame:
                               cleaned_combined_text  label_new
0  shocking news dotdash meredith americas larges...        2.0
1  west family one bigger kim kardashian kanye we...        1.0
2  leonardo dicaprio isnt ready say goodbye bache...        0.0
3  caitlyn jenner reportedly decided strip nude p...        2.0
4  ever wondered celebrity parents handle ups dow...        2.0

Val DataFrame:
                               cleaned_combined_text  label_new
0  shameless star ethan cutkosky plays troublepro...     

In [None]:
# Build the training text with conditional_add_typos (defined elsewhere in the
# notebook; it receives one row at a time). Only the training frame gets this
# column -- validation and test keep `cleaned_combined_text`.
# NOTE(review): presumably the helper injects typos depending on the row's label -- confirm.
typo_augmented_text = train_df.apply(conditional_add_typos, axis=1)
train_df['text_add_typos'] = typo_augmented_text

# The un-augmented text is no longer needed in the training frame; remove it in place.
del train_df['cleaned_combined_text']
train_df

Unnamed: 0,label_new,text_add_typos
0,0.0,movie report says latin lover actor currently ...
1,1.0,going awards show getting gussied isnt enough ...
2,1.0,like many young stars zendaya got start disney...
3,2.0,new book revealed details secret meeting jfk j...
4,2.0,hollywood couple jennifer garner ben affleck r...
...,...,...
6793,0.0,james devaneygc images legal issues behind pit...
6794,0.0,actor charlie sheen defended tweet expressed h...
6795,2.0,recent interview ed sheeran finally weighed lo...
6796,0.0,caitlyn jenner dating someone fake story affai...


In [None]:
# Convert the three splits to Hugging Face Datasets with uniform column names
# ('text', 'label').

# Training: the text lives in the typo-augmented column.
formatted_df1 = train_df.rename(columns=dict(text_add_typos='text', label_new='label'))
# Testing: plain cleaned text.
formatted_df2 = test_df.rename(columns=dict(cleaned_combined_text='text', label_new='label'))
# Validating: plain cleaned text.
formatted_df3 = val_df.rename(columns=dict(cleaned_combined_text='text', label_new='label'))

fake_news_dataset_train = Dataset.from_pandas(formatted_df1)
fake_news_dataset_test = Dataset.from_pandas(formatted_df2)
fake_news_dataset_val = Dataset.from_pandas(formatted_df3)

# Quick look at the schema and row count of each split
print("Test: \n",fake_news_dataset_test)
print("Train: \n",fake_news_dataset_train)
print("Val: \n",fake_news_dataset_val)

Test: 
 Dataset({
    features: ['text', 'label'],
    num_rows: 2267
})
Train: 
 Dataset({
    features: ['label', 'text'],
    num_rows: 6798
})
Val: 
 Dataset({
    features: ['text', 'label'],
    num_rows: 2266
})


In [None]:
# Creating the compute-metrics function

# NOTE(review): `metric` is not used by compute_metrics below (which relies on
# scikit-learn); kept in case another cell references it -- confirm and remove.
metric = evaluate.load('accuracy')

# Class ids follow the `label_new` mapping used when building the dataset:
# human_fake -> 0, human_real -> 1, gpt_fake -> 2 (gpt_real -> 3 is not present here).
def compute_metrics(eval_pred):
  """Compute evaluation metrics for the three-class fake-news classifier.

  Args:
    eval_pred: a pair ``(predictions, labels)``. ``predictions`` is reduced
      with ``argmax`` over axis 1, so it is expected to hold one score per
      class for every example; ``labels`` holds the true class ids.

  Returns:
    dict with ``accuracy`` and the support-weighted ``f1``, ``precision``
    and ``recall``.

  Side effects:
    Prints a per-class classification report.
  """
  class_ids = [0, 1, 2]
  class_names = ['human_fake', 'human_real', 'gpt_fake']

  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)
  # zero_division=0 keeps the default numeric result (0) for classes that are
  # never predicted, without emitting a warning on every evaluation.
  precision, recall, f1, _ = precision_recall_fscore_support(
      labels, predictions, average='weighted', zero_division=0
  )
  acc = accuracy_score(labels, predictions)

  # Pin `labels` so the three target names always line up with the class ids,
  # even when a class is absent from both the labels and the predictions
  # (without it, classification_report raises on the name/class count mismatch).
  class_report = classification_report(
        labels, predictions,
        labels=class_ids,
        target_names=class_names,
        digits=4,
        zero_division=0
    )
  print("\nClassification Report:")
  print(class_report)

  return {
      'accuracy': acc,
      'f1': f1,
      'precision': precision,
      'recall': recall
  }


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


####bert

In [None]:
# Maximum sequence length for the models fine-tuned below.
# NOTE(review): presumably read by fine_tune_classification_model when
# tokenizing (its definition is outside this section) -- confirm it is used.
MAX_SEQUENCE_LENGTH = 500

In [None]:
# Fine-tune BERT as a three-class classifier on the train split, validating on the val split.
# NOTE(review): the sample rows printed earlier look fully lower-cased, while this
# is the *cased* checkpoint -- confirm that is intentional.
model_checkpoint_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
# num_labels is passed explicitly so the classification head has one output per class.

bert_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=3  # three classes: human_fake (0), human_real (1), gpt_fake (2)
)

# fine_tune_classification_model is defined earlier in the notebook.
fine_tune_classification_model(bert_classification_model, bert_tokenizer, fake_news_dataset_train, fake_news_dataset_val)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6798 [00:00<?, ? examples/s]

Map:   0%|          | 0/2266 [00:00<?, ? examples/s]

Map:   0%|          | 0/6798 [00:00<?, ? examples/s]

Map:   0%|          | 0/2266 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.398194,0.827449,0.827307,0.828837,0.827449
2,0.535000,0.396986,0.847308,0.844783,0.8444,0.847308
3,0.288400,0.423425,0.858341,0.856822,0.855987,0.858341



Classification Report:
              precision    recall  f1-score   support

  human_fake     0.7719    0.7116    0.7405       756
  human_real     0.7387    0.8013    0.7687       755
    gpt_fake     0.9760    0.9695    0.9728       755

    accuracy                         0.8274      2266
   macro avg     0.8289    0.8275    0.8273      2266
weighted avg     0.8288    0.8274    0.8273      2266


Classification Report:
              precision    recall  f1-score   support

  human_fake     0.8179    0.7487    0.7818       756
  human_real     0.7981    0.7960    0.7971       755
    gpt_fake     0.9172    0.9974    0.9556       755

    accuracy                         0.8473      2266
   macro avg     0.8444    0.8474    0.8448      2266
weighted avg     0.8444    0.8473    0.8448      2266


Classification Report:
              precision    recall  f1-score   support

  human_fake     0.8035    0.7897    0.7965       756
  human_real     0.8198    0.7894    0.8043       755
   

#### roberta

In [None]:
# Fine-tune RoBERTa on the same train/val splits, for comparison with BERT.
# Note: this rebinds `model_checkpoint_name`, which the BERT cell also sets.
model_checkpoint_name = "roberta-base"
roberta_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_name)
roberta_classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint_name,
    num_labels=3  # three classes: human_fake (0), human_real (1), gpt_fake (2)
)

# fine_tune_classification_model is defined earlier in the notebook.
fine_tune_classification_model(roberta_classification_model, roberta_tokenizer, fake_news_dataset_train, fake_news_dataset_val)

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6798 [00:00<?, ? examples/s]

Map:   0%|          | 0/2266 [00:00<?, ? examples/s]

Map:   0%|          | 0/6798 [00:00<?, ? examples/s]

Map:   0%|          | 0/2266 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.457993,0.792145,0.785615,0.797945,0.792145
2,0.545700,0.380024,0.844219,0.841113,0.844759,0.844219
3,0.321800,0.454408,0.845543,0.843343,0.842403,0.845543



Classification Report:
              precision    recall  f1-score   support

  human_fake     0.6744    0.8108    0.7363       756
  human_real     0.8071    0.5709    0.6687       755
    gpt_fake     0.9125    0.9947    0.9518       755

    accuracy                         0.7921      2266
   macro avg     0.7980    0.7921    0.7856      2266
weighted avg     0.7979    0.7921    0.7856      2266


Classification Report:
              precision    recall  f1-score   support

  human_fake     0.8349    0.6892    0.7551       756
  human_real     0.7616    0.8464    0.8018       755
    gpt_fake     0.9377    0.9974    0.9666       755

    accuracy                         0.8442      2266
   macro avg     0.8448    0.8443    0.8412      2266
weighted avg     0.8448    0.8442    0.8411      2266


Classification Report:
              precision    recall  f1-score   support

  human_fake     0.7888    0.7659    0.7772       756
  human_real     0.8111    0.7735    0.7919       755
   

##CODE SANDBOX

In [None]:

# # Bal_FNN_df = pd.concat([balanced_df_real,balanced_df_real])

# balanced_df.to_csv('/content/drive/MyDrive/DS266/balgptfake.csv', index=False)



In [None]:
# augmented_samples = []

# for idx, row in new_FNN_df[new_FNN_df['label'] == 'gpt_real'].iterrows():
#     augmented = augment_gpt_real(row['text'])
#     augmented_samples.append({
#         'text': augmented,
#         'label': row['label'],
#         'original_length': len(row['text'].split()),
#         'new_length': len(augmented.split())
#     })

# # Convert to DataFrame and merge with original
# balanced_df_real = pd.DataFrame(augmented_samples)


# # Verify new length distribution
# print(balanced_df_real.groupby('label')['text'].apply(
#     lambda x: x.str.split().str.len().describe()
# ))

In [None]:
# balanced_df_real.to_csv('/content/drive/MyDrive/DS266/balanced_df_gptreal.csv', index=False)

In [None]:

# Bal_FNN_df = pd.concat([balanced_df_real,balanced_df_real])

# Bal_FNN_df.to_csv('/content/drive/MyDrive/DS266/balanced_data.csv', index=False)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')  # First mount your Drive

# # Save as CSV
# balanced_df.to_csv('/content/drive/MyDrive/your_folder/balanced_data.csv', index=False)

In [None]:
# Bal_FNN_df.to_parquet('/content/drive/MyDrive/DS266/balanced_data_parq.parquet')