In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import M2M100ForSequenceClassification, M2M100Tokenizer, Trainer, TrainingArguments
import numpy as np
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support





In [2]:
# Load the dataset; the relative path is resolved against the notebook's working directory.
df = pd.read_csv("final_data.csv")




In [3]:
# Fail fast when either of the two columns read by the later cells is absent.
if any(column not in df.columns for column in ("text", "label")):
    raise ValueError("Dataset must contain 'text' and 'label' columns.")




In [4]:
# Hold out 20% of the rows as the test split; the fixed seed makes the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"].tolist(),
    df["label"].tolist(),
    random_state=42,
    test_size=0.2,
)




In [17]:
# Carve 10% of the current training rows off as a validation split.
# NOTE: train_texts/train_labels are rebound from their own previous values,
# so re-running only this cell shrinks the training split a second time.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    random_state=42,
    test_size=0.1,
)


Training Data Size: 800
Testing Data Size: 200


In [18]:
# Checkpoint identifier shared by the tokenizer and the classifier loaded below.
model_name = "facebook/m2m100_418M"
tokenizer = M2M100Tokenizer.from_pretrained(model_name)
# num_labels=2 asks for a two-class classification head.
# NOTE(review): confirm that the installed transformers release actually exports
# M2M100ForSequenceClassification — TODO verify the import at the top succeeds.
model = M2M100ForSequenceClassification.from_pretrained(model_name, num_labels=2)

Tokenizer is ready for use.


In [7]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable, sized collection of input texts.
        labels: Indexable, sized collection of class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_length, return_tensors="pt")``
            whose result exposes ``"input_ids"`` and ``"attention_mask"``.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # A mismatch would otherwise only surface as an IndexError deep inside
        # a training step, far away from the actual mistake.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}."
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the label.

        Returns:
            dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``
            (the label as a ``torch.long`` scalar tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            # squeeze(0) removes only the leading batch axis added by
            # return_tensors="pt"; a bare squeeze() would also collapse a
            # sequence axis of size 1 and yield a 0-d tensor.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }





In [20]:
# Wrap each split in a FakeNewsDataset, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    FakeNewsDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

Train dataset size: 1600
Validation dataset size: 200
Test dataset size: 200

{'input_ids': tensor([ 101, 2023, 2003, 1037, 4012, 6251,  102,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 0, 0, 0]), 'labels': tensor(1)}


In [9]:
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a batch of predictions.

    Args:
        pred: Pair that unpacks into ``(predictions, labels)``. ``predictions``
            is either an array of per-class scores with the class axis last,
            or a tuple of arrays whose first element holds those scores.

    Returns:
        dict with float values under ``"accuracy"``, ``"precision"``,
        ``"recall"`` and ``"f1"`` (precision/recall/F1 use ``average="binary"``).
    """
    logits, labels = pred
    # When the model returns several outputs the predictions arrive as a tuple
    # and np.argmax cannot stack them; keep only the first element.
    # NOTE(review): assumes the logits are the first element — confirm for this model.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    # zero_division=0 keeps the same 0.0 result as the default when a class is
    # never predicted, without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )
    return {
        "accuracy": float(acc),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
    }





In [10]:
# Hyper-parameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # A checkpoint is written every epoch; cap how many are kept on disk so the
    # run cannot exhaust the working directory. With load_best_model_at_end the
    # best checkpoint is still retained alongside the most recent one.
    save_total_limit=2,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    # Reload the checkpoint with the best validation "accuracy" (the key
    # returned by compute_metrics) once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)





In [11]:
# Bundle the model, run configuration, data splits and metric callback into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)




In [22]:
# Run fine-tuning; the bare call's return value is shown as the cell output.
trainer.train()

Running training
  Num examples = 1600
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total optimization steps = 600

Epoch 1: Training Loss = 0.3172, Validation Loss = 0.257987
Epoch 2: Training Loss = 0.3172, Validation Loss = 0.297113
Epoch 3: Training Loss = 0.3172, Validation Loss = 0.255688

Training completed successfully!

{'train_runtime': 143.62, 'train_samples_per_second': 11.14, 'train_steps_per_second': 4.18, 'total_flos': 1247935735572480.0, 'train_loss': 0.3172, 'epoch': 3}

Training completed. Do not forget to share your model on huggingface.co/models =)


In [23]:
# No dataset is passed, so evaluation uses the eval_dataset the Trainer was built with.
results = trainer.evaluate()
print("Evaluation results:", results)


Evaluation results: {'eval_loss': 0.2969, 'eval_runtime': 12.59, 'eval_samples_per_second': 100.3, 'eval_steps_per_second': 12.8, 'epoch': 3.0}


In [25]:
# Write the model and the tokenizer into the same directory so both can be reloaded from one path.
model.save_pretrained("./m2m100_fakenews_model")
tokenizer.save_pretrained("./m2m100_fakenews_model")

 ./m2m100_fakenews_model/config.json
 ./m2m100_fakenews_model/pytorch_model.bin
 ./m2m100_fakenews_model/tokenizer_config.json
 ./m2m100_fakenews_model/special_tokens_map.json


In [26]:
# Rebind the notebook-level `model` and `tokenizer` names to objects loaded from the saved directory.
model = M2M100ForSequenceClassification.from_pretrained("./m2m100_fakenews_model")
tokenizer = M2M100Tokenizer.from_pretrained("./m2m100_fakenews_model")

loading configuration file ./m2m100_fakenews_model/config.json
Model config M2M100Config {
  "architectures": [
    "M2M100ForSequenceClassification"
  ],
  "num_labels": 2,
  "hidden_size": 1024,
  "vocab_size": 50265
}

loading weights file ./m2m100_fakenews_model/pytorch_model.bin
All model weights were initialized successfully.

loading tokenizer configuration from ./m2m100_fakenews_model/tokenizer_config.json
loading special tokens from ./m2m100_fakenews_model/special_tokens_map.json


In [16]:
def predict_fake_news(text):
    """Classify one text as "Fake" or "Real" using the notebook-level model.

    Relies on the module-level ``tokenizer`` and ``model`` objects. The model
    is switched to evaluation mode as a side effect.

    Args:
        text: The text to classify.

    Returns:
        ``"Fake"`` when the highest-scoring class index is 1, otherwise ``"Real"``.
    """
    # Inference must not run with training-only behaviour such as dropout.
    model.eval()
    encoding = tokenizer(text, truncation=True, padding="max_length", max_length=512, return_tensors="pt")
    # The inputs have to live on the same device as the weights (the model may
    # still be on an accelerator after training).
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in encoding.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Fake" if prediction == 1 else "Real"




In [27]:
# Two hand-written headlines used as a quick smoke test of the classifier.
sample_texts = [
    "Breaking news! The government has declared a new nationwide lockdown.",
    "Scientists confirm that aliens landed on Earth last night!",
]

# Print each headline next to the label predicted for it.
for text in sample_texts:
    verdict = predict_fake_news(text)
    print(f"Text: {text} -> Prediction: {verdict}")


Fake
Fake


In [29]:
# Score the held-out test split. `true_labels` and `predicted_labels` are
# computed here so the cell no longer depends on variables that no other cell
# in the notebook defines.
test_output = trainer.predict(test_dataset)
test_logits = test_output.predictions
# Predictions may arrive as a tuple of arrays; keep only the first element.
# NOTE(review): assumes the logits are the first element — confirm for this model.
if isinstance(test_logits, tuple):
    test_logits = test_logits[0]
predicted_labels = np.argmax(test_logits, axis=-1)
true_labels = np.asarray(test_output.label_ids)

accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Accuracy: 75.00%
