<a href="https://colab.research.google.com/github/sutanmuleta/Detecting-Political-Misinformation-Using-Fine-Tuned-NLP-Models/blob/main/4misinformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face libraries used by the cells below.
# NOTE(review): versions are unpinned, and `datasets` is not imported in any
# visible cell -- confirm it is still needed.
!pip install transformers datasets




In [None]:
import pandas as pd

# Tiny hand-written corpus so the pipeline can be exercised without real data.
# Each entry pairs a statement with its rating string.
_mock_rows = [
    ('The economy is booming under the current administration.', 'mostly-true'),
    ('Healthcare costs will be reduced significantly.', 'half-true'),
    ('The candidate promises to lower taxes and increase jobs.', 'true'),
    ('Reports show a decrease in crime rates nationwide.', 'false'),
    ('New policies have improved the education system tremendously.', 'barely-true'),
]

# Column-oriented view of the same rows: one list per DataFrame column.
data = {
    'text': [statement for statement, _rating in _mock_rows],
    'label': [rating for _statement, rating in _mock_rows],
}

# Two-column frame ('text', 'label') used by the following cells.
dataset = pd.DataFrame(data)

# Preview the rows as plain text.
print(dataset.head())


                                                text        label
0  The economy is booming under the current admin...  mostly-true
1    Healthcare costs will be reduced significantly.    half-true
2  The candidate promises to lower taxes and incr...         true
3  Reports show a decrease in crime rates nationw...        false
4  New policies have improved the education syste...  barely-true


In [None]:
from transformers import BertTokenizer

# Pretrained vocabulary/tokenizer matching the model checkpoint used later.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize ``examples['text']`` with the notebook-level BERT tokenizer.

    The text is truncated to at most 512 tokens and padded up to 512
    (``padding='max_length'``), so every encoding has the same length.
    """
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )


# Tokenize one statement per call so each row stores its own encoding object.
dataset['tokens'] = dataset['text'].apply(
    lambda statement: tokenize_function({'text': statement})
)

# Show the first entries of the new column as a sanity check.
print(dataset['tokens'].head())


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

0    [input_ids, token_type_ids, attention_mask]
1    [input_ids, token_type_ids, attention_mask]
2    [input_ids, token_type_ids, attention_mask]
3    [input_ids, token_type_ids, attention_mask]
4    [input_ids, token_type_ids, attention_mask]
Name: tokens, dtype: object


In [None]:
from sklearn.model_selection import train_test_split
import torch

# Give each distinct label string an integer id, in order of first appearance.
_distinct_labels = dataset['label'].unique()
label_map = dict(zip(_distinct_labels, range(len(_distinct_labels))))
dataset['numeric_label'] = dataset['label'].map(label_map)

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    dataset['tokens'],
    dataset['numeric_label'],
    test_size=0.2,
    random_state=42,
)

# Torch dataset wrapper around column-oriented encodings and their labels
class MisinformationDataset(torch.utils.data.Dataset):
    """Serve one tokenized example (plus its label) per index.

    ``encodings`` is a mapping of field name -> per-example values and
    ``labels`` is indexable by the same position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label collection defines how many examples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick position ``idx`` out of every field and convert it to a tensor.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Convert the tokenized inputs and labels into a dataset object.
# `train_texts` / `test_texts` are pandas Series holding one tokenizer output
# per row, and the label Series keep their shuffled original index after the
# split. MisinformationDataset calls `.items()` on the encodings and indexes
# labels by position, so both are reshaped here before being wrapped:
#   * encodings -> {field name: [value for each row]}
#   * labels    -> plain list (position-indexed)
def _columns_from_rows(row_encodings):
    """Turn an iterable of per-row encodings into ``{field: [row values, ...]}``."""
    rows = list(row_encodings)
    if not rows:
        return {}
    return {field: [row[field] for row in rows] for field in rows[0].keys()}


train_dataset = MisinformationDataset(_columns_from_rows(train_texts), train_labels.tolist())
test_dataset = MisinformationDataset(_columns_from_rows(test_texts), test_labels.tolist())


In [None]:
from torch.utils.data import Dataset

# Define the custom dataset class
class MisinformationDataset(Dataset):
    """Torch dataset over tokenized text and labels.

    ``encodings`` may be laid out either way:

    * a mapping of field name -> per-example values, which is what a batched
      tokenizer call returns (``encodings['input_ids'][idx]``); or
    * a sequence (or index-keyed mapping) of per-example dicts
      (``encodings[idx]['input_ids']``), the only layout previously accepted.

    ``labels`` must be indexable by position and contain values that
    ``torch.tensor`` accepts (i.e. numbers, not label strings).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def _example_fields(self, idx):
        """Return ``(input_ids, attention_mask)`` for example ``idx``."""
        # Local import keeps this cell runnable without touching other cells.
        from collections.abc import Mapping

        encodings = self.encodings
        # Column-major: the field names are the keys of the mapping itself.
        if isinstance(encodings, Mapping) and 'input_ids' in encodings:
            return encodings['input_ids'][idx], encodings['attention_mask'][idx]
        # Row-major: each element is a per-example dict.
        row = encodings[idx]
        return row['input_ids'], row['attention_mask']

    def __getitem__(self, idx):
        input_ids, attention_mask = self._example_fields(idx)
        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(self.labels[idx])
        }

    def __len__(self):
        return len(self.labels)


In [None]:
from sklearn.model_selection import train_test_split

# Assuming 'dataset' is your full DataFrame with columns 'text' and 'label'
# Split the dataset into training and test sets
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert the 'text' column to lists for tokenization
train_texts = train_df['text'].tolist()
test_texts = test_df['text'].tolist()

# Labels must be integer class ids: the dataset class wraps each label in
# torch.tensor(...), which cannot be built from a string such as 'half-true'.
# The id mapping is derived from the full DataFrame so both splits share it.
_label_to_id = {label: idx for idx, label in enumerate(dataset['label'].unique())}
train_labels = train_df['label'].map(_label_to_id).tolist()
test_labels = test_df['label'].map(_label_to_id).tolist()

# Tokenize the data (each split is padded to its own longest sequence)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Create the dataset
train_dataset = MisinformationDataset(train_encodings, train_labels)
test_dataset = MisinformationDataset(test_encodings, test_labels)


In [None]:
import torch
from torch.utils.data import Dataset

class MisinformationDataset(Dataset):
    """Index-addressable view over tokenizer output and matching labels.

    ``encodings`` maps each field name to a per-example collection; ``labels``
    is indexed with the same position.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Tensor-ize the idx-th entry of every field, then attach the label.
        item = dict(
            (key, torch.tensor(val[idx])) for key, val in self.encodings.items()
        )
        item.update(labels=torch.tensor(self.labels[idx]))
        return item


In [None]:
from transformers import BertForSequenceClassification

# Determine the number of unique labels.
# Count classes over the whole labelled DataFrame, not just the training
# split: with a held-out test set, a class can be missing from `train_labels`,
# which would make the classification head too small for the label ids that
# were assigned over the full data.
unique_labels = set(dataset['label'])
num_labels = len(unique_labels)

# Load the pre-trained BERT model for sequence classification.
# The classifier layer on top is newly initialised and sized by `num_labels`.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for the Hugging Face Trainer, gathered in one mapping.
_training_config = dict(
    output_dir='./results',          # where run outputs are written
    num_train_epochs=3,              # passes over the training set
    per_device_train_batch_size=8,   # training batch size per device
    per_device_eval_batch_size=8,    # evaluation batch size per device
    warmup_steps=500,                # learning-rate warmup length
    weight_decay=0.01,               # weight decay strength
    logging_dir='./logs',            # where logs are written
    logging_steps=10,                # log every 10 steps
    eval_strategy="epoch",           # evaluate once per epoch (formerly `evaluation_strategy`)
)
training_args = TrainingArguments(**_training_config)

# Bind the model, the arguments, and both dataset splits together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [None]:
import zipfile

# Uploaded archive and the folder its contents are unpacked into
zip_file_path = '/content/archive (3).zip'
_unpack_dir = '/content/archive'

# Unpack every member of the archive into the target folder
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(_unpack_dir)

# Needed for the directory listing below
import os

# Print one extracted entry per line to confirm what was unpacked
_unpacked_entries = os.listdir(_unpack_dir)
for file in _unpacked_entries:
    print(file)


PolitiFactNewsUser.txt
BuzzFeed_fake_news_content.csv
BuzzFeedUser.txt
BuzzFeedUserFeature.mat
PolitiFactUserFeature.mat
BuzzFeedNewsUser.txt
BuzzFeed_real_news_content.csv
PolitiFact_fake_news_content.csv
PolitiFact_real_news_content.csv
PolitiFactUser.txt
PolitiFactNews.txt
PolitiFactUserUser.txt
BuzzFeedUserUser.txt
BuzzFeedNews.txt


In [None]:
import pandas as pd

# Read the four article tables: fake and real articles for each source
buzzfeed_fake_news_df = pd.read_csv('/content/archive/BuzzFeed_fake_news_content.csv')
buzzfeed_real_news_df = pd.read_csv('/content/archive/BuzzFeed_real_news_content.csv')
politifact_fake_news_df = pd.read_csv('/content/archive/PolitiFact_fake_news_content.csv')
politifact_real_news_df = pd.read_csv('/content/archive/PolitiFact_real_news_content.csv')

# Preview each table under its own heading to confirm it loaded
_previews = (
    ("BuzzFeed Fake News Data:", buzzfeed_fake_news_df),
    ("\nBuzzFeed Real News Data:", buzzfeed_real_news_df),
    ("\nPolitiFact Fake News Data:", politifact_fake_news_df),
    ("\nPolitiFact Real News Data:", politifact_real_news_df),
)
for _heading, _frame in _previews:
    print(_heading)
    print(_frame.head())


BuzzFeed Fake News Data:
                id                                              title  \
0   Fake_1-Webpage  Proof The Mainstream Media Is Manipulating The...   
1  Fake_10-Webpage  Charity: Clinton Foundation Distributed “Water...   
2  Fake_11-Webpage  A Hillary Clinton Administration May be Entire...   
3  Fake_12-Webpage  Trump’s Latest Campaign Promise May Be His Mos...   
4  Fake_13-Webpage                    Website is Down For Maintenance   

                                                text  \
0  I woke up this morning to find a variation of ...   
1  Former President Bill Clinton and his Clinton ...   
2  After collapsing just before trying to step in...   
3  Donald Trump is, well, deplorable. He’s sugges...   
4                    Website is Down For Maintenance   

                                                 url  \
0  http://www.addictinginfo.org/2016/09/19/proof-...   
1  http://eaglerising.com/36899/charity-clinton-f...   
2  http://eaglerising.com/36880

In [None]:
from sklearn.model_selection import train_test_split

# Tag every source table in place: 0 marks fake articles, 1 marks real ones
for _frame, _flag in (
    (buzzfeed_fake_news_df, 0),
    (buzzfeed_real_news_df, 1),
    (politifact_fake_news_df, 0),
    (politifact_real_news_df, 1),
):
    _frame['label'] = _flag

# Stack the four tables and renumber the rows from zero
_source_frames = [
    buzzfeed_fake_news_df,
    buzzfeed_real_news_df,
    politifact_fake_news_df,
    politifact_real_news_df,
]
combined_df = pd.concat(_source_frames, ignore_index=True)

# Preview the stacked table
print("Combined Data:")
print(combined_df.head())

# Keep only the model input and its target
data_df = combined_df[['text', 'label']]

# 80/20 split with a fixed seed so it can be repeated
train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42)

# Report how many rows landed in each split
print(f"Training Set Size: {len(train_df)}")
print(f"Testing Set Size: {len(test_df)}")


Combined Data:
                id                                              title  \
0   Fake_1-Webpage  Proof The Mainstream Media Is Manipulating The...   
1  Fake_10-Webpage  Charity: Clinton Foundation Distributed “Water...   
2  Fake_11-Webpage  A Hillary Clinton Administration May be Entire...   
3  Fake_12-Webpage  Trump’s Latest Campaign Promise May Be His Mos...   
4  Fake_13-Webpage                    Website is Down For Maintenance   

                                                text  \
0  I woke up this morning to find a variation of ...   
1  Former President Bill Clinton and his Clinton ...   
2  After collapsing just before trying to step in...   
3  Donald Trump is, well, deplorable. He’s sugges...   
4                    Website is Down For Maintenance   

                                                 url  \
0  http://www.addictinginfo.org/2016/09/19/proof-...   
1  http://eaglerising.com/36899/charity-clinton-f...   
2  http://eaglerising.com/36880/a-hillary

In [None]:
from transformers import BertTokenizer

# Pretrained BERT tokenizer (uncased vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_data(texts):
    """Batch-tokenize ``texts`` and return the encodings as PyTorch tensors.

    ``texts`` must provide ``.tolist()`` (for example a pandas Series).
    Sequences are truncated to at most 512 tokens and padded within the batch
    (``padding=True``).
    """
    batch = texts.tolist()
    return tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )


# Encode both splits
train_encodings = tokenize_data(train_df['text'])
test_encodings = tokenize_data(test_df['text'])

# torch is needed to wrap the label arrays
import torch

# Label columns as tensors, in the same row order as the encodings
train_labels = torch.tensor(train_df['label'].values)
test_labels = torch.tensor(test_df['label'].values)

# Token ids of the first training example, as a spot check
print("Sample Encoding:", train_encodings['input_ids'][0])


Sample Encoding: tensor([  101,  2006,  2865,  9927,  8264,  7276,  3058,  1529,  2285,  1010,
         2325,  2281,  1010,  2325,  2255,  1010,  2325,  2244,  1010,  2325,
         2257,  1010,  2325,  2251,  1010,  2325,  2238,  1010,  2325,  2089,
         1010,  2325,  2258,  1010,  2325,  2233,  1010,  2325,  2337,  1010,
         2325,  2254,  1010,  2325,  7832,  2005,  2034,  4883,  5981,  2623,
        14131, 12621,  1010,  1000,  6788, 22390,  2739,  1000,  8133,  1998,
        29420,  1997,  1996,  2034,  4883,  5981,  5115,  2005,  2279,  6928,
         2623,  1996,  7832,  2008,  2097,  3710,  2004,  1037,  8660,  2125,
         2391,  2005,  2010,  3980,  1012,  1996,  7832,  2421,  1000,  2637,
         1005,  1055,  3257,  1010,  1000,  1000, 10910, 14165,  1000,  1998,
         1000, 12329,  2637,  1012,  1000,  1996,  8874,  2234,  2007,  1037,
         5430,  4017,  2008,  1996,  7832,  2071,  2689,  2349,  2000,  2739,
         8973,  1012,  1996,  5981,  1010,  202

In [None]:
import pandas as pd
import zipfile
import os

# Define the path where you uploaded the zip file
zip_path = '/content/archive (3).zip'

# Define the directory to extract the files
extract_dir = '/content/extracted_files'

# Create the directory if it doesn't exist. exist_ok=True replaces the separate
# os.path.exists() check, so there is no gap between checking and creating.
os.makedirs(extract_dir, exist_ok=True)

# Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Build the paths of the two BuzzFeed article tables inside the extraction dir
fake_news_path = os.path.join(extract_dir, 'BuzzFeed_fake_news_content.csv')
real_news_path = os.path.join(extract_dir, 'BuzzFeed_real_news_content.csv')

# Load the datasets
fake_news = pd.read_csv(fake_news_path)
real_news = pd.read_csv(real_news_path)

# Display the first few rows of each dataset to verify they loaded correctly
print("Fake News Dataset:")
print(fake_news.head())

print("\nReal News Dataset:")
print(real_news.head())


Fake News Dataset:
                id                                              title  \
0   Fake_1-Webpage  Proof The Mainstream Media Is Manipulating The...   
1  Fake_10-Webpage  Charity: Clinton Foundation Distributed “Water...   
2  Fake_11-Webpage  A Hillary Clinton Administration May be Entire...   
3  Fake_12-Webpage  Trump’s Latest Campaign Promise May Be His Mos...   
4  Fake_13-Webpage                    Website is Down For Maintenance   

                                                text  \
0  I woke up this morning to find a variation of ...   
1  Former President Bill Clinton and his Clinton ...   
2  After collapsing just before trying to step in...   
3  Donald Trump is, well, deplorable. He’s sugges...   
4                    Website is Down For Maintenance   

                                                 url  \
0  http://www.addictinginfo.org/2016/09/19/proof-...   
1  http://eaglerising.com/36899/charity-clinton-f...   
2  http://eaglerising.com/36880/a-hil

In [None]:
# List the extracted files
# Relies on `os` and `extract_dir` defined by the extraction cell above;
# the listing is printed as a single Python list.
extracted_files = os.listdir(extract_dir)
print("Extracted files:", extracted_files)


Extracted files: ['PolitiFactNewsUser.txt', 'BuzzFeed_fake_news_content.csv', 'BuzzFeedUser.txt', 'BuzzFeedUserFeature.mat', 'PolitiFactUserFeature.mat', 'BuzzFeedNewsUser.txt', 'BuzzFeed_real_news_content.csv', 'PolitiFact_fake_news_content.csv', 'PolitiFact_real_news_content.csv', 'PolitiFactUser.txt', 'PolitiFactNews.txt', 'PolitiFactUserUser.txt', 'BuzzFeedUserUser.txt', 'BuzzFeedNews.txt']


In [None]:
import pandas as pd

# Load the datasets
fake_news = pd.read_csv('/content/extracted_files/BuzzFeed_fake_news_content.csv')
real_news = pd.read_csv('/content/extracted_files/BuzzFeed_real_news_content.csv')

# Add labels: 0 for fake news, 1 for real news
fake_news['label'] = 0
real_news['label'] = 1

# Combine the datasets (fake rows first, then real rows)
news_data = pd.concat([fake_news, real_news], ignore_index=True)

# Shuffle the dataset. The seed is fixed (42, as in the train/test splits
# elsewhere in this notebook) so the row order -- and therefore the training
# run that consumes it -- is the same on every execution.
news_data = news_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Display the first few rows of the combined dataset
print(news_data.head())


                id                                              title  \
0   Real_9-Webpage  Georgia poll: Donald Trump, Hillary Clinton in...   
1  Real_61-Webpage     “Why Aren’t I 50 Points Ahead?” – Eagle Rising   
2  Fake_23-Webpage  Trump Just Made A Campaign Promise So Ridiculo...   
3  Fake_84-Webpage  OUTRAGE! Obama Spends $770M Dollars In Tax Pay...   
4  Fake_32-Webpage  NYC Terrorist Ahmad Rahami Sued Police Departm...   

                                                text  \
0  Story highlights Trump has 45%, Clinton 42% an...   
1  Liberals in the media and the political arena ...   
2  \n\nPosted by Frank Wilkenmeyer on 19 Sep 2016...   
3  OUTRAGE! Obama Spends $770M Dollars In Tax Pay...   
4  196 SHARES Facebook Twitter\n\nFor a little ov...   

                                                 url  \
0                              http://cnn.it/2cynaZx   
1  http://eaglerising.com/36936/hillary-wants-to-...   
2  http://winningdemocrats.com/trump-just-made-a-...   


In [None]:
from transformers import BertTokenizer

# Pretrained BERT tokenizer (uncased vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize ``examples``, truncating and padding to 512 tokens.

    Defined for reuse; the encoding step below calls ``tokenizer.encode``
    directly instead.
    """
    return tokenizer(
        examples,
        padding='max_length',
        truncation=True,
        max_length=512,
    )


def _encode_article(article_text):
    """Return the token-id list for one article, capped at 512 ids."""
    return tokenizer.encode(
        article_text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
    )


# One list of token ids per article
tokenized_data = news_data['text'].apply(_encode_article)

# Preview the encoded articles
print(tokenized_data.head())


0    [101, 2466, 11637, 8398, 2038, 3429, 1003, 101...
1    [101, 13350, 1999, 1996, 2865, 1998, 1996, 257...
2    [101, 6866, 2011, 3581, 19863, 7520, 24344, 20...
3    [101, 19006, 999, 8112, 15970, 1002, 29065, 22...
4    [101, 20035, 6661, 9130, 10474, 2005, 1037, 22...
Name: text, dtype: object


In [None]:
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    """Dataset that tokenizes each article on access rather than up front.

    Args:
        texts: indexable collection of article strings.
        labels: indexable collection of class ids aligned with ``texts``.
        tokenizer: object exposing ``encode_plus``.
        max_len: length every example is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        text, label = self.texts[idx], self.labels[idx]

        # Fixed-length encoding: pad/truncate to max_len, return tensors,
        # and skip token-type ids.
        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        # The tokenizer output carries a leading batch axis; flatten drops it.
        sample = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
        }
        sample['label'] = torch.tensor(label, dtype=torch.long)
        return sample

# Token budget per article
MAX_LEN = 512

# Wrap the shuffled articles and their labels as plain lists
news_dataset = NewsDataset(
    texts=news_data['text'].tolist(),
    labels=news_data['label'].tolist(),
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)

# Report how many examples the dataset holds
print(f"Dataset size: {len(news_dataset)}")


Dataset size: 182


In [None]:
from transformers import Trainer, TrainingArguments, default_data_collator

# `data_collator` was referenced below but never defined in this notebook, so
# a fresh kernel raised NameError. NewsDataset already pads every example to
# the same length (padding='max_length'), so the library's default collator --
# which stacks the per-example tensors and exposes a `label` field as
# `labels` -- is sufficient.
data_collator = default_data_collator

# Define training arguments with no evaluation strategy
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="no"  # Disable evaluation
)

# Create Trainer instance without evaluation dataset.
# NOTE(review): `model` is whatever an earlier cell loaded; its head size was
# derived from the mock labels, not from this binary fake/real task -- confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=news_dataset,
    data_collator=data_collator,
)

# Fine-tune the model (last expression, so the TrainOutput is displayed)
trainer.train()


Step,Training Loss
10,0.6406
20,0.697
30,0.6945
40,0.6264


TrainOutput(global_step=46, training_loss=0.6658554284468942, metrics={'train_runtime': 1192.1542, 'train_samples_per_second': 0.153, 'train_steps_per_second': 0.039, 'total_flos': 47886212075520.0, 'train_loss': 0.6658554284468942, 'epoch': 1.0})

In [None]:
# Function to predict the label of a news article
def predict(text):
    """Classify one news article as 'Fake' or 'Real' with the fine-tuned model.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: the article text (a single string).

    Returns:
        'Fake' for class 0 and 'Real' for class 1. If the model's head has
        more than two outputs and another class wins, 'Unknown (<class id>)'
        is returned instead of raising a bare KeyError.
    """
    # Tokenize the input text
    inputs = tokenizer(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding=True
    )

    # Training may have left the model on an accelerator; the inputs have to
    # live on the same device as the weights.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Switch to inference mode: after training the model is still in train
    # mode, where dropout makes repeated predictions differ.
    model.eval()

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        # Single input -> one row of probabilities, so the flat argmax is the class id.
        predicted_label = torch.argmax(predictions).item()

    # Map the prediction to the corresponding label
    label_map = {0: 'Fake', 1: 'Real'}
    return label_map.get(predicted_label, f'Unknown ({predicted_label})')

# Test the prediction function with a sample text
sample_text = "Obama is not american"
prediction = predict(sample_text)
print(f"Prediction: {prediction}")


Prediction: Fake


In [None]:
# Save the model and tokenizer
# Both are written to the same directory so one path restores them later via
# from_pretrained. The tokenizer call is the cell's last expression, so its
# return value (the saved file paths) is what the notebook displays.
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [None]:
from transformers import BertForSequenceClassification

# Restore both artifacts from the directory they were saved to
_saved_dir = './model'
tokenizer = BertTokenizer.from_pretrained(_saved_dir)
model = BertForSequenceClassification.from_pretrained(_saved_dir)


In [None]:
 Function to predict the label of a news article
def predict(text):
    # Tokenize the input text
    inputs = tokenizer(
        text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
        padding=True
    )

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_label = torch.argmax(predictions).item()

    # Map the prediction to the corresponding label
    label_map = {0: 'Fake', 1: 'Real'}
    return label_map[predicted_label]

# Test the prediction function with a sample text
sample_text = "Obama is not american."
prediction = predict(sample_text)
print(f"Prediction: {prediction}")
