In [6]:
!pip install transformers

import warnings
warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"

# Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertModel
from transformers import BertForSequenceClassification



# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
file_path = 'Twitter16Dataset.csv'
df = pd.read_csv(file_path, names=["Label", "SourceID", "Tweet"], header=0)


# Define preprocessing functions
def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_special_characters(text):
    return re.sub(r'\W', ' ', text)

def remove_mentions(text):
    return re.sub(r'@\w+', '', text)

def tokenize(text):
    return text.split()

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

# Preprocess the dataset
df['Tweet'] = df['Tweet'].apply(remove_urls)
df['Tweet'] = df['Tweet'].apply(remove_mentions)
df['Tweet'] = df['Tweet'].apply(remove_special_characters)
df['Tweet'] = df['Tweet'].apply(tokenize)
df['Tweet'] = df['Tweet'].apply(remove_stopwords)
df['Tweet'] = df['Tweet'].apply(lemmatize)
df['Tweet'] = df['Tweet'].apply(lambda tokens: ' '.join(tokens))

# Encode the labels
label_mapping = {'unverified': 0, 'non-rumor': 1, 'TRUE': 2, 'FALSE': 3}
df['Label'] = df['Label'].map(label_mapping)

# Add the line to remove rows with missing labels
df = df.dropna(subset=['Label'])

# Split the dataset
train_df, val_df = train_test_split(df, test_size=0.4, random_state=42, stratify=df['Label'])

# Save the preprocessed datasets to CSV files
train_df.to_csv('train_preprocessed.csv', index=False)
val_df.to_csv('val_preprocessed.csv', index=False)

# Vectorize the preprocessed text
# Create TF-IDF feature vectors
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['Tweet'])
X_val = vectorizer.transform(val_df['Tweet'])
y_train = train_df['Label']
y_val = val_df['Label']


# Train and evaluate the SVM model
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_y_pred = svm_model.predict(X_val)

svm_accuracy = accuracy_score(y_val, svm_y_pred)
svm_precision = precision_score(y_val, svm_y_pred, average='weighted')
svm_recall = recall_score(y_val, svm_y_pred, average='weighted')
svm_f1 = f1_score(y_val, svm_y_pred, average='weighted')

print("SVM Model:")
print(f"Accuracy: {svm_accuracy:.4f}")
print(f"Precision: {svm_precision:.4f}")
print(f"Recall: {svm_recall:.4f}")
print(f"F1 Score: {svm_f1:.4f}")

# Train and evaluate the Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_val)

lr_accuracy = accuracy_score(y_val, lr_y_pred)
lr_precision = precision_score(y_val, lr_y_pred, average='weighted')
lr_recall = recall_score(y_val, lr_y_pred, average='weighted')
lr_f1 = f1_score(y_val, lr_y_pred, average='weighted')


print("Logistic Regression Model:")
print(f"Accuracy: {lr_accuracy:.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall: {lr_recall:.4f}")
print(f"F1 Score: {lr_f1:.4f}")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


SVM Model:
Accuracy: 0.8140
Precision: 0.8620
Recall: 0.8140
F1 Score: 0.8215
Logistic Regression Model:
Accuracy: 0.8049
Precision: 0.8181
Recall: 0.8049
F1 Score: 0.8074


In [7]:
# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup



# Define the RumorDataset class
class RumorDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['Tweet']
        label = self.data.iloc[idx]['Label']
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(int(label), dtype=torch.long)
        }

# Load the ConfliBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("snowood1/ConfliBERT-scr-uncased")
model = AutoModelForSequenceClassification.from_pretrained("snowood1/ConfliBERT-scr-uncased", num_labels=4)  # Use the correct model and set the number of labels


# Define constants
MAX_LENGTH = 128
BATCH_SIZE = 32

# Create DataLoader for the training and validation datasets
train_dataset = RumorDataset(train_df, tokenizer, MAX_LENGTH)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataset = RumorDataset(val_df, tokenizer, MAX_LENGTH)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

Some weights of the model checkpoint at snowood1/ConfliBERT-scr-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at snowood1/Conf

In [8]:
import torch.nn as nn
from sklearn.metrics import classification_report

def train_model(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask)
        loss = nn.CrossEntropyLoss()(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

def eval_model(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    loss_fn = nn.CrossEntropyLoss()
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.detach().cpu().numpy())
            all_labels.extend(labels.detach().cpu().numpy())

    return total_loss / len(dataloader), classification_report(all_labels, all_preds, output_dict=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

EPOCHS = 30
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0
WARMUP_STEPS = 300

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=EPOCHS * len(train_dataloader))

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train_model(model, train_dataloader, optimizer, scheduler, device)
    val_loss, val_report = eval_model(model, val_dataloader, device)
    print(f"Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")
    print(f"Val report: {val_report}")

# Save the model
output_dir = "rumor_detection_model"
tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir)

Epoch 1/30
Train loss: 1.4345, Val loss: 1.4197
Val report: {'0': {'precision': 0.17391304347826086, 'recall': 0.04938271604938271, 'f1-score': 0.07692307692307693, 'support': 81}, '1': {'precision': 0.3333333333333333, 'recall': 0.024390243902439025, 'f1-score': 0.04545454545454545, 'support': 82}, '2': {'precision': 0.2535211267605634, 'recall': 0.43373493975903615, 'f1-score': 0.32, 'support': 83}, '3': {'precision': 0.24203821656050956, 'recall': 0.4634146341463415, 'f1-score': 0.3179916317991632, 'support': 82}, 'accuracy': 0.24390243902439024, 'macro avg': {'precision': 0.2507014300331668, 'recall': 0.24273063346429985, 'f1-score': 0.19009231354419637, 'support': 328}, 'weighted avg': {'precision': 0.2509441376041494, 'recall': 0.24390243902439024, 'f1-score': 0.19083340172430896, 'support': 328}}
Epoch 2/30
Train loss: 1.4020, Val loss: 1.4046
Val report: {'0': {'precision': 0.18947368421052632, 'recall': 0.2222222222222222, 'f1-score': 0.20454545454545453, 'support': 81}, '1': 

In [9]:
import torch
from transformers import AutoModel, AutoTokenizer

# Load the pretrained model
output_dir = "rumor_detection_model"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Load the trained model
model = BertForSequenceClassification.from_pretrained(output_dir)
model.to(device)
model.eval()

# Define prediction function
def predict_rumor_status(text, model, tokenizer, max_length):
    # Encode the input text
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Make prediction
    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

    # Map predictions to labels
    label_mapping = {
        0: "true-rumour",
        1: "false-rumour",
        2: "unverified-rumour",
        3: "non-rumour",
    }
    return label_mapping[preds.item()]

# Test the prediction function with an example text
example_text = "Breaking news: Giant pandas can now fly!"
predicted_status = predict_rumor_status(example_text, model, tokenizer, MAX_LENGTH)
print(f"Predicted rumor status: {predicted_status}")

Predicted rumor status: unverified-rumour


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


def evaluate_model(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.detach().cpu().numpy())
            all_labels.extend(labels.detach().cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print("Evaluation:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Per label performance:")
    print(classification_report(all_labels, all_preds))
    
# Evaluate the model
evaluate_model(model, val_dataloader, device)


Evaluation:
Accuracy: 0.7805
Precision: 0.7868
Recall: 0.7805
F1 Score: 0.7823
Per label performance:
              precision    recall  f1-score   support

           0       0.85      0.74      0.79        81
           1       0.67      0.73      0.70        82
           2       0.91      0.89      0.90        83
           3       0.72      0.76      0.74        82

    accuracy                           0.78       328
   macro avg       0.79      0.78      0.78       328
weighted avg       0.79      0.78      0.78       328



In [11]:
example_texts = [
    "Breaking news: Giant pandas can now fly!",
    "A new scientific study shows that drinking coffee can reduce the risk of heart diseases.",
    "The US government has announced a new stimulus package to support small businesses.",
]

for text in example_texts:
    predicted_status = predict_rumor_status(text, model, tokenizer, MAX_LENGTH)
    print(f"Text: {text}\nPredicted rumor status: {predicted_status}\n")


Text: Breaking news: Giant pandas can now fly!
Predicted rumor status: unverified-rumour

Text: A new scientific study shows that drinking coffee can reduce the risk of heart diseases.
Predicted rumor status: non-rumour

Text: The US government has announced a new stimulus package to support small businesses.
Predicted rumor status: false-rumour



In [None]:
import numpy as np

def label_distribution(labels, name):
    unique_labels, counts = np.unique(labels, return_counts=True)
    print(f"{name} label distribution:")
    for label, count in zip(unique_labels, counts):
        print(f"Label {label}: {count}")

# Get the true labels from the validation dataset
true_labels = val_df['Label'].values
label_distribution(true_labels, "Validation dataset")

# Get the predicted labels from the model
pred_labels = np.array([])
model.eval()
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        pred_labels = np.concatenate((pred_labels, preds.cpu().numpy()))

label_distribution(pred_labels, "Predicted labels")
