In [8]:
!pip install transformers
!pip install transformers nlpaug  # Install nlpaug


import warnings
warnings.filterwarnings('ignore')  # "error", "ignore", "always", "default", "module" or "once"

# Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import BertModel
from transformers import BertForSequenceClassification

# Import nlpaug modules (New)
import nlpaug.augmenter.word as naw

# Create the augment_text function (New)
def augment_text(text, method='synonym', n=1):
    if method == 'synonym':
        augmenter = naw.SynonymAug(aug_src='wordnet')
    elif method == 'swap':
        augmenter = naw.RandomWordAug(action="swap")
    elif method == 'delete':
        augmenter = naw.RandomWordAug()
    else:
        raise ValueError("Invalid augmentation method")
    
    augmented_texts = augmenter.augment(text, n=n)
    return [text] + augmented_texts

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
file_path = 'Twitter15Dataset.csv'
df = pd.read_csv(file_path, names=["Label", "SourceID", "Tweet"], header=0)


# Define preprocessing functions
def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_special_characters(text):
    return re.sub(r'\W', ' ', text)

def remove_mentions(text):
    return re.sub(r'@\w+', '', text)

def tokenize(text):
    return text.split()

def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word.lower() not in stop_words]

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

# Preprocess the dataset
df['Tweet'] = df['Tweet'].apply(remove_urls)
df['Tweet'] = df['Tweet'].apply(remove_mentions)
df['Tweet'] = df['Tweet'].apply(remove_special_characters)
df['Tweet'] = df['Tweet'].apply(tokenize)
df['Tweet'] = df['Tweet'].apply(remove_stopwords)
df['Tweet'] = df['Tweet'].apply(lemmatize)
df['Tweet'] = df['Tweet'].apply(lambda tokens: ' '.join(tokens))

# Encode the labels
label_mapping = {'unverified': 0, 'non-rumor': 1, 'TRUE': 2, 'FALSE': 3}
df['Label'] = df['Label'].map(label_mapping)

# Add the line to remove rows with missing labels
df = df.dropna(subset=['Label'])

# Split the dataset
train_df, val_df = train_test_split(df, test_size=0.4, random_state=42, stratify=df['Label'])

# Apply data augmentation
n_augmentations = 2
train_df['Augmented_Tweet'] = train_df['Tweet'].apply(lambda x: augment_text(x, method='synonym', n=n_augmentations))
train_df = train_df.explode('Augmented_Tweet').reset_index(drop=True)

# Save the preprocessed datasets to CSV files
train_df.to_csv('train_preprocessed.csv', index=False)
val_df.to_csv('val_preprocessed.csv', index=False)

# Vectorize the preprocessed text
# Create TF-IDF feature vectors
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['Augmented_Tweet'])
X_val = vectorizer.transform(val_df['Tweet'])
y_train = train_df['Label']
y_val = val_df['Label']


# Train and evaluate the SVM model
svm_model = SVC(kernel='linear', C=1, probability=True, class_weight='balanced')
svm_model.fit(X_train, y_train)
svm_y_pred = svm_model.predict(X_val)

svm_accuracy = accuracy_score(y_val, svm_y_pred)
svm_precision = precision_score(y_val, svm_y_pred, average='weighted')
svm_recall = recall_score(y_val, svm_y_pred, average='weighted')
svm_f1 = f1_score(y_val, svm_y_pred, average='weighted')

print("SVM Model:")
print(f"Accuracy: {svm_accuracy:.4f}")
print(f"Precision: {svm_precision:.4f}")
print(f"Recall: {svm_recall:.4f}")
print(f"F1 Score: {svm_f1:.4f}")

# Train and evaluate the Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_y_pred = lr_model.predict(X_val)

lr_accuracy = accuracy_score(y_val, lr_y_pred)
lr_precision = precision_score(y_val, lr_y_pred, average='weighted')
lr_recall = recall_score(y_val, lr_y_pred, average='weighted')
lr_f1 = f1_score(y_val, lr_y_pred, average='weighted')


print("Logistic Regression Model:")
print(f"Accuracy: {lr_accuracy:.4f}")
print(f"Precision: {lr_precision:.4f}")
print(f"Recall: {lr_recall:.4f}")
print(f"F1 Score: {lr_f1:.4f}")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


SVM Model:
Accuracy: 0.8255
Precision: 0.8302
Recall: 0.8255
F1 Score: 0.8271
Logistic Regression Model:
Accuracy: 0.8238
Precision: 0.8289
Recall: 0.8238
F1 Score: 0.8253


In [16]:
def evaluate_SVMandLRmodel(model_name, y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=None)  # Changed from 'weighted' to None
    recall = recall_score(y_true, y_pred, average=None)  # Changed from 'weighted' to None
    f1 = f1_score(y_true, y_pred, average=None)  # Changed from 'weighted' to None

    print(f"{model_name} Model:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)

    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }
    return metrics


# Evaluate the SVM model
print("\nSVM Model Evaluation:")
svm_metrics = evaluate_SVMandLRmodel("SVM", y_val, svm_y_pred)

# Evaluate the Logistic Regression model
print("\nLogistic Regression Model Evaluation:")
lr_metrics = evaluate_SVMandLRmodel("Logistic Regression", y_val, lr_y_pred)



SVM Model Evaluation:
SVM Model:
Accuracy: 0.8255
Precision:  [0.82638889 0.71084337 0.93661972 0.84722222]
Recall:  [0.79333333 0.79194631 0.89261745 0.82432432]
F1 Score:  [0.80952381 0.74920635 0.91408935 0.83561644]

Logistic Regression Model Evaluation:
Logistic Regression Model:
Accuracy: 0.8238
Precision:  [0.80666667 0.71084337 0.91724138 0.88148148]
Recall:  [0.80666667 0.79194631 0.89261745 0.80405405]
F1 Score:  [0.80666667 0.74920635 0.9047619  0.8409894 ]


In [9]:
# Import necessary libraries
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup



# Define the RumorDataset class
class RumorDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['Tweet']
        label = self.data.iloc[idx]['Label']
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(int(label), dtype=torch.long)
        }

# Load the ConfliBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("snowood1/ConfliBERT-scr-uncased")
model = AutoModelForSequenceClassification.from_pretrained("snowood1/ConfliBERT-scr-uncased", num_labels=4) 


# Define constants
MAX_LENGTH = 128
BATCH_SIZE = 32

# Create DataLoader for the training and validation datasets
train_dataset = RumorDataset(train_df, tokenizer, MAX_LENGTH)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataset = RumorDataset(val_df, tokenizer, MAX_LENGTH)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

Some weights of the model checkpoint at snowood1/ConfliBERT-scr-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at snowood1/Conf

In [10]:
import torch.nn as nn
from sklearn.metrics import classification_report

def train_model(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask)
        loss = nn.CrossEntropyLoss()(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

def eval_model(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    loss_fn = nn.CrossEntropyLoss()
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.detach().cpu().numpy())
            all_labels.extend(labels.detach().cpu().numpy())

    return total_loss / len(dataloader), classification_report(all_labels, all_preds, output_dict=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

EPOCHS = 12
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 0.1
WARMUP_STEPS = 200

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=EPOCHS * len(train_dataloader))

for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    train_loss = train_model(model, train_dataloader, optimizer, scheduler, device)
    val_loss, val_report = eval_model(model, val_dataloader, device)
    print(f"Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")
    print(f"Val report: {val_report}")

# Save the model
output_dir = "rumor_detection_model"
tokenizer.save_pretrained(output_dir)
model.save_pretrained(output_dir)

Epoch 1/12
Train loss: 1.4093, Val loss: 1.3906
Val report: {'0': {'precision': 0.2222222222222222, 'recall': 0.4533333333333333, 'f1-score': 0.2982456140350877, 'support': 150}, '1': {'precision': 0.2553191489361702, 'recall': 0.24161073825503357, 'f1-score': 0.2482758620689655, 'support': 149}, '2': {'precision': 0.3950617283950617, 'recall': 0.21476510067114093, 'f1-score': 0.27826086956521734, 'support': 149}, '3': {'precision': 0.29411764705882354, 'recall': 0.13513513513513514, 'f1-score': 0.18518518518518517, 'support': 148}, 'accuracy': 0.26174496644295303, 'macro avg': {'precision': 0.29168018665306944, 'recall': 0.26121107684866074, 'f1-score': 0.25249188271361395, 'support': 596}, 'weighted avg': {'precision': 0.29155955674562545, 'recall': 0.26174496644295303, 'f1-score': 0.2526815814197379, 'support': 596}}
Epoch 2/12
Train loss: 1.3822, Val loss: 1.3374
Val report: {'0': {'precision': 0.26282051282051283, 'recall': 0.2733333333333333, 'f1-score': 0.26797385620915026, 'sup

In [11]:
import torch
from transformers import AutoModel, AutoTokenizer

# Load the pretrained model
output_dir = "rumor_detection_model"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(output_dir)

# Load the trained model
model = BertForSequenceClassification.from_pretrained(output_dir)
model.to(device)
model.eval()

# Define prediction function
def predict_rumor_status(text, model, tokenizer, max_length):
    # Encode the input text
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Make prediction
    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

    # Map predictions to labels
    label_mapping = {
        0: "true-rumour",
        1: "false-rumour",
        2: "unverified-rumour",
        3: "non-rumour",
    }
    return label_mapping[preds.item()]

# Test the prediction function with an example text
example_text = "Breaking news: Giant pandas can now fly!"
predicted_status = predict_rumor_status(example_text, model, tokenizer, MAX_LENGTH)
print(f"Predicted rumor status: {predicted_status}")

Predicted rumor status: unverified-rumour


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


def evaluate_model(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.detach().cpu().numpy())
            all_labels.extend(labels.detach().cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print("Evaluation:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Per label performance:")
    print(classification_report(all_labels, all_preds))
    
# Evaluate the model
evaluate_model(model, val_dataloader, device)


Evaluation:
Accuracy: 0.7869
Precision: 0.7871
Recall: 0.7869
F1 Score: 0.7853
Per label performance:
              precision    recall  f1-score   support

           0       0.79      0.74      0.76       150
           1       0.75      0.74      0.75       149
           2       0.79      0.92      0.85       149
           3       0.82      0.75      0.78       148

    accuracy                           0.79       596
   macro avg       0.79      0.79      0.79       596
weighted avg       0.79      0.79      0.79       596



In [None]:
example_texts = [
    "Breaking news: Giant pandas can now fly!",
    "A new scientific study shows that drinking coffee can reduce the risk of heart diseases.",
    "The US government has announced a new stimulus package to support small businesses.",
]

for text in example_texts:
    predicted_status = predict_rumor_status(text, model, tokenizer, MAX_LENGTH)
    print(f"Text: {text}\nPredicted rumor status: {predicted_status}\n")


Text: Breaking news: Giant pandas can now fly!
Predicted rumor status: false-rumour

Text: A new scientific study shows that drinking coffee can reduce the risk of heart diseases.
Predicted rumor status: false-rumour

Text: The US government has announced a new stimulus package to support small businesses.
Predicted rumor status: false-rumour



In [13]:
import numpy as np

def label_distribution(labels, name):
    unique_labels, counts = np.unique(labels, return_counts=True)
    print(f"{name} label distribution:")
    for label, count in zip(unique_labels, counts):
        print(f"Label {label}: {count}")

# Get the true labels from the validation dataset
true_labels = val_df['Label'].values
label_distribution(true_labels, "Validation dataset")

# Get the predicted labels from the model
pred_labels = np.array([])
model.eval()
with torch.no_grad():
    for batch in val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        pred_labels = np.concatenate((pred_labels, preds.cpu().numpy()))

label_distribution(pred_labels, "Predicted labels")


Validation dataset label distribution:
Label 0: 150
Label 1: 149
Label 2: 149
Label 3: 148
Predicted labels label distribution:
Label 0.0: 141
Label 1.0: 146
Label 2.0: 173
Label 3.0: 136


In [14]:
class EnsembleClassifier:
    def __init__(self, svm_model, lr_model, confliBERT_model, tokenizer, device):
        self.svm_model = svm_model
        self.lr_model = lr_model
        self.confliBERT_model = confliBERT_model
        self.tokenizer = tokenizer
        self.device = device

    def _predict_confliBERT(self, text, max_length=128):
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        with torch.no_grad():
            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)
            outputs = self.confliBERT_model(input_ids, attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
        return preds.item()

    def predict(self, X):
        # For ConfliBERT predictions
        confliBERT_preds = [self._predict_confliBERT(text) for text in X]

        # For SVM and LR predictions
        X_tfidf = vectorizer.transform(X)
        svm_preds = self.svm_model.predict_proba(X_tfidf)
        lr_preds = self.lr_model.predict_proba(X_tfidf)

        # Combine predictions
        ensemble_preds = []
        for i in range(len(X)):
            avg_preds = (svm_preds[i] + lr_preds[i]) / 2
            avg_preds[confliBERT_preds[i]] += 0.5
            ensemble_preds.append(np.argmax(avg_preds))

        return np.array(ensemble_preds)


In [15]:
# Train and evaluate an ensemble model using SVM, Logistic Regression, and ConfliBERT
ensemble_model = EnsembleClassifier(svm_model, lr_model, model, tokenizer, device)

ensemble_y_pred = ensemble_model.predict(val_df['Tweet'])

ensemble_accuracy = accuracy_score(y_val, ensemble_y_pred)
ensemble_precision = precision_score(y_val, ensemble_y_pred, average='weighted')
ensemble_recall = recall_score(y_val, ensemble_y_pred, average='weighted')
ensemble_f1 = f1_score(y_val, ensemble_y_pred, average='weighted')

print("Ensemble Model (SVM + Logistic Regression + ConfliBERT):")
print(f"Accuracy: {ensemble_accuracy:.4f}")
print(f"Precision: {ensemble_precision:.4f}")
print(f"Recall: {ensemble_recall:.4f}")
print(f"F1 Score: {ensemble_f1:.4f}")


Ensemble Model (SVM + Logistic Regression + ConfliBERT):
Accuracy: 0.8440
Precision: 0.8447
Recall: 0.8440
F1 Score: 0.8435
