In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

In [2]:
# Load the pretrained BERT tokenizer and encoder from one shared checkpoint name
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)
# The encoder is only used for feature extraction, so put it in evaluation mode.
# eval() is the cell's last expression, so the notebook displays its return value.
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [3]:
# Define a function for text preprocessing using BERT
def preprocess_text(text):
    """Embed a single text with the module-level BERT model.

    The text is tokenized to exactly 128 positions (truncated or padded),
    passed through ``bert_model`` without gradient tracking, and the final
    hidden states of the real (non-padding) tokens are averaged.

    Args:
        text: The raw string to embed.

    Returns:
        A 1-D NumPy array with one value per hidden dimension of the encoder.
    """
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        # Enable truncation explicitly so that max_length is actually enforced
        # for long inputs instead of relying on tokenizer defaults.
        truncation=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']
    with torch.no_grad():
        outputs = bert_model(input_ids, attention_mask=attention_mask)
    last_hidden_state = outputs.last_hidden_state

    # Masked mean pooling: padding positions are zeroed out and excluded from
    # the denominator, so short texts are not averaged together with [PAD]
    # vectors that carry no content.
    mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
    summed = (last_hidden_state * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against division by zero
    embeddings = summed / token_counts

    # Drop the batch dimension (a single text was encoded).
    return embeddings.squeeze(0).numpy()

In [4]:

# Read the three pre-split CSV files (train / validation / test) in one pass
train_df, val_df, test_df = map(
    pd.read_csv,
    ("train_IFND.csv", "val_IFND.csv", "test_IFND.csv"),
)

In [5]:

# Apply feature extraction to the datasets
def extract_features_sequential(df):
    """Attach BERT text embeddings to a copy of ``df``.

    Each value of the ``Statement`` column is embedded, one row at a time,
    with ``preprocess_text``.

    Args:
        df: DataFrame with a ``Statement`` column.

    Returns:
        A new DataFrame with two added columns: ``Text_Embedding`` (the
        per-row embedding) and ``Combined_Feature`` (currently the same
        values, since only text features are used). The input frame is left
        untouched so the cell can be re-run safely.
    """
    # Work on a copy: mutating the caller's frame makes the notebook depend on
    # how many times this cell has been executed.
    featured = df.copy()
    # Iterate the column directly; iterrows() would build a full Series per row
    # just to read a single field.
    text_embeddings = [preprocess_text(text) for text in featured['Statement']]
    featured['Text_Embedding'] = text_embeddings
    featured['Combined_Feature'] = featured['Text_Embedding']  # Only using text features
    return featured

In [6]:
# Run feature extraction on every split and rebind each name to the returned frame
train_df, val_df, test_df = (
    extract_features_sequential(split_df)
    for split_df in (train_df, val_df, test_df)
)

In [None]:
# Convert combined features and labels to numpy arrays
def prepare_dataset(df):
    """Turn a feature-augmented DataFrame into ``(X, y)`` NumPy arrays.

    Args:
        df: DataFrame with a ``Combined_Feature`` column (one equal-length
            array per row) and a ``Label`` column.

    Returns:
        Tuple ``(X, y)``: ``X`` stacks the per-row feature arrays along a new
        first axis; ``y`` is 1 for rows labelled true and 0 for everything
        else.
    """
    def _to_binary(label):
        # pandas.read_csv parses a column made up only of TRUE/FALSE into
        # booleans; comparing those against the string 'TRUE' would silently
        # label every row 0.
        if isinstance(label, (bool, np.bool_)):
            return int(label)
        # Tolerate surrounding whitespace and case differences in string labels.
        return 1 if str(label).strip().upper() == 'TRUE' else 0

    X = np.stack(df['Combined_Feature'].values)
    y = df['Label'].apply(_to_binary).values
    return X, y

# Build the feature matrix and label vector for each split
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    prepare_dataset(split_df) for split_df in (train_df, val_df, test_df)
)

In [None]:
# Wrap each split in a TensorDataset and expose it through a DataLoader
def _to_dataset(features, targets):
    """Pair a feature array with its label array as float32 / int64 tensors."""
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(targets, dtype=torch.long),
    )

train_data = _to_dataset(X_train, y_train)
val_data = _to_dataset(X_val, y_val)
test_data = _to_dataset(X_test, y_test)

# Only the training batches are shuffled; validation and test keep file order
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)


In [None]:
# Define the classifier
class TextClassifier(nn.Module):
    """Feed-forward classification head over fixed-size feature vectors.

    Architecture: Linear -> ReLU -> Dropout -> Linear. The layers live in
    ``self.fc`` in that order, so parameter names are the same for any
    choice of sizes.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of the hidden layer (default 128).
        num_classes: Number of output logits (default 2, binary classification).
        dropout: Dropout probability applied after the ReLU (default 0.5).
    """

    def __init__(self, input_dim, hidden_dim=128, num_classes=2, dropout=0.5):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        """Return unnormalised class scores (logits) for the batch ``x``."""
        return self.fc(x)

# Initialize the model, loss function, and optimizer
# Seed PyTorch's global RNG first so that weight initialisation, dropout masks
# and (by default) DataLoader shuffling repeat across Restart & Run All.
torch.manual_seed(42)
input_dim = X_train.shape[1]  # feature width taken from the training matrix
model = TextClassifier(input_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop with early stopping
num_epochs = 10
best_val_loss = float('inf')
patience = 2          # epochs without validation improvement tolerated before stopping
patience_counter = 0
best_state = None     # snapshot of the weights from the best validation epoch

for epoch in range(num_epochs):
    # ---- one optimisation pass over the training batches ----
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # ---- validation pass ----
    model.eval()
    val_loss_sum = 0.0
    val_correct = 0
    val_seen = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            batch_size = labels.size(0)
            # The criterion yields a per-batch mean; weight it by batch size so
            # a short final batch does not count as much as a full one.
            val_loss_sum += loss.item() * batch_size
            val_correct += (outputs.argmax(dim=1) == labels).sum().item()
            val_seen += batch_size

    # Per-sample averages over the whole validation set
    val_loss = val_loss_sum / val_seen
    val_accuracy = val_correct / val_seen
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}')
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Clone the tensors: state_dict() returns references that later
        # optimiser steps would otherwise overwrite.
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Continue with the best validation checkpoint rather than the weights left
# behind by the last (possibly degraded) epoch.
if best_state is not None:
    model.load_state_dict(best_state)

# Evaluate on the test set
model.eval()
test_loss_sum = 0.0
test_correct = 0
test_seen = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        batch_size = labels.size(0)
        # Accumulate per-sample totals: averaging per-batch means would give a
        # short final batch the same weight as a full one.
        test_loss_sum += loss.item() * batch_size
        test_correct += (outputs.argmax(dim=1) == labels).sum().item()
        test_seen += batch_size

# Per-sample averages over the whole test set
test_loss = test_loss_sum / test_seen
test_accuracy = test_correct / test_seen
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

In [None]:
from sklearn.metrics import classification_report, roc_auc_score
import torch

# Pick the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Inference only: evaluation mode, no gradient tracking
model.eval()

# Per-sample predictions and ground-truth labels, gathered batch by batch
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in test_loader:
        # Put the batch on the same device as the model
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass produces one score per class
        outputs = model(inputs)

        # Predicted class = index of the highest score in each row
        preds = outputs.argmax(dim=1)

        # Bring results back to the CPU and store them element-wise
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Text report of the per-class metrics, rows named FALSE (0) and TRUE (1)
class_report = classification_report(all_labels, all_preds, target_names=['FALSE', 'TRUE'])

# Function to colorize classification report
def colorize_report(report):
    """Return ``report`` with the per-class rows wrapped in ANSI bold codes.

    Args:
        report: Multi-line text report, e.g. the string produced by
            ``classification_report``.

    Returns:
        The report with blank lines removed and every row whose first word
        starts with ``FALSE`` or ``TRUE`` surrounded by ``\\033[1m`` /
        ``\\033[0m``. All other lines are passed through unchanged.
    """
    new_report = []
    for line in report.split('\n'):
        if not line.strip():
            continue  # drop blank separator lines
        # Row labels in the report are preceded by padding spaces, so the
        # class name must be matched after stripping the left margin;
        # matching at column 0 never highlights anything.
        if line.lstrip().startswith(('FALSE', 'TRUE')):
            new_report.append('\033[1m' + line + '\033[0m')
        else:
            new_report.append(line)
    return '\n'.join(new_report)

# Colorize classification report
colorized_report = colorize_report(class_report)

# Print colorized classification report
print("Classification Report:")
print(colorized_report)

# Calculate ROC-AUC score
# ROC-AUC measures how well a continuous score ranks positives above negatives.
# Feeding it hard 0/1 predictions collapses the curve to a single operating
# point, so score each test sample with the softmax probability of class 1.
roc_labels = []
roc_scores = []
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs.to(device))
        positive_prob = torch.softmax(outputs, dim=1)[:, 1]
        roc_scores.extend(positive_prob.cpu().numpy())
        roc_labels.extend(labels.cpu().numpy())

roc_auc = roc_auc_score(roc_labels, roc_scores)
print(f"ROC-AUC Score: {roc_auc:.4f}")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Calculate confusion matrix
# Pin the class order to [0, 1] so the matrix is always 2x2 and lines up with
# the two tick labels below, even if one class never appears in the test
# labels or predictions.
conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

# Define labels (index 0 -> FALSE, index 1 -> TRUE)
labels = ['FALSE', 'TRUE']

# Create seaborn heatmap
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.2)  # Adjust font size
# Keep the returned Axes so titles and labels are set on this plot explicitly
ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)

# Add labels and title (rows are true classes, columns are predicted classes)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')

# Display the plot
plt.show()
