In [3]:
# Import Libraries
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
import tensorflow as tf
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Load the data
df = pd.read_csv("source data/twitter_human_bots_dataset.csv")

In [5]:
# Check cuda status
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used


In [6]:
# Set seeds for reproducibility
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
                  
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [7]:
missing_descriptions = df['description'].isna().sum()
print(f"Number of rows with NaN in 'description': {missing_descriptions}")

# Count rows where 'description' is NaN and 'label' is 'human'
num_human = df[df['description'].isna() & (df['account_type'] == 'human')].shape[0]

# Count rows where 'description' is NaN and 'label' is 'bot'
num_bot = df[df['description'].isna() & (df['account_type'] == 'bot')].shape[0]

# Print the results
print(f"Number of rows with NaN 'description' and account_type 'human': {num_human}")
print(f"Number of rows with NaN 'description' and account_type 'bot': {num_bot}")

Number of rows with NaN in 'description': 7257
Number of rows with NaN 'description' and account_type 'human': 2911
Number of rows with NaN 'description' and account_type 'bot': 4346


In [8]:
# Replace NaN values in 'description' with a placeholder string
df['description'].fillna('unknown', inplace=True)
print(df['description'].isna().sum())

0


In [17]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Convert account_type from categorical ('bot'/'human') to numerical labels (0/1)
df['account_type'] = df['account_type'].map({'bot': 0, 'human': 1})

# Convert pandas DataFrame to Hugging Face dataset format
dataset = Dataset.from_pandas(df[['description', 'account_type']])

# Perform an 80-20 train-test split with a fixed random seed
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

In [18]:
from datasets import Dataset, DatasetDict

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["description"], padding="max_length", truncation=True, max_length=64)

# Tokenize the training and testing datasets
tokenized_train = train_test_split['train'].map(tokenize_function, batched=True)
tokenized_test = train_test_split['test'].map(tokenize_function, batched=True)

# Rename columns for consistency
tokenized_train = tokenized_train.rename_column("account_type", "labels")
tokenized_test = tokenized_test.rename_column("account_type", "labels")

# Remove the 'description' column from the tokenized datasets
tokenized_train = tokenized_train.remove_columns(["description"])
tokenized_test = tokenized_test.remove_columns(["description"])

# Set format for PyTorch
tokenized_train.set_format("torch", columns=["labels", "input_ids", "token_type_ids", "attention_mask"])
tokenized_test.set_format("torch", columns=["labels", "input_ids", "token_type_ids", "attention_mask"])

# Create a DatasetDict
tokenized_datasets = DatasetDict({
    "train": tokenized_train,
    "test": tokenized_test
})

# Check the structure of the DatasetDict
print(tokenized_datasets)

Map: 100%|██████████| 29950/29950 [00:02<00:00, 10601.12 examples/s]
Map: 100%|██████████| 7488/7488 [00:00<00:00, 14570.50 examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 29950
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7488
    })
})





In [19]:
# Shuffle the dataset with a fixed seed and select a range of examples
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))
eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(200))

In [20]:
from torch.utils.data import DataLoader

batchsize=10
# Create Dataloader
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batchsize)
eval_dataloader = DataLoader(eval_dataset, batch_size=batchsize)

In [21]:
from transformers import AutoModelForMaskedLM
# Because we initialized BertForMaskedLM and concat is with our classifier instead of directly using BertForSequenceClassification
# Some weights of the model checkpoint at bert-base-uncased were not used is within the expectation.
import torch
from torch import nn

class Model(nn.Module):
    def __init__(self,output_dim,dropout_rate):
        super(Model,self).__init__()
        self.encoder=AutoModelForMaskedLM.from_pretrained("bert-base-uncased", output_hidden_states=True, return_dict=True)
        self.dropout=nn.Dropout(dropout_rate)
        # For the "bert-base-uncased" model, each hidden state has a dimension of 768.
        # the value 3072=4*768 corresponds to the total dimension of the concatenated hidden states from the BERT model.
        self.classifier=nn.Linear(3072,output_dim)


    def forward(self,input_ids,token_type_ids,attention_mask):
        outputs = self.encoder(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)

        # The BERT-base-uncased model has 12 hidden layers plus the initial embedding layer.
        # This line concatenates the hidden states from the last four transformer layers from BERT along the last dimension.
        # The shape of the resulting tensor is [batch_size, sequence_length, hidden_dim*4].
        hidden_states = torch.cat(tuple([outputs.hidden_states[i] for i in [-1, -2, -3, -4]]), dim=-1)

        # We are actually extracting the hidden state of the [CLS] token for each sequence in the batch.
        # This [CLS] token's hidden state is typically used as a fixed-size representation of the entire sequence.
        # This representation has been learned during BERT's pretraining to capture important information for various tasks.
        # In the context of classification, you can think of the [CLS] token's hidden state as a summary of the sequence's content.
        # Dropout is applied to this representation and is then fed into the linear classifier to make predictions for the task at hand.
        x=self.dropout(hidden_states[:, 0, :])
        x=self.classifier(x)
        return x

In [22]:
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW

model = Model(output_dim=2, dropout_rate = 0.5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
loss_fct = CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

In [26]:
from transformers import get_scheduler
import evaluate
from tqdm.auto import tqdm

epochs = 5
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )
metric = evaluate.load("accuracy")
progress_bar = tqdm(range(num_training_steps))

  0%|          | 0/1000 [00:00<?, ?it/s]

In [27]:
for epoch in range(epochs):
        for batch in train_dataloader:
            model.train()
            # Loop through batches in the training data loader
            batch = {k: v.to(device) for k, v in batch.items()}
            label_ids = batch['labels'].long()
            input_ids = batch['input_ids']
            token_type_ids = None
            # When using BERT for tasks like single-text classification or sequence labeling, the token_type_ids is an optional parameter, commonly set to None.
            attention_mask = batch['attention_mask']
            # Perform a forward pass through the model to get logits
            logits = model(input_ids, token_type_ids, attention_mask)

            # Calculate the loss using the provided loss function
            loss = loss_fct(logits, label_ids.view(-1))
            # Perform backward pass and update model parameters
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad() # Clear accumulated gradients
            progress_bar.update(1) # Update progress bar

        # Set the model to evaluation mode for validation
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad(): # disable gradient computation
                label_ids = batch['labels']
                input_ids = batch['input_ids']
                token_type_ids = None
                attention_mask = batch['attention_mask']
                logits = model(input_ids, token_type_ids, attention_mask)
                loss = loss_fct(logits, label_ids.view(-1))

            # Get predicted labels by selecting the class with the highest probability
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])

        acc = metric.compute()
        print(f'Epoch {epoch+1}')
        print(f'val_loss : {loss}')
        print(f"val_accuracy: {acc['accuracy'] * 100}")
        print(25*'==')

 20%|██        | 200/1000 [06:02<28:33,  2.14s/it]

Epoch 1
val_loss : 0.6618937253952026
val_accuracy: 72.0


 40%|████      | 400/1000 [14:39<33:04,  3.31s/it]  

Epoch 2
val_loss : 0.7385145425796509
val_accuracy: 79.5


 60%|██████    | 600/1000 [22:00<13:31,  2.03s/it]

Epoch 3
val_loss : 0.7959121465682983
val_accuracy: 76.0


 80%|████████  | 800/1000 [28:53<07:29,  2.25s/it]

Epoch 4
val_loss : 1.0708229541778564
val_accuracy: 79.0


100%|██████████| 1000/1000 [35:54<00:00,  1.93s/it]

Epoch 5
val_loss : 1.0560492277145386
val_accuracy: 79.5
