In [8]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import tensorflow as tf
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [13]:
# Load Data
df = pd.read_csv("source data/twitter_human_bots_dataset.csv")

In [14]:
# Convert 'created_at' to datetime
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

# Calculate Account Age
current_time = datetime.now()
df['account_age_days'] = (current_time - df['created_at']).dt.days

# Time-Based Features
df['creation_hour'] = df['created_at'].dt.hour
df['creation_day_of_week'] = df['created_at'].dt.dayofweek
df['creation_month'] = df['created_at'].dt.month
df['creation_year'] = df['created_at'].dt.year
df['creation_quarter'] = df['created_at'].dt.quarter
df['is_weekend'] = df['creation_day_of_week'] >= 5
df['creation_week_of_year'] = df['created_at'].dt.isocalendar().week
df['is_beginning_of_month'] = df['created_at'].dt.day <= 5
df['is_end_of_month'] = df['created_at'].dt.day >= 26

# Define part of day based on hour
def part_of_day(hour):
    if 5 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    elif 17 <= hour < 21:
        return 'Evening'
    else:
        return 'Night'

df['part_of_day'] = df['creation_hour'].apply(part_of_day)

# Additional Features
humans_mean = df[df['account_type'] == 'human']['average_tweets_per_day'].mean()
humans_std = df[df['account_type'] == 'human']['average_tweets_per_day'].std()
df['deviation_from_humans'] = (df['average_tweets_per_day'] - humans_mean) / humans_std

# Description Length Feature
df['description_length'] = df['description'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)

# Followers/Friends Ratios
df['followers_to_friends_ratio'] = df['followers_count'] / df['friends_count']
df['followers_to_friends_ratio'].fillna(0, inplace=True)

# Followers to Tweets Per Day Ratio
df['followers_to_tweets_per_day_ratio'] = df['followers_count'] / df['average_tweets_per_day']
df['followers_to_tweets_per_day_ratio'].fillna(0, inplace=True)

# Mentions Count in Description
import re

def extract_mentions(description):
    return re.findall(r'@\w+', str(description))

df['mentions'] = df['description'].apply(extract_mentions)
df['mention_count'] = df['mentions'].apply(len)

# Ensure any remaining NaN values are filled if necessary
########################################################################
# TO DISCUSS METHOD OF IMPUTATION
########################################################################
df.fillna(0, inplace=True)
df.replace(np.inf,0, inplace = True)

In [15]:
# Encoding Categorical Features
df['account_type'] = df['account_type'].map({'human': 0, 'bot': 1})

encode_cols = ['default_profile', 'default_profile_image', 'geo_enabled', 'lang', 'location', 'verified',
               'creation_year', 'is_weekend', 'is_beginning_of_month', 'is_end_of_month', 'part_of_day']

label_encoder = LabelEncoder()
for col in encode_cols:
    df[col] = label_encoder.fit_transform(df[col].astype(str))  # Convert to string to handle NaNs if any


# Define Feature Columns and Target
id_col = ['id']
labels = ['account_type']
predictive_cols = ['default_profile', 'default_profile_image', 'favourites_count', 'followers_count', 'friends_count',
                   'geo_enabled', 'lang', 'location', 'statuses_count', 'verified', 'average_tweets_per_day', 
                   'account_age_days', 'creation_hour', 'creation_day_of_week', 'creation_month', 'creation_year',
                   'creation_quarter', 'is_weekend', 'creation_week_of_year', 'is_beginning_of_month', 
                   'is_end_of_month', 'part_of_day', 'deviation_from_humans', 'description_length', 
                   'followers_to_friends_ratio', 'followers_to_tweets_per_day_ratio', 'mention_count','account_type']

print(df.describe())

         Unnamed: 0                     created_at  default_profile  \
count  37438.000000                          37438     37438.000000   
mean   18718.500000  2012-05-10 19:06:11.558710528         0.419894   
min        0.000000            2006-07-05 19:52:46         0.000000   
25%     9359.250000  2009-12-26 20:54:38.750000128         0.000000   
50%    18718.500000            2011-10-27 02:04:41         0.000000   
75%    28077.750000            2014-04-16 15:39:40         1.000000   
max    37437.000000            2019-04-24 08:53:21         1.000000   
std    10807.564026                            NaN         0.493548   

       default_profile_image  favourites_count  followers_count  \
count           37438.000000      37438.000000     3.743800e+04   
mean                0.014905      12302.062183     3.703098e+05   
min                 0.000000          0.000000     0.000000e+00   
25%                 0.000000        362.000000     3.500000e+01   
50%                 0.000

### Tokenization

In [16]:
## Tokenize 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def process_text(text):
    # Check if the input is not a string, return an empty string if so
    if not isinstance(text, str):
        return ""
    
    # Replace links with {link} and videos with [video]
    text = re.sub(r'{link}', '', text)
    text = re.sub(r"\[video\]", '', text)
    
    # Convert to Lowercase
    text = text.lower()

    return text

# Process text
texts = df['description'].apply(lambda x: process_text(x))

vocab_size = 14225  # You can adjust this based on your dataset size and memory limits
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts)

# Tokenize text
sequences = tokenizer.texts_to_sequences(texts)

# 4. Pad sequences to ensure they all have the same length
max_length = 50  # Adjust based on typical length of descriptions in your dataset
data = pad_sequences(sequences, maxlen=max_length, padding='post')

### Split text and numerical data

In [17]:
y = df['account_type']
x_text = data
# removed target and description columns, and a few other text columns
x_num = df.drop(columns=[
    'description', 'account_type',
    'created_at', 'profile_background_image_url', 'profile_image_url', 'screen_name', 'mentions'])

### Split into training and testing sets

In [18]:
# Separate into train and test data
from sklearn.model_selection import train_test_split

x_text_train, x_text_test, x_num_train, x_num_test, y_train, y_test = train_test_split(
    x_text, x_num, y, test_size=0.2, random_state=42)

### Create DataLoader

In [19]:
import torch
from torch.utils.data import Dataset, DataLoader

class TwitterDataset(Dataset):
    def __init__(self, text_data, num_data, labels):
        self.text_data = torch.tensor(text_data, dtype=torch.long)  # Text as long integers
        self.num_data = torch.tensor(num_data.astype('float32').values, dtype=torch.float32)  # Numerical features as float
        self.labels = torch.tensor(labels.values, dtype=torch.long)  # Labels as long (for classification)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'text': self.text_data[idx],
            'numerical': self.num_data[idx],
            'label': self.labels[idx]
        }
    
# Initialize Dataset and DataLoader
train_dataset = TwitterDataset(x_text_train, x_num_train, y_train)
test_dataset = TwitterDataset(x_text_test, x_num_test, y_test)

# # Define DataLoader for train and test sets
batch_size = 512 # 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

### Neural Network Model

In [20]:
import torch.nn as nn

class TwitterBotDetector(nn.Module):
    def __init__(self, num_numerical_features, embedding_dim, hidden_dim):
        super(TwitterBotDetector, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.fc1 = nn.Linear(embedding_dim + num_numerical_features, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)  # Output layer for binary classification
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x_text, x_num):
        x_embed = self.embedding(x_text)
        x_embed = x_embed.mean(dim=1)  # Average embeddings over the sequence
        x = torch.cat((x_embed, x_num), dim=1)  # Concatenate text and numerical features
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.sigmoid(x)  # Sigmoid for binary classification

### Model Training

In [21]:
model = TwitterBotDetector(num_numerical_features=x_num.shape[1], 
                           embedding_dim=50, 
                           hidden_dim=128)

criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    for batch in train_loader:
        text_batch = batch['text']  # Access text data
        num_batch = batch['numerical']  # Access numerical data
        labels = batch['label']  # Access labels

        optimizer.zero_grad()  # Zero the gradients
        outputs = model(text_batch, num_batch)  # Forward pass
        loss = criterion(outputs, labels.float().view(-1, 1))  # Calculate loss
        
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/5], Loss: 35.4331
Epoch [2/5], Loss: 30.3150
Epoch [3/5], Loss: 37.4016
Epoch [4/5], Loss: 29.1339
Epoch [5/5], Loss: 37.4016


### Model Evaluation

In [22]:
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    total = 0
    correct = 0
    for batch in test_loader:
        text_batch = batch['text']  # Access text data
        num_batch = batch['numerical']  # Access numerical data
        labels = batch['label']  # Access labels

        outputs = model(text_batch, num_batch)
        predicted = (outputs > 0.5).float()  # Thresholding to get predictions
        total += labels.size(0)
        correct += (predicted.view(-1) == labels).sum().item()

    accuracy = correct / total
    print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.6679
