# Importing dependencies

In [6]:
import pandas as pd
from pymongo import MongoClient
import numpy as np 
import pandas as pd
import csv
from pandas import json_normalize

# 1) Tweets Sampling from the tweet dataset

## 1.1 Querying from MongoDB
Due to the limitations of the computational power of our devices, we have decided to split the 8 tweet datasets up amongst ourselves (our group). The script below depicts the sequential steps to take in order to derive the final graph and user datasets for each of the tweet files.

Due to the sheer size of the datasets, we had to utilise MongoDB to assist in the querying and sampling of the data that was required.

In [7]:
# Change your db and collection name accordingly
hostname = 'localhost'
port = 27017  
client = MongoClient(hostname, port)
db = client['BT4222'] #Change your db accordingly
tweets = db['Tweets_0'] #change your tweet data accordingly

In [8]:
# Filter for all retweets that are in English and extract author_ids
pipeline = [
    {'$match': {'lang': 'en', 'text': {'$regex':'^RT'}}},
    {'$project': {'_id': 0, 'author_id': 1}}
]
user_cursor = tweets.aggregate(pipeline)

In [9]:
#Identifying the number of unique author_ids
unique_ids = set()
for record in user_cursor:
    unique_ids.add(record['author_id'])

unique_count = len(unique_ids)
print("Number of unique author_id:", unique_count)

Number of unique author_id: 0


## 1.2 Load the labelled dataset `label.csv` to find out which users are bots and humans

In [10]:
# load the label dataset
labels = pd.read_csv("label.csv")
labels["id"] = labels["id"].str.replace("u","")
labels["id"] = labels["id"].astype("int64")

# convert label to human: 0, bot: 1
labels["label"] = labels["label"].str.replace("human","0")
labels["label"] = labels["label"].str.replace("bot","1")
labels["label"] = labels["label"].astype("int")

labels.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/hengboonlong/Library/CloudStorage/OneDrive-NationalUniversityofSingapore/Twibot-22/label.csv'

## 1.3 Engage in Disproportionate Stratified Sampling to get equal number of bots and humans
Clearly, the bot data present in the dataset is very limited and this has led to a huge imbalance within the data set. To answer the severe underrepresentation of bots within the dataset, we have decided to utilise a disproportionate stratified sampling to obtain an equal number of bots and humans.

### Limitations 
- Risk of underrepresentation of majority class (Human Dataset) / eliminating too much information from the Human Dataset

### Mitigating Measures
- Due to the comprehensiveness and size of our dataset, we were still able to retain 50,000 human ids and thus, we believe that the 50,000 human ids can adequately represent the human population.

In [None]:
# Identifying bot and human ids
bot_ids = []
human_ids = []

for id in unique_ids:
    try:
        if (labels[labels["id"]==id]["label"].values[0] == 0):
            human_ids.append(id)
        elif (labels[labels["id"]==id]["label"].values[0] == 1):
            bot_ids.append(id)
            
    # if id doesnt exist in label dataset
    except:
        continue
        
print(f"Number of bots: {len(bot_ids)}") 
print(f"Number of humans: {len(human_ids)}")

Number of bots: 11427
Number of humans: 173822


In [None]:
from math import ceil
from random import sample

# Sample 80% of the minority class (bots)
n_samples_per_class = ceil(0.8 * len(bot_ids))
print(f"Number of samples per class: {n_samples_per_class}")

sampled_human_ids = sample(human_ids, n_samples_per_class)
sampled_bot_ids = sample(bot_ids, n_samples_per_class)
sampled_ids = sampled_human_ids + sampled_bot_ids

print(f"Number of sampled bots: {len(sampled_bot_ids)}") 
print(f"Number of sampled humans: {len(sampled_human_ids)}")
print(f"Number of sampled ids: {len(sampled_ids)}")

Number of samples per class: 9142
Number of sampled bots: 9142
Number of sampled humans: 9142
Number of sampled ids: 18284


## 1.4 Extract all tweets from MongoDB where the authors or tweet mentions are in sampled_ids

In [None]:
pipeline = [
    {'$match': {'$or': [{'author_id': {'$in': sampled_ids}}, {'entities.user_mention.id': {'$in': sampled_ids}}]}},
    {'$project': {
        '_id': 0, 
        'author_id': 1, 
        'conversation_id': 1, 
        'created_at': 1, 
        'id': 1, 
        'text': 1, 
        'user_mentions_count': {
            '$cond': {
                'if': {'$isArray': "$entities.user_mentions"},
                'then': {'$size': "$entities.user_mentions"},
                'else': 0
            }
        },
        'hashtags_count': {
            '$cond': {
                'if': {'$isArray': "$entities.hashtags"},
                'then': {'$size': "$entities.hashtags"},
                'else': 0
            }
        },
        'symbols_count': {
            '$cond': {
                'if': {'$isArray': "$entities.symbols"},
                'then': {'$size': "$entities.symbols"},
                'else': 0
            }
        },
        'urls_count': {
            '$cond': {
                'if': {'$isArray': "$entities.urls"},
                'then': {'$size': "$entities.urls"},
                'else': 0
            }
        },
        'mentioned_user_id': {'$arrayElemAt': ["$entities.user_mentions.id", 0]},
        'in_reply_to_user_id': 1, 
        'retweet_count': '$public_metrics.retweet_count', 
        'reply_count': '$public_metrics.reply_count', 
        'like_count': '$public_metrics.like_count', 
        'quote_count': '$public_metrics.quote_count'
    }}
]

tweet_cursor = tweets.aggregate(pipeline)

In [None]:
tweets_df = pd.DataFrame(list(tweet_cursor))
tweets_df.head()

Unnamed: 0,author_id,conversation_id,created_at,id,in_reply_to_user_id,text,user_mentions_count,hashtags_count,symbols_count,urls_count,retweet_count,reply_count,like_count,quote_count,mentioned_user_id
0,3971456121,1496841052032012288,2022-02-24 13:34:51+00:00,t1496841052032012293,,💥Today at 5PM CET - the first ERA Journal Club...,0,0,0,1,1,,0,,
1,3971456121,1496753536889204736,2022-02-24 07:47:06+00:00,t1496753536889204736,,🆕European Renal Best Practice endorsement of\n...,4,0,0,1,11,,7,,20373510.0
2,3971456121,1496407866793775104,2022-02-23 08:53:32+00:00,t1496407866793775111,,📣NEW Editorial:\nWhat the seminal experience o...,2,0,0,1,0,,1,,81876080.0
3,3971456121,1496142253743644672,2022-02-22 15:18:04+00:00,t1496142253743644684,,🚨Only a few days left to check out our Article...,1,0,0,1,11,,21,,4836129000.0
4,3971456121,1495740814122754048,2022-02-21 12:42:54+00:00,t1495740814122754048,,RT @Stones__: Another #AnimalHouse pearl court...,3,3,0,0,4,,0,,1313196000.0


## 1.5 Data Preprocessing

### Add a new column `relationship` to classify tweet as a `retweet` \ `reply` \ `post`
- If a tweet starts with "RT", it is a retweet
- If a tweet has a "in_reply_to_user_id" that is non-null, it is a reply to the said user_id
- Else, it will be a normal post

In [None]:
# Add a new column "relationship" -> retweet/reply/post
def add_relationship(df):
    if (df["text"][:2] == "RT"):
        return "Retweet"
    elif (pd.notna(df["in_reply_to_user_id"])):
        return "Reply"
    else:
        return "Post"

tweets_df["relationship"] = tweets_df.apply(lambda x : add_relationship(x), axis=1)

# Add a new column to define the target_user_id
def target_user(df):
    if (df["relationship"] == "Retweet"):
        return df["mentioned_user_id"]
    elif (df["relationship"] == "Reply"):
        return df["in_reply_to_user_id"]
    else:
        return np.nan

tweets_df["target_user_id"] = tweets_df.apply(lambda x : target_user(x), axis=1)

In [None]:
# drop the "in_reply_to_user_id" and "mentioned_user_id"
tweets_df = tweets_df.drop(["in_reply_to_user_id", "mentioned_user_id"], axis=1)
tweets_df = tweets_df.rename({"author_id": "source_user_id", "id": "tweet_id"}, axis=1)

### Drop all rows with null `target_user_id` as it does not capture any relationship between two users

In [None]:
# Check what is the total unique source_user_id and target_user_id in the dataset
unique_ids = set(np.concatenate((tweets_df["source_user_id"],tweets_df[tweets_df["target_user_id"].notnull()]["target_user_id"])))
print(f"Number of unique source and target user ids: {len(unique_ids)}")

Number of unique source and target user ids: 219395


### Identification of all the source and target user ids without a label
This means that the source and target user id has no user metadata and thus, will be removed from our overall dataset

In [None]:
bot_ids = []
human_ids = []
ids_no_label = []

for id in unique_ids:
    try:
        if (labels[labels["id"]==id]["label"].values[0] == 0):
            human_ids.append(id)
        elif (labels[labels["id"]==id]["label"].values[0] == 1):
            bot_ids.append(id)
            
    # if id doesnt exist in label dataset
    except:
        ids_no_label.append(id)

print(f"Number of bots: {len(bot_ids)}") 
print(f"Number of humans: {len(human_ids)}")
print(f"Number of ids with no label: {len(ids_no_label)}")

Number of bots: 11699
Number of humans: 83697
Number of ids with no label: 123999


## 1.6 Utilise Disproportionate Stratified Sampling to ensure that final dataset includes equal number of bots and humans in both `source_user_id` and `target_user_id`

Although disproportionate sacrifices some precision in the estimate of the majority class, in the context of our project, is great for identifying differences between underrepresented groups (bots) and the majority class (humans)

In [None]:
# Ensure that we have same number of human and bots captured in the tweet dataset
num_bots = len(bot_ids)
num_humans = len(human_ids)
if (num_bots < num_humans):
    sampled_human_ids = sample(human_ids, ceil(1.3*num_bots))
    final_ids = bot_ids + sampled_human_ids
else:
    sampled_bot_ids = sample(bot_ids, ceil(1.3*num_humans))
    final_ids = human_ids + sampled_bot_ids

# Drop any rows where the source_user_id and target_user_id not in final_ids
tweets_df = tweets_df.drop(tweets_df[(tweets_df["target_user_id"].notnull()) & ((~tweets_df["target_user_id"].isin(final_ids)) | (~tweets_df["source_user_id"].isin(final_ids)))].index, axis=0)
tweets_df = tweets_df.drop(tweets_df[(tweets_df["target_user_id"].isnull()) & (~tweets_df["source_user_id"].isin(final_ids))].index, axis=0)

### Final Counts of valid Human and Bot IDs within the dataset

In [None]:
unique_ids = set(np.concatenate((tweets_df["source_user_id"],tweets_df[tweets_df["target_user_id"].notnull()]["target_user_id"])))
print(f"Number of unique source and target user ids: {len(unique_ids)}")

bot_id = 0
human_id = 0
id_no_label = 0

for id in unique_ids:
    try:
        if (labels[labels["id"]==id]["label"].values[0] == 0):
            human_id += 1
        elif (labels[labels["id"]==id]["label"].values[0] == 1):
            bot_id += 1
            
    # if id doesnt exist in label dataset
    except:
        id_no_label += 1
        
print(f"Number of bots: {bot_id}") 
print(f"Number of humans: {human_id}")
print(f"Number of ids with no label: {id_no_label}")

Number of unique source and target user ids: 21457
Number of bots: 10763
Number of humans: 10694
Number of ids with no label: 0


# 2. Querying relevant user metadata based on the final set of sample ids (Section 1)

## 2.1 Accessing MongoDB for valid user's metadata

In [None]:
users = db['users']

# Define your query (empty to fetch all documents)
query = {}

# Specify the fields you want to include (1) or exclude (0)
projection = {
    'id' : 1,
    'created_at' : 1,
    'description' : 1,
    'location' : 1,
    'name' : 1,
    'profile_image_url' : 1, 
    'public_metrics': 1,
    'url' : 1,
    'username' : 1,
    'verified' : 1,
    'entities' : 1
}

# Execute the query
documents = users.find(query, projection)

# Convert the query result to a pandas DataFrame
df = pd.DataFrame(list(documents))

In [None]:
# Normalising nested jsons within the dataframe
df = pd.concat([df, json_normalize(df["entities"])], axis = 1)
df = pd.concat([df, json_normalize(df["public_metrics"])], axis = 1)

## 2.2 Data Preprocessing
- ensuring the correct data type for each feature

In [None]:
#verified
df['verified'] = df['verified'].apply(lambda x: 1 if x else 0)

#urls
df["url.urls"] = df["url.urls"].fillna(0) 
df["url.urls"] = df["url.urls"].apply(lambda x : len(x) if x else 0)

#number of urls in their descriptions
df["description.urls"] = df["description.urls"].fillna(0) 
df["description.urls"] = df["description.urls"].apply(lambda x : len(x) if x else 0)

#number of mentions
df["description.mentions"] = df["description.mentions"].fillna(0) 
df["description.mentions"] = df["description.mentions"].apply(lambda x : len(x) if x else 0)

#number of hashtags
df["description.hashtags"] = df["description.hashtags"].fillna(0) 
df["description.hashtags"] = df["description.hashtags"].apply(lambda x : len(x) if x else 0)

#number of cashtags
df["description.cashtags"] = df["description.cashtags"].fillna(0) 
df["description.cashtags"] = df["description.cashtags"].apply(lambda x : len(x) if x else 0)

df = df.drop(['entities', 'public_metrics'], axis = 1)

#align the user information with source_user_id to serve as the key for join

df["source_user_id"] = df["id"].apply(lambda x : int(x.replace("u", "")))


In [None]:
def convert_mixed_timestamps(ts):
    try:
        # Attempt to convert using the ISO format
        return pd.to_datetime(ts, exact=False, errors='raise')
    except ValueError:
        # Check if ts is not convertable to float (e.g., empty string, '[]')
        try:
            # This will fail if ts is not a valid string representation of a float
            float_ts = float(ts)
        except ValueError:
            # If conversion fails, return None or handle as needed
            return None
        else:
            # If conversion is successful, proceed to convert to datetime
            return pd.to_datetime(float_ts / 1e9, unit='s', utc=True)

## 2.3 Feature Extraction I (User Profile Features)
Refer to the report for the mathematical representations / formulas of each of the Features. The intuition behind the creation of each feature is also detailed in the report with references from relevant scholarship.

1) `Username length`
2) `Name length`
3) `Description length`
4) `Number of digits in username`
5) `Entropy of username`
6) `Entropy of description`
7) `Name and username similarity`
8) `Ratio of length of username to length of name`
10) `Reputation`
11) `Age of the account` 


1, 2 & 3. `Username, name and description length`

In [None]:
df["username_length"] = df["username"].apply(len)
df["name_length"] = df["name"].apply(len)
df["description_length"] = df["description"].apply(len)

4. `Number of digits in username`

In [None]:
df['numDigits_username'] = df['username'].apply(lambda x: sum(c.isdigit() for c in x))

5 & 6. `Entropy of username and description`

In [None]:
import math
from collections import Counter

# Function to calculate entropy of a string
def entropy(s):
    probabilities = [n_x/len(s) for x, n_x in Counter(s).items()]
    e = -sum([p * math.log(p) / math.log(2.0) for p in probabilities])
    return e

df['username_entropy'] = df['username'].apply(entropy)
df['description_entropy'] = df['description'].apply(entropy)

7. `Name and username similarity`

In [None]:
def name_similarity(str1, str2):
    matches = sum(1 for a, b in zip(str1, str2) if a == b)
    total_length = len(str1) + len(str2)
    similarity_score = (2 * matches) / total_length if total_length > 0 else 0
    return similarity_score

df['names_similarity'] = df.apply(lambda x: name_similarity(x['name'], x['username']), axis=1)

8. `Ratio of name length to username length`

In [None]:
df['names_ratio'] = df.apply(lambda x: len(x['username']) / len(x['name']) if len(x['name']) > 0 else 0, axis=1)

9. `Reputation`

In [None]:
df["reputation"] = df['following_count'] / (df['followers_count'] + 1e-9)

10. `Age of the account` 


In [None]:
#current date - user creation date / total number of tweets
#convert created_at to timestamp
df['created_at_timestamp'] = df['created_at'].apply(convert_mixed_timestamps)
current_date = pd.Timestamp.now().tz_localize('UTC') 
df['age_of_account'] = (current_date - df["created_at_timestamp"]).dt.days / df["tweet_count"]

## 2.4 Feature Extraction II (Temporal Features)
1) `Retweet Ratio`
2) `URL Ratio`
3) `Max number of URLs in tweets` 
4) `Tweet Time Standard Deviation`
5) `Mention Ratio` 
6) `Max number of mentions in tweets`
7) `Hashtag Ratio`
8) `Max Number of Hashtags in tweet` 
10) `Average Length of Tweets by user`
11) `Average number of tweets containing URLs`


In [None]:
g = tweets_df

### Data Preprocessing

Converting column `source_user_id` to Integer whilst dropping any invalid entries.

In [None]:
# Identification of problematic rows that are present

problematic_rows = []
for index, row in g.iterrows():
    try:
        _ = int(row['source_user_id']) 
    except ValueError:
        problematic_rows.append(index) 

# Display problematic rows
print("Problematic rows at indices:", problematic_rows)
graph = g.drop(problematic_rows).reset_index(drop = True)

g["source_user_id"] = g["source_user_id"].apply(lambda x: int(x))

Problematic rows at indices: []


Converting `created_at_timestamp` to `datetime` type

In [None]:
#convert to timestamp
g['created_at_timestamp'] = g['created_at'].apply(convert_mixed_timestamps)

Fill NA values for `retweet_count`, `reply_count`, `like_count`, `quote_count` with 0

In [None]:
#Filling NAN values
g['retweet_count'] = g['retweet_count'].fillna(0)
g['reply_count'] = g['reply_count'].fillna(0)
g['like_count'] = g['like_count'].fillna(0)
g['quote_count'] = g['quote_count'].fillna(0)

Filter for only valid users from g (valid `source_user_id` and `target_user_id`)

In [None]:
user_ids = set(np.concatenate((g["source_user_id"],g[g["target_user_id"].notnull()]["target_user_id"])))
user = df[df["source_user_id"].isin(user_ids)]

Join user dataframe with labels dataframe so that each user can be tagged to their label

In [None]:
user = user.merge(labels, left_on="source_user_id", right_on="id", how="inner")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21457 entries, 0 to 21456
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   _id                   21457 non-null  object             
 1   created_at            21457 non-null  object             
 2   description           21457 non-null  object             
 3   id_x                  21457 non-null  object             
 4   location              16243 non-null  object             
 5   name                  21457 non-null  object             
 6   profile_image_url     21457 non-null  object             
 7   url                   21457 non-null  object             
 8   username              21457 non-null  object             
 9   verified              21457 non-null  bool               
 10  url.urls              21457 non-null  int64              
 11  description.urls      21457 non-null  int64              
 12  desc

In [None]:
user = user.drop("_id", axis=1)

1. `Retweet Ratio`

In [None]:
def retweet_ratio(series):
    if 'Retweet' in series.value_counts():
        return (series.value_counts()['Retweet'] + 1) / (len(series) + 3)
    else:
        return 1/(len(series) + 3)

retweet_ratios = pd.DataFrame(g.groupby('source_user_id')['relationship'].agg(retweet_ratio))

retweet_ratios['source_user_id'] = retweet_ratios.index

retweet_ratios.reset_index(drop=True, inplace=True)

retweet_ratios.columns = ['retweet_ratio', 'source_user_id']

10511

In [None]:
# Merging Retweet Ratios into user dataframe
user = pd.merge(user, retweet_ratios, on='source_user_id', how='left')

# fill the null values with the mean retweet ratio
user['retweet_ratio'] = user.groupby('label')['retweet_ratio'].transform(lambda x: x.fillna(x.mean()))

# make sure no null values left
user["retweet_ratio"].isnull().sum()

2. `URL Ratio`

In [None]:
#URL Ratio = number of URLs posted per user / total number of tweets of user 

#Aggregate URLs
url_count = pd.DataFrame(g.groupby('source_user_id')['urls_count'].sum())
url_count['source_user_id'] = url_count.index
url_count.reset_index(drop=True, inplace=True)
url_count.columns = ['url_count_tweets', 'source_user_id']
user = pd.merge(user, url_count, on='source_user_id', how='left')

#calculating the url_ratio
user['url_ratio'] = (user["url.urls"] + user["url_count_tweets"]) / user["tweet_count"]

In [None]:
# fill the null values with the mean url ratio 
user['url_ratio'] = user.groupby('label')['url_ratio'].transform(lambda x: x.fillna(x.mean()))

# make sure no null values left
user["url_ratio"].isnull().sum()

0

3. `Max Number of URLs per tweet`

In [None]:
#Aggregate URLs
url_max = pd.DataFrame(g.groupby('source_user_id')['urls_count'].max())
url_max['source_user_id'] = url_max.index
url_max.reset_index(drop=True, inplace=True)
url_max.columns = ['url_max_tweets', 'source_user_id']
user = pd.merge(user, url_max, on='source_user_id', how='left')

In [None]:
# fill the null values with the mean max number of URLs per tweet 
user['url_max_tweets'] = user.groupby('label')['url_max_tweets'].transform(lambda x: x.fillna(x.mean()))

# make sure no null values left
user["url_max_tweets"].isnull().sum()

0

4. `Tweet Time Standard Deviation`

In [None]:
def tweet_time_sd(series):
    if len(series) < 2:
        return np.nan
    time_diffs = series.diff().dt.total_seconds()[1:]

    mean_time_diff = time_diffs.mean()

    sd = (time_diffs - mean_time_diff) ** 2

    mean = np.sqrt(sd.mean()) / 86400

    return mean 

tweet_time_SD = pd.DataFrame(g.groupby('source_user_id')['created_at_timestamp'].agg(tweet_time_sd))
tweet_time_SD['source_user_id'] = tweet_time_SD.index
tweet_time_SD.reset_index(drop=True, inplace=True)
tweet_time_SD.columns = ['time_interval_sd_day', 'source_user_id']

user = pd.merge(user, tweet_time_SD, on='source_user_id', how='left')

In [None]:
# fill the null values with the mean tweet time std 
user['time_interval_sd_day'] = user.groupby('label')['time_interval_sd_day'].transform(lambda x: x.fillna(x.mean()))

# make sure no null values left
user["time_interval_sd_day"].isnull().sum()

0

5. `Mention Ratio`

In [None]:
#Mention Ratio = number of Mentions / total number of tweets of user u

#Aggregate Mentions
mention_count = pd.DataFrame(g.groupby('source_user_id')['user_mentions_count'].sum())
mention_count['source_user_id'] = mention_count.index
mention_count.reset_index(drop=True, inplace=True)
mention_count.columns = ['mention_count_tweets', 'source_user_id']

#Addition into user
user = pd.merge(user, mention_count, on='source_user_id', how='left')

In [None]:
# fill the null values with the mean mention ratio 
user['mention_count_tweets'] = user.groupby('label')['mention_count_tweets'].transform(lambda x: x.fillna(x.mean()))

# make sure no null values left
user["mention_count_tweets"].isnull().sum()

0

6. `Max Number of Mention per tweet`

In [None]:
#Aggregate Mentions
mention_max = pd.DataFrame(g.groupby('source_user_id')['user_mentions_count'].max())
mention_max['source_user_id'] = mention_max.index
mention_max.reset_index(drop=True, inplace=True)
mention_max.columns = ['mention_max_tweets', 'source_user_id']
user = pd.merge(user, mention_max, on='source_user_id', how='left')

In [None]:
# fill the null values with the mean max number of mention per tweet 
user['mention_max_tweets'] = user.groupby('label')['mention_max_tweets'].transform(lambda x: x.fillna(x.mean()))

# make sure no null values left
user["mention_max_tweets"].isnull().sum()

0

7. `Hashtags Ratio`

In [None]:
#Hashtags Ratio = number of Mentions / total number of tweets of user u

#Aggregate Mentions
hashtag_count = pd.DataFrame(g.groupby('source_user_id')['hashtags_count'].sum())
hashtag_count['source_user_id'] = hashtag_count.index
hashtag_count.reset_index(drop=True, inplace=True)
hashtag_count.columns = ['hashtag_count_tweets', 'source_user_id']

#Addition into user
user = pd.merge(user, hashtag_count, on='source_user_id', how='left')

In [None]:
# fill the null values with the mean hashtag ratio 
user['hashtag_count_tweets'] = user.groupby('label')['hashtag_count_tweets'].transform(lambda x: x.fillna(x.mean()))

# make sure no null values left
user["hashtag_count_tweets"].isnull().sum()

0

8. `Max Number of Hashtags per tweet`

In [None]:
#Aggregate Hashtags
hashtags_max = pd.DataFrame(g.groupby('source_user_id')['hashtags_count'].max())
hashtags_max['source_user_id'] = hashtags_max.index
hashtags_max.reset_index(drop=True, inplace=True)
hashtags_max.columns = ['hashtags_max_tweets', 'source_user_id']
user = pd.merge(user, hashtags_max, on='source_user_id', how='left')

In [None]:
# fill the null values with the mean max number of hashtag per tweet 
user['hashtags_max_tweets'] = user.groupby('label')['hashtags_max_tweets'].transform(lambda x: x.fillna(x.mean()))

# make sure no null values left
user["hashtags_max_tweets"].isnull().sum()

0

9 & 10. `Average tweet length`
`Number of tweets`
`Number of tweets with URLs`

In [None]:
# total tweet count
tweet_counts = g.groupby('source_user_id').size().reset_index(name='tweet_count')

# average length
g['tweet_length'] = g['text'].str.len()
avg_tweet_length = g.groupby('source_user_id')['tweet_length'].mean().reset_index(name='avg_tweet_length')

def contains_anything(lst):
    if isinstance(lst, list):
        return len(lst) > 0
    else:
        return False

tweets_with_url_counts = g[g['urls_count']>0].groupby('source_user_id').size().reset_index(name='url_tweet_count')

In [None]:
# Joining user, tweet_counts, avg_tweet_length, tweet_with_url_counts
user = pd.merge(user, tweet_counts, on='source_user_id', how='left')
user = pd.merge(user, avg_tweet_length, on='source_user_id', how='left')
user = pd.merge(user, tweets_with_url_counts, on='source_user_id', how='left')

In [None]:
user.rename(columns={'tweet_count_y': 'tweet_count'}, inplace=True)
user = user.drop("tweet_count",axis=1)
user = user.rename({'tweet_count_x': 'tweet_count'}, axis=1)

In [None]:
cols = ["url_count_tweets","avg_tweet_length","url_tweet_count"]

# fill the null values with the mean value 
for col in cols:
    user[col] = user.groupby('label')[col].transform(lambda x: x.fillna(x.mean()))

# make sure no null values left
user.isnull().sum()

created_at                 0
description                0
id_x                       0
location                5214
name                       0
profile_image_url          0
url                        0
username                   0
verified                   0
url.urls                   0
description.urls           0
description.mentions       0
description.hashtags       0
description.cashtags       0
followers_count            0
following_count            0
tweet_count                0
listed_count               0
source_user_id             0
username_length            0
name_length                0
description_length         0
numDigits_username         0
username_entropy           0
description_entropy        0
names_similarity           0
names_ratio                0
reputation                 0
created_at_timestamp       0
age_of_account             0
id_y                       0
label                      0
retweet_ratio              0
url_count_tweets           0
url_ratio     

## 2.5 Feature Learning (Tweet Embedding) 

### Data Preprocessing
- Tokenisation
- Lemmatisation

In [None]:
#preprocess tweets

import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize tokenizer and lemmatizer
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Data cleaning, tokenization and lemmatization
def preprocess_tweets(tweet):
    tweet_cleaned = re.sub(r'https?://[^ ]+', '', tweet)  # Remove URLs
    tweet_cleaned = re.sub(r'@\w+', '', tweet_cleaned)    # Remove mentions
    tweet_cleaned = re.sub(r'RT', '', tweet_cleaned)      # Remove RT
    tweet_cleaned = tweet_cleaned.lower()                 # Convert to lowercase
    tweet_cleaned = re.sub(r'[^\w\s]', '', tweet_cleaned) # Remove punctuation

    # Tokenization
    tokens = tokenizer.tokenize(tweet_cleaned)

    # Stop words removal and lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return tokens

g["tokens"] = g["text"].apply(preprocess_tweets)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hengboonlong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hengboonlong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Obtain word embeddings using gloVE library (tailored to Twitter Data)

In [None]:
import gensim
import gensim.downloader as api

# Load the GloVe model (choose the desired model size, e.g., 'glove-twitter-100' for Twitter data)
glove_model = api.load('glove-twitter-100')

def get_tweet_embedding(tweet_tokens, glove_model):

    # Retrieve GloVe embeddings for each token
    embeddings = [glove_model[token] for token in tweet_tokens if token in glove_model]

    # Handle tweets with no tokens found in the GloVe model
    if not embeddings:
        return None

    # Aggregate the embeddings, e.g., by averaging
    tweet_embedding = sum(embeddings) / len(embeddings)
    return tweet_embedding

g["tweet_embedding"] = g["tokens"].apply(lambda x : get_tweet_embedding(x, glove_model))

## 3. Save the graph and user dataset

In [None]:
# keep only necessary columns 
cols_to_keep = ["source_user_id","target_user_id","relationship","tweet_embedding"]
g_filtered = g[cols_to_keep]

In [None]:
# remove all null target_user_id from graph as we cannot form an edge
g_filtered = g_filtered.drop(g_filtered[g_filtered["target_user_id"].isnull()].index, axis=0)

In [None]:
# drop unnecessary columns
user = user.drop(["id_x","id_y"],axis=1)

In [None]:
# normalisation (max-min normalisation)
cols = ['reputation', 'retweet_ratio',
       'url_count_tweets', 'url_ratio', 'url_max_tweets',
       'time_interval_sd_day', 'mention_count_tweets', 'mention_max_tweets',
       'hashtag_count_tweets', 'hashtags_max_tweets', 'age_of_account']

for col in cols:
    user[col] = (user[col] - user[col].min()) / (user[col].max() - user[col].min())

### [MISC] Checks to ensure that all `source_user_id` and `target_user_id` in graph dataset are included in user dataset

In [None]:
g_filtered["source_user_id"].isin(user["source_user_id"]).sum() == len(g_filtered["source_user_id"])

True

In [None]:
g_filtered["source_user_id"].isin(user["source_user_id"]).sum() == len(g_filtered["source_user_id"])

True

### [MISC] Checks for distribution of humans and bots in source and target user id

In [None]:
source_ids = set(g_filtered["source_user_id"])
print(f"Number of unique source user ids: {len(source_ids)}")
print("-----------------------------------------")

source_bot_id = 0
source_human_id = 0

for id in source_ids:
    try:
        if (labels[labels["id"]==id]["label"].values[0] == 0):
            source_human_id += 1
        elif (labels[labels["id"]==id]["label"].values[0] == 1):
            source_bot_id += 1
            
    # if id doesnt exist in label dataset
    except:
        continue
        
print(f"Number of bots: {source_bot_id}") 
print(f"Number of humans: {source_human_id}")
print("\n")


target_ids = set(g_filtered["target_user_id"])
print(f"Number of unique target user ids: {len(target_ids)}")
print("-----------------------------------------")

target_bot_id = 0
target_human_id = 0

for id in target_ids:
    try:
        if (labels[labels["id"]==id]["label"].values[0] == 0):
            target_human_id += 1
        elif (labels[labels["id"]==id]["label"].values[0] == 1):
            target_bot_id += 1
            
    # if id doesnt exist in label dataset
    except:
        continue
        
print(f"Number of bots: {target_bot_id}") 
print(f"Number of humans: {target_human_id}")

Number of unique source user ids: 7997
-----------------------------------------
Number of bots: 6864
Number of humans: 1133


Number of unique target user ids: 15175
-----------------------------------------
Number of bots: 5319
Number of humans: 9856


### Number of human and bots captured in this graph dataset

In [None]:
user["label"].value_counts()

label
1    10763
0    10694
Name: count, dtype: int64

### Save the final user dataset

In [None]:
# drop problematic columns
user = user.drop(["description","created_at","location","name","profile_image_url","url","username"], axis=1)
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21457 entries, 0 to 21456
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   verified              21457 non-null  bool               
 1   url.urls              21457 non-null  int64              
 2   description.urls      21457 non-null  int64              
 3   description.mentions  21457 non-null  int64              
 4   description.hashtags  21457 non-null  int64              
 5   description.cashtags  21457 non-null  int64              
 6   followers_count       21457 non-null  int64              
 7   following_count       21457 non-null  int64              
 8   tweet_count           21457 non-null  int64              
 9   listed_count          21457 non-null  int64              
 10  source_user_id        21457 non-null  int64              
 11  username_length       21457 non-null  int64              
 12  name

In [None]:
tweet_num = 0 # change to which tweet.json you are using
user.to_csv(f"user_{tweet_num}.csv", index=False)

### Save the final graph dataset

In [None]:
g_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 43830 entries, 40 to 653128
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   source_user_id   43830 non-null  int64  
 1   target_user_id   43830 non-null  float64
 2   relationship     43830 non-null  object 
 3   tweet_embedding  41275 non-null  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 1.7+ MB


In [None]:
tweet_num = 0 # change to which tweet.json you are using
g_filtered.to_csv(f"graph_{tweet_num}.csv", index=False, )