In [3]:
# Saves the installed requirements to a requirements file
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Installs the requirements from the requirements file
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Import Libraries and Data
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn
import scorecardpy as sc
import warnings 
import re


warnings.simplefilter(action='ignore', category=FutureWarning)

df = pd.read_csv("twitter_human_bots_dataset.csv")
df.head()



Unnamed: 0.1,Unnamed: 0,created_at,default_profile,default_profile_image,description,favourites_count,followers_count,friends_count,geo_enabled,id,lang,location,profile_background_image_url,profile_image_url,screen_name,statuses_count,verified,average_tweets_per_day,account_age_days,account_type
0,0,2016-10-15 21:32:11,False,False,"Blame @xaiax, Inspired by @MakingInvisible, us...",4,1589,4,False,787405734442958848,en,unknown,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/7874121826...,best_in_dumbest,11041,False,7.87,1403,bot
1,1,2016-11-09 05:01:30,False,False,Photographing the American West since 1980. I ...,536,860,880,False,796216118331310080,en,Estados Unidos,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/8023296328...,CJRubinPhoto,252,False,0.183,1379,human
2,2,2017-06-17 05:34:27,False,False,Scruffy looking nerf herder and @twitch broadc...,3307,172,594,True,875949740503859204,en,"Los Angeles, CA",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1278890453...,SVGEGENT,1001,False,0.864,1159,human
3,3,2016-07-21 13:32:25,True,False,Wife.Godmother.Friend.Feline Fanatic! Assistan...,8433,517,633,True,756119643622735875,en,"Birmingham, AL",,http://pbs.twimg.com/profile_images/1284884924...,TinkerVHELPK5,1324,False,0.889,1489,human
4,4,2012-01-15 16:32:35,False,False,Loan coach at @mancity & Aspiring DJ,88,753678,116,True,464781334,en,"England, United Kingdom",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/9952566258...,JoleonLescott,4202,True,1.339,3138,human


### Provided Features
* created_at: timestamp
    * Day account was created

* default_profile: Boolean
    * Indicating whether the account has a default profile

* default_profile_image: Boolean
    * Indicating whether the account has a default image profile

* description: String
    * User account description

* favourites_count: Int
    * Total number of favourite tweets

* followers_count: Int
    * Total number of followers

* friend_count: Int
    * Total number of friends (people who follow back)

* geo_enabled: Boolean
    * Indicating whether the account has the geographic location enabled

* id: string
    * unique identifier of the account

* lang: string
    * Language of the account

* location: 
    * Location of the account

* profile_background: string
    * Profile background image url

* profile_image_url: String
    * Profile image URL

* screen_name: string
    * username

* statuses_count: int
    * Total number of tweets

* verified: Boolean
    * Indicating whether the account has been verified

* average_tweets_per_day: int
    * Average tweets posted per day (statuses_count / account_age_day)

* account_age_day: int
    * Account age measured in days

* account_type: binary
    * account type, bot or human

### Transformed Features to consider:
* Sentiment analysis for description
* Number of mentions in description to other bot accounts
* Length of description
* Ratio Followers_count to friends count
* Ratio of Followers_count to tweets per day
* Ratio of friends to tweets per day
* Ratio of tweets since account created
* Time when account was created (Past midnight of timezone, obtained from location)
* Standard deviation of avg tweets from avg tweets of bots

### Total Features: 18 (provided) + 9 (created)




In [6]:
df.columns

Index(['Unnamed: 0', 'created_at', 'default_profile', 'default_profile_image',
       'description', 'favourites_count', 'followers_count', 'friends_count',
       'geo_enabled', 'id', 'lang', 'location', 'profile_background_image_url',
       'profile_image_url', 'screen_name', 'statuses_count', 'verified',
       'average_tweets_per_day', 'account_age_days', 'account_type'],
      dtype='object')

Length of Description Feature

In [17]:
df['description_length'] = df['description'].apply(lambda x: len(str(x)) if pd.notnull(x) else 0)
print(df['description_length'].describe())
print(df['description_length'].isna().sum())

count    37438.000000
mean        66.878092
std         55.092550
min          0.000000
25%         14.000000
50%         58.000000
75%        118.000000
max        190.000000
Name: description_length, dtype: float64
0


Ratio of Followers_count to tweets per day

In [23]:
df['followers_to_friends_ratio'] = df['followers_count'] / df['friends_count']
print(df['followers_to_friends_ratio'].describe())
print(df['followers_to_friends_ratio'].value_counts())
print(f"\nNumber of NA: {df['followers_to_friends_ratio'].isna().sum()}")

count    3.640400e+04
mean              inf
std               NaN
min      0.000000e+00
25%      5.224889e-01
50%      1.708792e+00
75%      1.042999e+03
max               inf
Name: followers_to_friends_ratio, dtype: float64
followers_to_friends_ratio
inf            5376
0.000000        221
1.000000         85
0.500000         64
0.333333         59
               ... 
3528.268145       1
602.938852        1
1.043967          1
1.221818          1
450.684211        1
Name: count, Length: 26700, dtype: int64

Number of NA: 1034


  sqr = _ensure_numeric((avg - values) ** 2)


Ratio of Followers_count to tweets per day

In [13]:
df['followers_to_tweets_per_day_ratio'] = df['followers_count'] / df['average_tweets_per_day']
print(df['followers_to_tweets_per_day_ratio'].describe())

count    3.742300e+04
mean              inf
std               NaN
min      0.000000e+00
25%      4.651163e+01
50%      2.173913e+02
75%      2.794128e+03
max               inf
Name: followers_to_tweets_per_day_ratio, dtype: float64


  sqr = _ensure_numeric((avg - values) ** 2)


Ratio of friends to tweets per day

In [12]:
df['friends_to_tweets_per_day_ratio'] = df['friends_count'] / df['average_tweets_per_day']
print(df['friends_to_tweets_per_day_ratio'].describe())

count    3.741100e+04
mean              inf
std               NaN
min      0.000000e+00
25%      1.927754e+01
50%      1.250562e+02
75%      4.652773e+02
max               inf
Name: friends_to_tweets_per_day_ratio, dtype: float64


  sqr = _ensure_numeric((avg - values) ** 2)


In [21]:
def extract_mentions(description):
    mentions = re.findall(r'@\w+', str(description))
    return mentions

# Apply the function to the description column to create a new column with the mentions
df['mentions'] = df['description'].apply(extract_mentions)

# Feature: Count of mentions
df['mention_count'] = df['mentions'].apply(len)

# Display the dataframe with the new columns
print(df[['description', 'mentions', 'mention_count' ]].head())


                                         description  \
0  Blame @xaiax, Inspired by @MakingInvisible, us...   
1  Photographing the American West since 1980. I ...   
2  Scruffy looking nerf herder and @twitch broadc...   
3  Wife.Godmother.Friend.Feline Fanatic! Assistan...   
4               Loan coach at @mancity & Aspiring DJ   

                     mentions  mention_count  
0  [@xaiax, @MakingInvisible]              2  
1                          []              0  
2           [@twitch, @gmail]              2  
3                          []              0  
4                  [@mancity]              1  


In [22]:
df['followers_to_friends_ratio'].fillna(0, inplace=True)
df['followers_to_tweets_per_day_ratio'].fillna(0, inplace=True)
df['friends_to_tweets_per_day_ratio'].fillna(0, inplace=True)

# Structure of ML Process

## Data
* Data Cleaning, Imputation, etc..
* Train Test split (based on account creation date)
* 2006-2017 used to train,test validate model
* 2018-2019, out of sample testing, to test how well model performs with more recent data

## Features
* Feature Creation
* Feature Reduction (using Weight of Evidence(woe) check for feature importance, Correlation)
* Run feature selection using tree base algo (random forest or smth)

## Model training

### Tree based models
* XGB, LGBM, RF, ETC, DT

### Neural Networks
* NN, autoencoders(?), GNN

---

### Get Shap for model explanability both local and global
* Tune model on best model selected (either on precision/f1/recall and feature blend)

### Test model on out of sample set
* Get metrics(acc,precision,f1, etc..) from out of sample set

### Dashboard(?) --enhancement
* Transfer results to a dashboard

In [23]:
# Checking for Null Values
missing_values = df.isnull().sum()
missing_values

Unnamed: 0                              0
created_at                              0
default_profile                         0
default_profile_image                   0
description                          7256
favourites_count                        0
followers_count                         0
friends_count                           0
geo_enabled                             0
id                                      0
lang                                 7957
location                                3
profile_background_image_url         4499
profile_image_url                       1
screen_name                             0
statuses_count                          0
verified                                0
average_tweets_per_day                  0
account_age_days                        0
account_type                            0
description_length                      0
followers_to_friends_ratio              0
followers_to_tweets_per_day_ratio       0
friends_to_tweets_per_day_ratio   

In [24]:
df['followers_to_friends_ratio'].fillna(0, inplace=True)
df['followers_to_tweets_per_day_ratio'].fillna(0, inplace=True)
df['friends_to_tweets_per_day_ratio'].fillna(0, inplace=True)

### How WOE works:
Run the features you want to conduct weight of evidence on. For example, I put the features in temp_list and then created a dataframe containing only those features in df_tmp.

* Step 1: Replace temp_list with your features
* Step 2: Run code chunk below
* Step 3: Look at the WOE score for each bin of each feature to interpret WOE results

How to Interpret WOE scores
Negative or Positive Values indicate the direction of the feature while the magnitude indicate the strength of the feature.
E.g For the feature "statuses_count", if the woe score for one of its bin is 0.5 it indicates that for that bin, the feature "statuses_count" identifies many bots.
Conversely if the woe score for one of its bin is -0.7, it means that the feature is not good a predicting bots.

Also do look at the IV scores to determine the features contribution. Generally an IV of >0.02 indicates weak predictive power while >0.5 indicates strong predictive power.

In [25]:
temp_lst = ['favourites_count', 'followers_count','account_type','mention_count']
#Transforming Bot/Human to 1s and 0s
df['account_type'] = df['account_type'].replace({'bot': 1, 'human': 0})

# Replace 'target' with actual column name
y = df['account_type']  
# Drop target to leave only features
X = df.drop(columns=['account_type'])  

df_tmp = df[temp_lst]
# Conduct the binning process using woebin
bins = sc.woebin(df_tmp, y='account_type', bins = 5)

# Display the binning information (optional to view WoE values per variable)
for key in bins.keys():
    print(f"Binning for {key}:")
    print(bins[key])


[INFO] creating woe binning ...
Binning for mention_count:
        variable         bin  count  count_distr   good    bad   badprob  \
0  mention_count  [-inf,1.0)  31088     0.830386  19724  11364  0.365543   
1  mention_count   [1.0,2.0)   3841     0.102596   3063    778  0.202551   
2  mention_count   [2.0,inf)   2509     0.067017   2226    283  0.112794   

        woe    bin_iv  total_iv breaks  is_special_values  
0  0.148299  0.018694  0.149074    1.0              False  
1 -0.670738  0.040137  0.149074    2.0              False  
2 -1.362830  0.090243  0.149074    inf              False  
Binning for favourites_count:
           variable              bin  count  count_distr   good    bad  \
0  favourites_count    [-inf,3000.0)  22524     0.601635  11704  10820   
1  favourites_count  [3000.0,5000.0)   2707     0.072306   2167    540   
2  favourites_count     [5000.0,inf)  12207     0.326059  11142   1065   

    badprob       woe    bin_iv  total_iv  breaks  is_special_values 

In [26]:

df.shape

#Transforming features
#tagging of description, construct some sort NLA
#text analysis for description
#ratio followers_count to friends count
#ratio of followers to tweets per day
# ratio of friends to tweets per day
#length of descriptions
"""
        'created_at', 'default_profile', 'default_profile_image',
       'description', 'favourites_count', 'followers_count', 'friends_count',
       'geo_enabled', 'id', 'lang', 'location', 'profile_background_image_url',
       'profile_image_url', 'screen_name', 'statuses_count', 'verified',
       'average_tweets_per_day', 'account_age_days', 'account_type'

"""



"\n        'created_at', 'default_profile', 'default_profile_image',\n       'description', 'favourites_count', 'followers_count', 'friends_count',\n       'geo_enabled', 'id', 'lang', 'location', 'profile_background_image_url',\n       'profile_image_url', 'screen_name', 'statuses_count', 'verified',\n       'average_tweets_per_day', 'account_age_days', 'account_type'\n\n"

In [27]:
df['year'] = pd.to_datetime(df['created_at']).dt.year
year_distribution = df['year'].value_counts().sort_index()
year_distribution

2006      13
2007     354
2008    1447
2009    7598
2010    4668
2011    5550
2012    4594
2013    3013
2014    2476
2015    2140
2016    2276
2017    2445
2018     835
2019      29
Name: year, dtype: int64