In [7]:
# Import Libraries and Data
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn
import scorecardpy as sc
import warnings 

warnings.simplefilter(action='ignore', category=FutureWarning)

df = pd.read_csv("/Users/wenwei/Documents/Sku/y4s1/bt4222/project/twitter_human_bots_dataset.csv")
df.head()



### Provided Features
* created_at: timestamp
    * Day account was created

* default_profile: Boolean
    * Indicating whether the account has a default profile

* default_profile_image: Boolean
    * Indicating whether the account has a default image profile

* description: String
    * User account description

* favourites_count: Int
    * Total number of favourite tweets

* followers_count: Int
    * Total number of followers

* friend_count: Int
    * Total number of friends (people who follow back)

* geo_enabled: Boolean
    * Indicating whether the account has the geographic location enabled

* id: string
    * unique identifier of the account

* lang: string
    * Language of the account

* location: 
    * Location of the account

* profile_background: string
    * Profile background image url

* profile_image_url: String
    * Profile image URL

* screen_name: string
    * username

* statuses_count: int
    * Total number of tweets

* verified: Boolean
    * Indicating whether the account has been verified

* average_tweets_per_day: int
    * Average tweets posted per day (statuses_count / account_age_day)

* account_age_day: int
    * Account age measured in days

* account_type: binary
    * account type, bot or human

### Transformed Features to consider:
* Sentiment analysis for description
* Number of mentions in description to other bot accounts
* Length of description
* Ratio Followers_count to friends count
* Ratio of Followers_count to tweets per day
* Ratio of friends to tweets per day
* Ratio of tweets since account created
* Time when account was created (Past midnight of timezone, obtained from location)
* Standard deviation of avg tweets from avg tweets of bots

### Total Features: 18 (provided) + 9 (created)




# Structure of ML Process

## Data
* Data Cleaning, Imputation, etc..
* Train Test split (based on account creation date)
* 2006-2017 used to train,test validate model
* 2018-2019, out of sample testing, to test how well model performs with more recent data

## Features
* Feature Creation
* Feature Reduction (using Weight of Evidence(woe) check for feature importance, Correlation)
* Run feature selection using tree base algo (random forest or smth)

## Model training

### Tree based models
* XGB, LGBM, RF, ETC, DT

### Neural Networks
* NN, autoencoders(?), GNN

---

### Get Shap for model explanability both local and global
* Tune model on best model selected (either on precision/f1/recall and feature blend)

### Test model on out of sample set
* Get metrics(acc,precision,f1, etc..) from out of sample set

### Dashboard(?) --enhancement
* Transfer results to a dashboard

In [2]:
# Checking for Null Values
missing_values = df.isnull().sum()
missing_values

Unnamed: 0                         0
created_at                         0
default_profile                    0
default_profile_image              0
description                     7257
favourites_count                   0
followers_count                    0
friends_count                      0
geo_enabled                        0
id                                 0
lang                            7957
location                           4
profile_background_image_url    4499
profile_image_url                  1
screen_name                        0
statuses_count                     0
verified                           0
average_tweets_per_day             0
account_age_days                   0
account_type                       0
dtype: int64

### How WOE works:
Run the features you want to conduct weight of evidence on. For example, I put the features in temp_list and then created a dataframe containing only those features in df_tmp.

* Step 1: Replace temp_list with your features
* Step 2: Run code chunk below
* Step 3: Look at the WOE score for each bin of each feature to interpret WOE results

How to Interpret WOE scores
Negative or Positive Values indicate the direction of the feature while the magnitude indicate the strength of the feature.
E.g For the feature "statuses_count", if the woe score for one of its bin is 0.5 it indicates that for that bin, the feature "statuses_count" identifies many bots.
Conversely if the woe score for one of its bin is -0.7, it means that the feature is not good a predicting bots.

Also do look at the IV scores to determine the features contribution. Generally an IV of >0.02 indicates weak predictive power while >0.5 indicates strong predictive power.

In [11]:
temp_lst = ['favourites_count', 'followers_count','friends_count','account_type','statuses_count']
#Transforming Bot/Human to 1s and 0s
df['account_type'] = df['account_type'].replace({'bot': 1, 'human': 0})

# Replace 'target' with actual column name
y = df['account_type']  
# Drop target to leave only features
X = df.drop(columns=['account_type'])  

df_tmp = df[temp_lst]
# Conduct the binning process using woebin
bins = sc.woebin(df_tmp, y='account_type', bins = 5)

# Display the binning information (optional to view WoE values per variable)
for key in bins.keys():
    print(f"Binning for {key}:")
    print(bins[key])


[INFO] creating woe binning ...
Binning for statuses_count:
         variable               bin  count  count_distr   good   bad  \
0  statuses_count     [-inf,4000.0)  18354     0.490251   9254  9100   
1  statuses_count  [4000.0,65000.0)  16356     0.436882  13827  2529   
2  statuses_count     [65000.0,inf)   2728     0.072867   1932   796   

    badprob       woe    bin_iv  total_iv   breaks  is_special_values  
0  0.495805  0.682904  0.247503  0.598909   4000.0              False  
1  0.154622 -0.999114  0.348942  0.598909  65000.0              False  
2  0.291789 -0.187027  0.002464  0.598909      inf              False  
Binning for favourites_count:
           variable              bin  count  count_distr   good    bad  \
0  favourites_count    [-inf,3000.0)  22524     0.601635  11704  10820   
1  favourites_count  [3000.0,5000.0)   2707     0.072306   2167    540   
2  favourites_count     [5000.0,inf)  12207     0.326059  11142   1065   

    badprob       woe    bin_iv  tot

In [4]:

df.shape

#Transforming features
#tagging of description, construct some sort NLA
#text analysis for description
#ratio followers_count to friends count
#ratio of followers to tweets per day
# ratio of friends to tweets per day
#length of descriptions
"""
        'created_at', 'default_profile', 'default_profile_image',
       'description', 'favourites_count', 'followers_count', 'friends_count',
       'geo_enabled', 'id', 'lang', 'location', 'profile_background_image_url',
       'profile_image_url', 'screen_name', 'statuses_count', 'verified',
       'average_tweets_per_day', 'account_age_days', 'account_type'

"""



"\n        'created_at', 'default_profile', 'default_profile_image',\n       'description', 'favourites_count', 'followers_count', 'friends_count',\n       'geo_enabled', 'id', 'lang', 'location', 'profile_background_image_url',\n       'profile_image_url', 'screen_name', 'statuses_count', 'verified',\n       'average_tweets_per_day', 'account_age_days', 'account_type'\n\n"

In [7]:
df['year'] = pd.to_datetime(df['created_at']).dt.year
year_distribution = df['year'].value_counts().sort_index()
year_distribution