## Load dataset and compute features

In [1]:
import json
import datetime
import pandas as pd
with open("./datasets/vendor-purchased-2019/vendor-purchased-2019_tweets.json", "r") as f:
    JSON = json.load(f)
print(len(JSON))

dataset = []
for data in JSON:
    instance = dict()
    acc_feat = dict()
    
    UTC = data["created_at"]
    user = data["user"]
    
    #Get "friends/follower_ratio" feature
    if user["followers_count"] == 0 :
        ratio = 0
    else:
        ratio = user["friends_count"]/(user["followers_count"]**2)
        
    acc_feat["friends/follower_ratio"] = ratio
    acc_feat['bot_in_biography'] = int(type(user['description']) is str and 'bot' in user['description'].lower())
    acc_feat['bot_in_name'] = int('bot' in user['screen_name'].lower())
    
    acc_feat['has_name'] = int(user['name'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['has_image'] = int(user['default_profile_image'] != 1)
    acc_feat['has_address'] = int(user['location'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['has_biography'] = int(user['description'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['followers_ge_30'] = int(user['followers_count'] >= 30)
    acc_feat['belongs_to_a_list'] = int(user['listed_count'] > 0)
    acc_feat['url_in_profile'] = int(user['url'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    #acc_feat['followers_2_times_ge_friends'] = int(2 * user['followers_count'] >= user['friends_count'])
    
    #acc_feat['ratio_friends_followers_around_100'] = int(user['followers_count'] > 0 and 80.0 <= float(user['friends_count']) / user['followers_count'] >= 120.0)
    
    acc_feat['ratio_friends_followers_ge_50'] = int(user['followers_count'] > 0 and float(user['friends_count']) / user['followers_count'] >= 50)
    acc_feat['default_image_after_2_month'] = int(user['default_profile_image'] == 1 and (datetime.datetime.now() - datetime.datetime.strptime(user['created_at'],'%a %b %d %H:%M:%S +0000 %Y')) > datetime.timedelta(weeks=4)) 
    acc_feat['friends_ge_100'] = int(user['friends_count'] >= 100)
    acc_feat['no_bio'] = int(user['description'] in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['no_location'] = int(user['location'] in ['NULL', 'NaN', '', ' ', pd.np.nan])
    
    acc_feat['nb_friends'] = int(user['friends_count'])
    #acc_feat['ratio_friends_followers_square'] = float(user['friends_count']) / pow(user['followers_count'], 2) if user['followers_count'] > 0 else 0
    #acc_feat['age'] = (datetime.datetime.now() - datetime.datetime.strptime(user['created_at'],'%a %b %d %H:%M:%S +0000 %Y')).total_seconds()
    #acc_feat['following_rate'] = float(user['friends_count']) / acc_feat['age']
    
    
    instance["id"] = user["id"]
    instance["screen_name"] = user["screen_name"]
    instance["features"] = acc_feat
    

    dataset.append(instance)
    
print(dataset[0])

1088
{'id': 53805021, 'screen_name': 'guowei534', 'features': {'friends/follower_ratio': 39.888888888888886, 'bot_in_biography': 0, 'bot_in_name': 0, 'has_name': 1, 'has_image': 0, 'has_address': 0, 'has_biography': 0, 'followers_ge_30': 0, 'belongs_to_a_list': 0, 'url_in_profile': 1, 'ratio_friends_followers_ge_50': 1, 'default_image_after_2_month': 1, 'friends_ge_100': 1, 'no_bio': 1, 'no_location': 1, 'nb_friends': 359}}


  acc_feat['has_name'] = int(user['name'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
  acc_feat['has_address'] = int(user['location'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
  acc_feat['has_biography'] = int(user['description'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
  acc_feat['url_in_profile'] = int(user['url'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
  acc_feat['no_bio'] = int(user['description'] in ['NULL', 'NaN', '', ' ', pd.np.nan])
  acc_feat['no_location'] = int(user['location'] in ['NULL', 'NaN', '', ' ', pd.np.nan])


In [2]:
with open("./datasets/vendor-purchased-2019/vendor-purchased-2019.tsv", "r") as f :
    TSV = f.readlines()
print(len(TSV))    

labels = []
for label in TSV:
    #All user_ids in TSV are bot.
    bot_user_id = label.split()[0]
    labels.append(int(bot_user_id))

print(labels[0])

1088
736629545512632320


In [3]:
import pandas as pd

original_df = pd.DataFrame(dataset)                                        
original_df.to_pickle("bot_features.pkl")

In [4]:
import pandas as pd

#Load N features and add Label, and make label y
bot_features = pd.read_pickle("bot_features.pkl")
bot_number = len(bot_features)

print(bot_features)

                      id      screen_name  \
0               53805021        guowei534   
1               66317516          Aridego   
2              483428005  FinalFantasyRef   
3              512754110   sherif35040661   
4             2583514062      ProphetNyce   
...                  ...              ...   
1083          1409637493  IsabellaManetti   
1084           623584562      FlirtyNotes   
1085          2906301805        EraseSoul   
1086  948314138886987777          div1rts   
1087          3434325791    loa_thesecret   

                                               features  
0     {'friends/follower_ratio': 39.888888888888886,...  
1     {'friends/follower_ratio': 0.16568559556786702...  
2     {'friends/follower_ratio': 0.1044140625, 'bot_...  
3     {'friends/follower_ratio': 0.5298765432098765,...  
4     {'friends/follower_ratio': 183.5, 'bot_in_biog...  
...                                                 ...  
1083  {'friends/follower_ratio': 1.7397105268315216e.

In [5]:
print(bot_features["features"][0])

{'friends/follower_ratio': 39.888888888888886, 'bot_in_biography': 0, 'bot_in_name': 0, 'has_name': 1, 'has_image': 0, 'has_address': 0, 'has_biography': 0, 'followers_ge_30': 0, 'belongs_to_a_list': 0, 'url_in_profile': 1, 'ratio_friends_followers_ge_50': 1, 'default_image_after_2_month': 1, 'friends_ge_100': 1, 'no_bio': 1, 'no_location': 1, 'nb_friends': 359}


## Crawl retweet ratio through tweepy (You need tokens & key)

In [6]:
import tweepy

access_token = xxx                                                           
access_token_secret = xxx
consumer_key = xxx                                                            
consumer_secret = xxx 

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

In [7]:
dataset = list()
for idx, row in bot_features.iterrows() :
    instance = dict()
    acc_feat = dict()
    print(row["screen_name"])
    try:
        user = api.get_user(row["screen_name"])
    except:
        continue
    #user_info = api.lookup_users(user_ids=[user.id])

    statuses = api.user_timeline(user.screen_name) 
    tweets_len = list()
    tweets_id=list()
    for status in statuses: 
        tweets_id.append(status.id)
        tweets_len.append(len(status.text))

    num_retweets = 0
    for post_id in tweets_id:
        retweets_list = api.retweets(id=post_id) 
        print(len(retweets_list))
        # printing the screen names of the retweeters 
        num_retweets += len(retweets_list)
    num_posts = len(tweets_id)
    if num_posts == 0:
        retweet_ratio = 0
    else : 
        retweet_ratio = num_retweets/num_posts
    post_length_avg = sum(tweets_len)/len(tweets_len)
    acc_feat["retweet_ratio"] =retweet_ratio
    acc_feat["post_length"] = post_length_avg
    features = row["features"]
    features = {**features, **acc_feat}
    instance["id"]=row["id"]
    instance["screen_name"]=row["screen_name"]
    instance["features"]=features
    dataset.append(instance)
    
new_features = pd.DataFrame(dataset)
bot_features.append(dataset, ignore_index = True)
bot_features.to_pickle("bot_features.pkl")

guowei534


RateLimitError: [{'message': 'Rate limit exceeded', 'code': 88}]