## Read datasets

In [1]:
import pandas as pd
import numpy as np
import datetime
import config as c
from collections import OrderedDict

In [2]:
compute_only_class_a = True
datasets_fake = ["FSF", "INT", "TWT"]
datasets_hum = ["TFP", "E13"]
datasets = datasets_hum

In [3]:
users = pd.read_csv(c.folder_datasets + datasets[0] + '/' + c.file_users)
tweets = pd.read_csv(c.folder_datasets + datasets[0] + '/' + c.file_tweets, dtype={"geo": str})
friends = pd.read_csv(c.folder_datasets + datasets[0] + '/' + c.file_friends)
followers = pd.read_csv(c.folder_datasets + datasets[0] + '/' + c.file_followers)
for dataset in datasets[1:]:
    users = pd.concat([users, pd.read_csv(c.folder_datasets + dataset + '/' + c.file_users)])
    tweets = pd.concat([tweets, pd.read_csv(c.folder_datasets + dataset + '/' + c.file_tweets, dtype={"geo": str})])
    friends = pd.concat([friends, pd.read_csv(c.folder_datasets + dataset + '/' + c.file_friends)])
    followers = pd.concat([followers, pd.read_csv(c.folder_datasets + dataset + '/' + c.file_followers)])  

## Compute features

In [4]:
def is_from_web():
    for n in tweets.source.unique():
        for src in ["iphone", "android", "foursquare", "instagram", "web"]:
            if src in n.lower():
                return 1
    return 0

import string
def use_ponctuation(user_id):
    # If one tweets use ponctuation return 1
    user_tweets = tweets.loc[tweets['user_id'] == user['id'], 'text']
    for tweet in user_tweets:
        for c in tweet:
            # See if the char is punctuation.
            if c in string.punctuation:
                return 1
        return 0


In [5]:
import time
import datetime
start_time = time.time()
SAMPLE_SIZE = 100
X = list()
y = list()
features_name = list()
nb_fake_acc =0
# Compute features for each Twitter account
dataset = list()
for index, user in users.iterrows():
    instance = dict()
    acc_feat = dict()
    #if index > SAMPLE_SIZE:
    #    break
    
    # Class A (Profile)
    #===================
    if int(user['followers_count']) == 0:
        acc_feat["friends/follower_ratio"] = 0
    else :
        acc_feat["friends/follower_ratio"] = int(user['friends_count'])/(int(user['followers_count'])**2)

    acc_feat['bot_in_biography'] = int(type(user['description']) is str and 'bot' in user['description'].lower())
    acc_feat['bot_in_name'] = int('bot' in user['screen_name'].lower())
    
    acc_feat['has_name'] = int(user['name'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['has_image'] = int(user['default_profile_image'] != 1)
    acc_feat['has_address'] = int(user['location'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['has_biography'] = int(user['description'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['followers_ge_30'] = int(user['followers_count'] >= 30)
    acc_feat['belongs_to_a_list'] = int(user['listed_count'] > 0)
    #acc_feat['nb_tweets_ge_50'] = int(tweets.loc[tweets['user_id'] == user['id']].size >= 50)
    acc_feat['url_in_profile'] = int(user['url'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
    #acc_feat['followers_2_times_ge_friends'] = int(2 * user['followers_count'] >= user['friends_count'])
    
    #acc_feat['ratio_friends_followers_around_100'] = int(user['followers_count'] > 0 and 80.0 <= float(user['friends_count']) / user['followers_count'] >= 120.0)
    #acc_feat['duplicate_profile_picture'] = int(users.loc[users['default_profile_image'] == user['default_profile_image']].size > 1)
    
    acc_feat['ratio_friends_followers_ge_50'] = int(user['followers_count'] > 0 and float(user['friends_count']) / user['followers_count'] >= 50)
    acc_feat['default_image_after_2_month'] = int(user['default_profile_image'] == 1 and (datetime.datetime.now() - datetime.datetime.strptime(user['created_at'],'%a %b %d %H:%M:%S +0000 %Y')) > datetime.timedelta(weeks=4)) 
    acc_feat['friends_ge_100'] = int(user['friends_count'] >= 100)
    acc_feat['no_bio'] = int(user['description'] in ['NULL', 'NaN', '', ' ', pd.np.nan])
    acc_feat['no_location'] = int(user['location'] in ['NULL', 'NaN', '', ' ', pd.np.nan])
    #acc_feat['no_tweets'] = int(tweets.loc[tweets['user_id'] == user['id']].size == 0)
    
    acc_feat['nb_friends'] = int(user['friends_count'])
    #acc_feat['nb_tweets'] = int(tweets.loc[tweets['user_id'] == user['id']].size)
    #acc_feat['ratio_friends_followers_square'] = float(user['friends_count']) / pow(user['followers_count'], 2) if user['followers_count'] > 0 else 0
    #acc_feat['age'] = (datetime.datetime.now() - datetime.datetime.strptime(user['created_at'],'%a %b %d %H:%M:%S +0000 %Y')).total_seconds()
    #acc_feat['following_rate'] = float(user['friends_count']) / acc_feat['age']
    
    # Class B (Timeline)
    #===================
    if not compute_only_class_a:
        #acc_feat['geo_localized'] = int(tweets.loc[tweets['user_id'] == user['id'] & (tweets['geo'] != '')].size)
        #acc_feat['is_favorite'] = int(tweets.loc[tweets['user_id'] == user['id'] & (tweets['favorite_count'] > 0)].size)
        #acc_feat['from_web'] = is_from_web()
        #acc_feat['use_ponctuation'] = use_ponctuation(user['id'])
        
        #acc_feat['same_sentence']
        acc_feat['from_API'] = tweets.loc[tweets['user_id'] == user['id'] & (tweets['source'] != '<a href="http://twitter.com/tweetbutton" rel="nofollow">Tweet Button</a>')].size > 0
        
    
    target = 1 if user['dataset'] in datasets_fake else 0
    nb_fake_acc = nb_fake_acc + target
    
    y.append(target)
    instance["id"] = user["id"]
    instance["screen_name"] = user["screen_name"]
    instance["features"] = acc_feat
    dataset.append(instance)
    features_name = list(acc_feat.keys())
nb_hum_acc = abs(len(y) - nb_fake_acc)
total_time = datetime.timedelta(seconds=time.time() - start_time)
print("Feature computation time : " + str(total_time))

  acc_feat['has_name'] = int(user['name'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
  acc_feat['has_address'] = int(user['location'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
  acc_feat['has_biography'] = int(user['description'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
  acc_feat['url_in_profile'] = int(user['url'] not in ['NULL', 'NaN', '', ' ', pd.np.nan])
  acc_feat['no_bio'] = int(user['description'] in ['NULL', 'NaN', '', ' ', pd.np.nan])
  acc_feat['no_location'] = int(user['location'] in ['NULL', 'NaN', '', ' ', pd.np.nan])


Feature computation time : 0:00:00.505649


In [6]:
print("TOTAL accounts: " + str(users.size))
print("# fake accounts: " + str(nb_fake_acc))
print("# human accounts: " + str(nb_hum_acc))
print("Features name: " + str(list(features_name)))
print(dataset[0])

TOTAL accounts: 66300
# fake accounts: 0
# human accounts: 1950
Features name: ['friends/follower_ratio', 'bot_in_biography', 'bot_in_name', 'has_name', 'has_image', 'has_address', 'has_biography', 'followers_ge_30', 'belongs_to_a_list', 'url_in_profile', 'ratio_friends_followers_ge_50', 'default_image_after_2_month', 'friends_ge_100', 'no_bio', 'no_location', 'nb_friends']
{'id': 24503, 'screen_name': 'Bonanzinga', 'features': {'friends/follower_ratio': 5.7370898357435176e-05, 'bot_in_biography': 0, 'bot_in_name': 0, 'has_name': 1, 'has_image': 1, 'has_address': 1, 'has_biography': 1, 'followers_ge_30': 1, 'belongs_to_a_list': 1, 'url_in_profile': 1, 'ratio_friends_followers_ge_50': 0, 'default_image_after_2_month': 0, 'friends_ge_100': 1, 'no_bio': 0, 'no_location': 0, 'nb_friends': 1466}}


In [7]:
import pandas as pd
                                                                     
original_df = pd.DataFrame(dataset)                                        
original_df.to_pickle("hum_features.pkl")

In [8]:
import pandas as pd

hum_features = pd.read_pickle("hum_features.pkl")
hum_number = len(hum_features)
print(hum_features)

              id     screen_name  \
0          24503      Bonanzinga   
1          22903         effeffe   
2         382393            ciro   
3         286543         abragad   
4         438023    fullcaffeine   
...          ...             ...   
1945  1127280169         frilaif   
1946  1156344000    Stronzetta__   
1947  1169114810  angelagervasi2   
1948  1212975186         Movie1O   
1949  1213937306  xjawaadscookie   

                                               features  
0     {'friends/follower_ratio': 5.7370898357435176e...  
1     {'friends/follower_ratio': 0.01113406795224977...  
2     {'friends/follower_ratio': 0.00062475783124930...  
3     {'friends/follower_ratio': 0.00061856862064978...  
4     {'friends/follower_ratio': 0.01483510975976477...  
...                                                 ...  
1945  {'friends/follower_ratio': 0.00286612426035502...  
1946  {'friends/follower_ratio': 0.11363636363636363...  
1947  {'friends/follower_ratio': 0, 'bot_in_b

## Crawl retweet ratio through tweepy (You need tokens & key)

In [9]:
import tweepy

access_token = xxx                                                          
access_token_secret = xxx
consumer_key = xxx                                                       
consumer_secret = xxx

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

In [10]:
dataset = list()

for idx, row in hum_features.iterrows() :
    instance = dict()
    acc_feat=dict()
    print(row["screen_name"])
    try :
        user = api.get_user(row["screen_name"])
    except :
        continue
    #user_info = api.lookup_users(user_ids=[user.id])

    statuses = api.user_timeline(user.screen_name) 
    
    tweets_len = list()
    tweets_id=list()
    for status in statuses:
        tweets_id.append(status.id)
        tweets_len.append(len(status.text))

    num_retweets = 0
    for post_id in tweets_id:
        retweets_list = api.retweets(id=post_id) 
        print(len(retweets_list))
        # printing the screen names of the retweeters 
        if len(retweets_list) > 0 :
            num_retweets += 1
        
    num_posts = len(tweets_id)
    if num_posts == 0:
        retweet_ratio = 0
    else : 
        retweet_ratio = num_retweets/num_posts
    post_length_avg = sum(tweets_len)/len(tweets_len)
    
    acc_feat["retweet_ratio"] =retweet_ratio
    acc_feat["post_length"] = post_length_avg
    features = row["features"]
    features = {**features, **acc_feat}
    instance["id"]=row["id"]
    instance["screen_name"]=row["screen_name"]
    instance["features"]=features
    dataset.append(instance)
        
new_features = pd.DataFrame(dataset)
hum_features.to_pickle("hum_features.pkl")

Bonanzinga


RateLimitError: [{'message': 'Rate limit exceeded', 'code': 88}]