In [1]:
import pandas as pd
from bloom_filter import BloomFilter
from sklearn.ensemble import RandomForestClassifier
from utils import read_data, flatten_features, clean_data, create_new_columns
import numpy as np

  # Never truncate long cell values when a frame is displayed.
  # `None` is the documented "no limit" value; the former `-1` sentinel is
  # deprecated since pandas 1.0 and rejected by newer releases.
  pd.set_option('display.max_colwidth', None)


In [4]:
# Load the recorded edits and flatten the nested 'length' / 'revision' fields.
file_name = 'experiment_big.json'
wiki_df = read_data(file_name)
print(wiki_df.columns)
wiki_df = flatten_features(wiki_df, 'length', ['old', 'new'])
wiki_df = flatten_features(wiki_df, 'revision', ['old', 'new'])

# create_new_columns(wiki_df)
wiki_df = clean_data(wiki_df)
print(wiki_df.columns)

# Columns fed to the classifier.
# NOTE(review): 'edit_length' / 'comment_length' are presumably produced by
# clean_data (create_new_columns is commented out above) -- confirm in utils.
important_columns = ['namespace', 'minor', 'edit_length', 'comment_length',
                     'length.old', 'length.new', 'revision.old', 'revision.new']

features = wiki_df[important_columns]
target = wiki_df.loc[:, ['bot']]

# Fixed seed so that "Restart & Run All" rebuilds the same forest.
RANDOM_SEED = 42
rf = RandomForestClassifier(random_state=RANDOM_SEED)
# `target` is a one-column frame; flatten it to the 1-D label vector that
# scikit-learn expects (a column vector triggers a DataConversionWarning).
rf_model = rf.fit(features, target.values.ravel())

# get bots from our prediction
# NOTE(review): predictions are made on the very rows the model was trained on.
# Force a boolean mask so `.loc` filters rows instead of looking up labels.
predicted_bool = rf_model.predict(features).astype(bool)
# Selecting with the list ['user'] yields a frame, so this is a 2-D (n, 1) array.
# NOTE(review): lookups later pass a plain user-name string to the filter --
# confirm train_bloom_filter handles these one-element rows the same way.
predicted_users = np.array(wiki_df.loc[predicted_bool, ['user']])

# train Bloom Filter
filter_size = 20000
hash_size = 8
bf = BloomFilter(filter_size, hash_size, debug=False)
bf.train_bloom_filter(predicted_users)

In [6]:
# Running confusion-matrix counters, shared with the streaming cell.
TP = FN = TN = FP = 0


def evaluate(is_bot, is_bot_predicted):
    """Record one observation and return the running quality metrics.

    Increments exactly one of the module-level counters TP / FN / FP / TN
    according to the true label and the predicted label, then recomputes the
    metrics over everything counted so far.

    Parameters:
        is_bot: truthy when the user really is a bot.
        is_bot_predicted: truthy when the user was predicted to be a bot.

    Returns:
        Tuple ``(accuracy, precision, recall)`` as fractions. Precision and
        recall stay at 0 until at least one true positive has been counted.
    """
    global TP, FN, TN, FP

    # Classify the observation into its confusion-matrix cell.
    if is_bot:
        if is_bot_predicted:
            TP += 1
        else:
            FN += 1
    elif is_bot_predicted:
        FP += 1
    else:
        TN += 1

    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total else 0

    # With TP > 0 both denominators are non-zero, so no extra guard is needed.
    if not TP:
        return accuracy, 0, 0
    return accuracy, TP / (TP + FP), TP / (TP + FN)

In [9]:
from sseclient import SSEClient as EventSource
import json
import time

# Compare the Bloom-filter prediction with the real 'bot' flag on live edits.
url = 'https://stream.wikimedia.org/v2/stream/recentchange'
wiki = 'enwiki' #Client side filter
count = 0

for event in EventSource(url):
    # Sample roughly one event in five: keep it only when the last decimal
    # digit of the hash is 0 or 1. str hashes are salted per interpreter run,
    # so the sampled subset differs after a kernel restart.
    if int(str(hash(event.data))[-1]) >= 2:
        continue

    # Only the JSON decoding is guarded: payloads that are not valid JSON are
    # skipped, while errors raised further down stay visible instead of being
    # silently swallowed.
    try:
        change = json.loads(event.data)
    except ValueError:
        continue

    # Ignore other wikis and events lacking the fields used below.
    if not isinstance(change, dict) or change.get('wiki') != wiki:
        continue
    if 'user' not in change or 'bot' not in change:
        continue

    user_name = change['user']
    is_bot = change['bot']
    print(f'User {user_name},\t is {"" if is_bot else " not"} a bot')
    is_bot_predicted = bf.is_in_bloom_filter(user_name)
    print(f'User {user_name},\t is {"" if is_bot_predicted else " not"} a bot \t\t(predicted)\n')
    count += 1
    accuracy, precision, recall = evaluate(is_bot, is_bot_predicted)

    # Report the running metrics after every tenth evaluated edit.
    if count % 10 == 0:
        print(f"""
                    count = {count}
                    TP = {TP}, FP = {FP}, FN = {FN}, TN = {TN}
                    Accuracy = {accuracy * 100: .2f} %
                    Precision = {precision * 100: .2f} %
                    Recall = {recall * 100: .2f} %
                    """)

User Prad Nelluru,	 is  not a bot
User Prad Nelluru,	 is  not a bot 		(predicted)

User Somedifferentstuff,	 is  not a bot
User Somedifferentstuff,	 is  not a bot 		(predicted)

User Dontbeliedto,	 is  not a bot
User Dontbeliedto,	 is  not a bot 		(predicted)

User SteveMc25,	 is  not a bot
User SteveMc25,	 is  not a bot 		(predicted)

User Tassedethe,	 is  not a bot
User Tassedethe,	 is  not a bot 		(predicted)

User Fh1,	 is  not a bot
User Fh1,	 is  not a bot 		(predicted)

User SingingSinatra4,	 is  not a bot
User SingingSinatra4,	 is  not a bot 		(predicted)

User Ser Amantio di Nicolao,	 is  not a bot
User Ser Amantio di Nicolao,	 is  not a bot 		(predicted)

User Ser Amantio di Nicolao,	 is  not a bot
User Ser Amantio di Nicolao,	 is  not a bot 		(predicted)

User 2607:FEA8:BD20:F28:BDBE:6360:436D:132C,	 is  not a bot
User 2607:FEA8:BD20:F28:BDBE:6360:436D:132C,	 is  not a bot 		(predicted)


                    count = 10
                    TP = 0, FP = 0, FN = 0, TN = 10
    

NameError: name 'save_json' is not defined