In [1]:
import pandas as pd
import json
from sseclient import SSEClient as EventSource
from bloom_filter import BloomFilter
import time

## Collect data in stream

In [2]:
url = 'https://stream.wikimedia.org/v2/stream/recentchange'
wiki = 'enwiki' #Client side filter
counter = 0
changes = []

start = time.time()
duration = 28800

for event in EventSource(url):
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
        if change['wiki'] == wiki:
            counter += 1

            username_hash = hash(change['user'])
            # sample 20% changes
            if username_hash % 10 < 2:
                changes.append(change)

            current_time = time.time()
            runtime = current_time - start
            if runtime >= duration:
                break

In [None]:
train = pd.json_normalize(changes)
train.to_pickle('data/train.pkl')
train.shape

In [4]:
train = pd.read_pickle('data/train.pkl')

## Train bloom filter

In [5]:
bots = train.loc[train['bot']==True, 'comment'].unique().tolist()

In [6]:
bloom_filter = BloomFilter(n_elements=len(bots), fp_prob=0.1)

Initiazied filter of size 11990


In [7]:
for bot in bots:
    bloom_filter.insert(bot)

## Test bloom filter

In [8]:
url = 'https://stream.wikimedia.org/v2/stream/recentchange'
wiki = 'enwiki' #Client side filter
counter = 0
changes = []
maxEvents =  1000 # get n events and stop
for event in EventSource(url):
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
        if change['wiki'] == wiki:
            counter += 1

            bot_bloom = bloom_filter.check(change['comment'])
            change['bot_bloom'] = bot_bloom
            changes.append(change)

            if counter > maxEvents:
                break

In [9]:
test = pd.json_normalize(changes)
test.to_pickle('test.pkl')

In [10]:
test = pd.read_pickle('test.pkl')

In [11]:
bot_count = test[(test['bot']==True)].shape[0]
bloom_bot_count = test[(test['bot_bloom']==True)].shape[0]
tp_count = test[(test['bot']==True) & (test['bot_bloom']==True)].shape[0]
print(f'True bots: {bot_count}\nBloom filtered bots: {bloom_bot_count}\nTrue positives: {tp_count}')

True bots: 166
Bloom filtered bots: 337
True positives: 129


In [12]:
accuracy = test[test['bot'] == test['bot_bloom']].shape[0] / test.shape[0]
fp_rate = test[(test['bot']==False) & (test['bot_bloom']==True)].shape[0] / test.shape[0]
fn_rate = test[(test['bot']==True) & (test['bot_bloom']==False)].shape[0] / test.shape[0] # should be 0
fp_rate, fn_rate
print(f'Accuracy: {accuracy}\nFP rate: {fp_rate}\nFN rate: {fn_rate}')

Accuracy: 0.7552447552447552
FP rate: 0.2077922077922078
FN rate: 0.03696303696303696
