In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

import tqdm
import pathlib

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load both pattern files as lists of lines (surrounding whitespace stripped first,
# so a trailing newline does not produce an empty last entry).
# `Path.read_text` opens, reads and closes the file, so no handle is left dangling
# the way a bare `open(...).read()` leaves one.
normal_data = pathlib.Path('vews/normal_user_patterns_new.csv').read_text().strip().split('\n')
vandal_data = pathlib.Path('vews/vandal_user_patterns_new.csv').read_text().strip().split('\n')

In [3]:
# Sanity check: number of lines read from the normal file, then from the vandal file.
print(*map(len, (normal_data, vandal_data)))

16549 17027


In [4]:
# Feature columns produced for every parsed action, in output order.
columns = [
    'type',
    'meta',
    'consecutive',
    'reversion',
    'threehop',
    'common',
    'fast',
]

In [5]:
# Vocabulary (allowed single-character symbols) for each entry of `columns`, in the same order.
categories = [list(vocab) for vocab in ('frn', 'mn', 'cnx', 'rnx', 'tmNx', 'zoux', 'vfsx')]

# Per-column lookup: symbol -> integer id, where the id is the symbol's position in its vocabulary.
categories2id = {
    name: {symbol: idx for idx, symbol in enumerate(vocab)}
    for name, vocab in zip(columns, categories)
}

In [6]:
def parse_action(actions):
    """Convert a sequence of action codes into a DataFrame of integer category ids.

    Each code is an indexable sequence of symbols. Position 0 is the action
    type and position 1 the meta flag; the remaining positions depend on the type:

    * ``'f'`` (first): no further symbols, total length 2.
    * ``'r'`` (re-edit): ``consecutive``, ``reversion``, ``fast``, total length 5.
    * ``'n'`` (new): ``threehop``, ``common``, ``fast``, total length 5.

    Fields that a code does not set are filled with ``'x'``; every symbol is
    then replaced by its id from the module-level ``categories2id``.

    Args:
        actions: iterable of action codes (e.g. strings).

    Returns:
        DataFrame with one row per code and the module-level ``columns`` as columns.

    Raises:
        AssertionError: a code of a known type has the wrong length.
        IndexError: a code has fewer than two symbols.
        KeyError: a symbol is not in the vocabulary of its column.
    """
    # Names of the fields stored from position 2 onwards, keyed by action type.
    trailing_fields = {
        'f': (),  # first
        'r': ('consecutive', 'reversion', 'fast'),  # re-edit
        'n': ('threehop', 'common', 'fast'),  # new
    }

    records = []
    for code in actions:
        record = {'type': code[0], 'meta': code[1]}
        fields = trailing_fields.get(code[0])
        # Codes of any other type keep only `type` and `meta`.
        if fields is not None:
            assert len(code) == 2 + len(fields)
            record.update(zip(fields, code[2:]))
        records.append(record)

    # Unset fields come out as NaN; 'x' is the placeholder symbol for them.
    df = pd.DataFrame(records, columns=columns).fillna('x')
    for col in columns:
        df[col] = df[col].apply(categories2id[col].__getitem__)
    return df

In [7]:
# Output layout: everything goes under `vews_all/`, per-line CSVs under `vews_all/records/`.
path = pathlib.Path('vews_all')
record_path = path.joinpath('records')
# Create the whole directory tree up front; a no-op when it already exists.
record_path.mkdir(parents=True, exist_ok=True)

In [8]:
def save_records(lines, prefix):
    """Parse every line and write it to its own CSV under `record_path`.

    Line number `i` of `lines` is split on commas, parsed with `parse_action`
    and saved as `<prefix><i>.csv` (with a header row, without the index).

    Args:
        lines: list of comma-separated action strings, one entry per output file.
        prefix: file-name prefix, e.g. 'normal' or 'vandal'.
    """
    # Wrap the list itself (not `enumerate(...)`) so tqdm knows the total and can
    # render a real progress bar with an ETA instead of a bare iteration counter.
    for i, line in enumerate(tqdm.tqdm(lines, desc=prefix)):
        df = parse_action(line.split(','))
        df.to_csv(record_path / f'{prefix}{i}.csv', header=True, index=False)


save_records(normal_data, 'normal')
save_records(vandal_data, 'vandal')

16549it [00:26, 614.38it/s]
17027it [00:26, 636.95it/s]


Save the info files (`categories.txt` and `info.csv`)

In [9]:
# One line per column: the column name followed by its vocabulary, all comma-separated.
lines = [
    f"{name},{','.join(map(str, cats))}"
    for name, cats in zip(columns, categories)
]
# Lines are newline-separated, with no newline after the last one.
with open(path / 'categories.txt', 'w') as fh:
    fh.write('\n'.join(lines))

In [10]:
# One row per column: name, the literal kind 'cat', and the vocabulary size.
lines = [
    [name, 'cat', size]
    for name, size in zip(columns, map(len, categories))
]
df_info = pd.DataFrame(lines)
# Written without a header row and without the index.
df_info.to_csv(path / 'info.csv', header=None, index=False)

Save the ground-truth labels (0 = normal, 1 = vandal)

In [11]:
# Ground-truth labels indexed by record name (`normal{i}` / `vandal{i}`, matching the
# stems of the per-line CSV files): 0 for normal entries, 1 for vandal entries.
# The labels get their own names so the raw `normal_data` / `vandal_data` lists are
# not overwritten; the record-writing cell can then still be re-run after this one.
normal_labels = pd.Series(
    np.zeros(len(normal_data)),
    index=[f'normal{i}' for i in range(len(normal_data))],
)
vandal_labels = pd.Series(
    np.ones(len(vandal_data)),
    index=[f'vandal{i}' for i in range(len(vandal_data))],
)

# `axis` has to be passed by keyword: pandas 2.0 made every `concat` argument after
# `objs` keyword-only, so the positional form `pd.concat([...], 0)` raises a TypeError.
groundtruth = pd.concat([normal_labels, vandal_labels], axis=0).astype(int)

# Two columns, no header row: record name, label.
groundtruth.to_csv(path / 'groundtruth.csv', header=None, index=True)

In [12]:
# Mean of the 0/1 labels, i.e. the fraction of entries labelled 1.
groundtruth.mean()

0.5071181796521325