In [1]:
import sys
import os
import traceback
import pickle
import requests


sys.path.append("../")

import pandas as pd
import numpy as np

# from scripts.wiki import Wiki,Revision

In [2]:
class Wiki:
    '''
    MAIN CLASS TO store all revisions for a wiki along with editors and timestamp.
    '''
    def __init__(self,title, revs, all_tokens=[]):
        #self.id = id
        self.title = title
        self.revisions = revs
        self.add_all_token(all_tokens)
        

           
    def add_all_token(self, all_tokens):
        
        for token in all_tokens:
            self.revisions.loc[token["o_rev_id"]].added.add(token["token_id"])
            for in_revision in token["in"]:
                self.revisions.loc[in_revision].added.add(token["token_id"])
            for out_revision in token["out"]:
                self.revisions.loc[out_revision].removed.add(token["token_id"])
                
    def create_change(self, from_rev_id, to_rev_id, to_rev_content, epsilon_size):
        try:
            from_rev = self.revisions[from_rev_id]
            to_rev = self.revisions[to_rev_id]
            from_rev.deleted(to_rev)
            to_rev.content = to_rev_content
            to_rev.inserted_continuous_pos()
            to_rev.inserted_neighbours()
            from_rev.create_change_object(to_rev)
            from_rev.append_neighbour_vec(to_rev, epsilon_size)
        except:
            print("exception occurred in calculating change object",traceback.format_exc())
            print("problem in ", to_rev_content.keys() )

In [3]:
class Revision:
    def __init__(self, id, timestamp,editor):
        self.id = id
        self.timestamp = timestamp
        self.editor = editor
        self.added = set()
        self.removed = set()   
        
    def deleted(self, to_rev):
        self.content["removed"] = pd.Series(np.isin( self.content["token_id"].values, list(to_rev.removed), assume_unique= True ))
        end_pos = np.argwhere(np.ediff1d(np.pad(self.content["removed"].astype(np.int), (1,1), mode="constant", constant_values=0)) == -1) -1 
        start_pos = np.argwhere(np.ediff1d(np.pad(self.content["removed"].astype(np.int), (1,1), mode="constant", constant_values=0)) == 1)
        start_neighbour = start_pos - 1
        end_neighbour = end_pos + 1
        self.deleted_object = pd.DataFrame(np.c_[ start_pos, end_pos, start_neighbour, end_neighbour ],
                                       columns=[ "del_start_pos", "del_end_pos", "left_neigh", "right_neigh",])
    
    def inserted_continuous_pos(self):
        self.content["added"] = pd.Series(np.isin( self.content["token_id"].values, list(self.added), assume_unique= True))
        end_pos = np.argwhere(np.ediff1d(np.pad(self.content["added"].astype(np.int), (1,1), mode="constant", constant_values=0)) == -1) -1 
        start_pos = np.argwhere(np.ediff1d(np.pad(self.content["added"].astype(np.int), (1,1), mode="constant", constant_values=0)) == 1)
        self.added_pos = np.c_[start_pos, end_pos]

    def inserted_neighbours(self):
        start_token_pos = self.added_pos[:,0] - 1
        end_token_pos = self.added_pos[:,1] + 1
        self.start_token_id = self.content["token_id"].values[start_token_pos]
        self.end_token_id = self.content["token_id"].values[end_token_pos]
    
    def create_change_object(self, to_rev):
        self.ins_left = np.argwhere(np.isin(self.content.token_id.values, to_rev.start_token_id, assume_unique= True))
        self.ins_right = np.argwhere(np.isin(self.content.token_id.values, to_rev.end_token_id, assume_unique= True))
        self.inserted_object = pd.DataFrame(np.concatenate([to_rev.added_pos, self.ins_left, self.ins_right], axis=1),
                                       columns=["ins_start_pos", "ins_end_pos", "left_neigh", "right_neigh" ])

        self.change = pd.merge(self.inserted_object, self.deleted_object,how="outer", on=["left_neigh", "right_neigh"])
        self.change.fillna(-1, inplace=True)
        
    def append_neighbour_vec(self, to_rev, epsilon_size):
        self.wiki_who_tokens = self.content.token_id.values
        del self.content
        neighbour_df = self.change.apply(find_tokens, axis=1, args=(self, to_rev, epsilon_size))
        neighbour_df.columns= ["ins_tokens", "del_tokens", "left_neigh_slice", "right_neigh_slice", "left_token", "right_token"]
        self.change_df = pd.concat([self.change, neighbour_df], sort=False, axis=1)
        



In [4]:
def find_tokens(change, revision, to_rev, epsilon_size):
    start_left = (int(change["left_neigh"]) - epsilon_size)
    if start_left <0:
        start_left = 0
    left_neigh = slice( start_left, int(change["left_neigh"]) + 1)
    
    end_right = (int(change["right_neigh"]) + epsilon_size+1)
    if end_right >= revision.wiki_who_tokens.size:
        end_right = revision.wiki_who_tokens.size - 1
    right_neigh = slice(int(change["right_neigh"]), end_right )
    if(change["ins_start_pos"]==-1):
        ins_tokens = []
    else:
        ins_slice = slice(int(change["ins_start_pos"]), int(change["ins_end_pos"]+1) )
        ins_tokens = to_rev.content.token_id.values[ins_slice]
    if(change["del_start_pos"] == -1):
        del_tokens = []
    else:
        del_slice = slice(int(change["del_start_pos"]), int(change["del_end_pos"]+1) )
        del_tokens = revision.wiki_who_tokens[del_slice]
    left_token = revision.wiki_who_tokens[left_neigh]
    right_token = revision.wiki_who_tokens[right_neigh]
    return pd.Series([tuple(ins_tokens), tuple(del_tokens), left_neigh, right_neigh, tuple(left_token), tuple(right_token)])

In [5]:
baseurl = "https://api.wikiwho.net/en/api/v1.0.0-beta/"
article_name = "bioglass"
filename = article_name + ".h5"
content_dir = "../data/content/"
change_object_dir =  "../data/change objects/"
filepath = os.path.join(content_dir, filename)

epsilon_size = 30

In [6]:
len_file = article_name + "_rev_len.h5"
len_file_path = os.path.join(content_dir, len_file)

with pd.HDFStore(filepath, 'r') as store:
    #retrieving all rev list and change object from file
    rev_list = store.get("rev_list")["id"].values.tolist()
    keys = ["r" +  str(rev) for rev in rev_list]
    rev_len_list = [store.get(key).shape[0] for key in keys]
rev_len_df = pd.DataFrame({"rev_id":rev_list[:-1], "length": rev_len_list[:-1]})

rev_len_df.to_hdf(len_file_path, "rev_len")

In [7]:
# %%time
# with pd.HDFStore(filepath, 'r') as store:
#     #retrieving all rev list and change object from file
#     rev_list = store.get("rev_list")
#     all_rev = store.get("all_tokens")
#     all_tokens = all_rev.to_dict(orient="records")
#     #making revision objects
#     revs = rev_list.apply(lambda rev: Revision(rev["id"],rev["timestamp"], rev["editor"]),axis=1)
#     revs.index = rev_list.id
#     from_rev_id = revs.index[0]
    
#     wiki = Wiki(2345, content, revs, all_tokens)
#     wiki.revisions.iloc[0].content = store["r"+str(from_rev_id)] 
#     for to_rev_id in list(revs.index[1:]):
#         key="r"+str(to_rev_id)
#         to_rev_content = store[key]
#         wiki
#         wiki.create_change(from_rev_id, to_rev_id, to_rev_content, epsilon_size)
#         from_rev_id = to_rev_id

In [8]:
# save_filepath = os.path.join(change_object_dir, content+".pkl")
# with open(save_filepath, "wb") as file:
#     pickle.dump(wiki, file)

### saving change object for all the articles in the list

In [9]:
def create_change_object(article_name, content_dir = "../data/content/", 
                            change_object_dir =  "../data/change objects/", epsilon_size=30, save=False):
    
    content_filepath = os.path.join(content_dir, article_name+".h5")
    change_object_filepath = os.path.join(change_object_dir, article_name+".pkl")
    
    with pd.HDFStore(content_filepath, 'r') as store:
        #retrieving all rev list and change object from file
        rev_list = store.get("rev_list")
        all_rev = store.get("all_tokens")
        all_tokens = all_rev.to_dict(orient="records")
        
        #making revision objects
        revs = rev_list.apply(lambda rev: Revision(rev["id"],rev["timestamp"], rev["editor"]),axis=1)
        revs.index = rev_list.id
        
        # Getting first revision object and adding content ot it
        from_rev_id = revs.index[0]
        wiki = Wiki(article_name, revs, all_tokens)

        wiki.revisions.iloc[0].content = store["r"+str(from_rev_id)] 
        # adding content to all other revision and finding change object between them.
        
        for to_rev_id in list(revs.index[1:]):
            key="r"+str(to_rev_id)
            to_rev_content = store[key]
            wiki.create_change(from_rev_id, to_rev_id, to_rev_content, epsilon_size)
            from_rev_id = to_rev_id
         
    if save:
        with open(change_object_filepath, "wb") as file:
            pickle.dump(wiki, file)
        
    return wiki

    

In [10]:
article_name = 'bioglass'
content_dir = "../data/content/"
change_object_dir = "../data/change objects/"
epsilon_size=30
save=False

content_filepath = os.path.join(content_dir, article_name+".h5")
change_object_filepath = os.path.join(change_object_dir, article_name+".pkl")

with pd.HDFStore(content_filepath, 'r') as store:
    #retrieving all rev list and change object from file
    rev_list = store.get("rev_list")
    all_rev = store.get("all_tokens")
    all_tokens = all_rev.to_dict(orient="records")

    #making revision objects
    revs = rev_list.apply(lambda rev: Revision(rev["id"],rev["timestamp"], rev["editor"]),axis=1)
    revs.index = rev_list.id

    # Getting first revision object and adding content ot it
    from_rev_id = revs.index[0]
    wiki = Wiki(article_name, revs, all_tokens)

    wiki.revisions.iloc[0].content = store["r"+str(from_rev_id)] 
    # adding content to all other revision and finding change object between them.

    for to_rev_id in list(revs.index[1:]):
        key="r"+str(to_rev_id)
        to_rev_content = store[key]
        wiki.create_change(from_rev_id, to_rev_id, to_rev_content, epsilon_size)
        from_rev_id = to_rev_id

if save:
    with open(change_object_filepath, "wb") as file:
        pickle.dump(wiki, file)


- article_name: name of the article
- rev_list: ?
- revs: is related to rev_list
- all_rev: ?
- all_tokens: depens on all_rev?
- key: seems to be a revision
- store[key]: it is the same as to_revision_content


In [14]:
all_tokens

[{'in': [758323485],
  'o_rev_id': 18064039,
  'out': [758323388],
  'str': 'bioglass',
  'token_id': 0},
 {'in': [],
  'o_rev_id': 18064039,
  'out': [207995408],
  'str': '®',
  'token_id': 1},
 {'in': [], 'o_rev_id': 18064039, 'out': [], 'str': 'is', 'token_id': 2},
 {'in': [], 'o_rev_id': 18064039, 'out': [], 'str': 'a', 'token_id': 3},
 {'in': [],
  'o_rev_id': 18064039,
  'out': [18704296],
  'str': 'commerical',
  'token_id': 4},
 {'in': [],
  'o_rev_id': 18064039,
  'out': [18907606],
  'str': 'product',
  'token_id': 5},
 {'in': [], 'o_rev_id': 18064039, 'out': [], 'str': 'of', 'token_id': 6},
 {'in': [],
  'o_rev_id': 18064039,
  'out': [779393082],
  'str': 'bioactive',
  'token_id': 7},
 {'in': [],
  'o_rev_id': 18064039,
  'out': [18834333],
  'str': 'glasses',
  'token_id': 8},
 {'in': [],
  'o_rev_id': 18064039,
  'out': [363575227],
  'str': '.',
  'token_id': 9},
 {'in': [],
  'o_rev_id': 18064039,
  'out': [363575227],
  'str': 'it',
  'token_id': 10},
 {'in': [],
  '

In [19]:
from wikiwho_wrapper import WikiWho
ww = WikiWho()
ww.api.all_content("Bioglass")

{'article_title': 'Bioglass',
 'page_id': 2161298,
 'success': True,
 'message': None,
 'threshold': 0,
 'all_tokens': [{'str': 'bioglass',
   'o_rev_id': 18064039,
   'editor': '0|81.172.143.232',
   'token_id': 0,
   'in': [758323485],
   'out': [758323388]},
  {'str': '®',
   'o_rev_id': 18064039,
   'editor': '0|81.172.143.232',
   'token_id': 1,
   'in': [],
   'out': [207995408]},
  {'str': 'is',
   'o_rev_id': 18064039,
   'editor': '0|81.172.143.232',
   'token_id': 2,
   'in': [],
   'out': []},
  {'str': 'a',
   'o_rev_id': 18064039,
   'editor': '0|81.172.143.232',
   'token_id': 3,
   'in': [],
   'out': []},
  {'str': 'commerical',
   'o_rev_id': 18064039,
   'editor': '0|81.172.143.232',
   'token_id': 4,
   'in': [],
   'out': [18704296]},
  {'str': 'product',
   'o_rev_id': 18064039,
   'editor': '0|81.172.143.232',
   'token_id': 5,
   'in': [],
   'out': [18907606]},
  {'str': 'of',
   'o_rev_id': 18064039,
   'editor': '0|81.172.143.232',
   'token_id': 6,
   'in': [

In [None]:
#wiki = create_change_object('bioglass', save=False)

In [None]:
# for article in article_series[19:]:
#     print(article)
#     create_change_object(article)


### Saving change_object as dataframe

In [11]:
change_objects = []
wiki.revisions.iloc[:-1].apply(lambda revision: change_objects.append(revision.change_df))
# change_index = [ rev.id for rev in  wiki.revisions[1:].tolist()]
# change_df = pd.concat(change_objects, sort=False, keys=change_index, axis=)


timestamp_s = pd.to_datetime([ rev.timestamp for rev in  wiki.revisions.values.ravel().tolist()])
time_gap = pd.to_timedelta(timestamp_s[1:]-timestamp_s[:-1])

rev_ids = [ rev.id for rev in  wiki.revisions.tolist()]
from_rev_ids = rev_ids[:-1]
to_rev_ids= rev_ids[1:]

editor_s = [ rev.editor for rev in  wiki.revisions.tolist()]

index = list(zip(*[from_rev_ids, to_rev_ids, timestamp_s.tolist()[1:], time_gap, editor_s[1:]]))
change_df = pd.concat(change_objects, sort=False, keys=index, names=["from revision id", "to revision id", "timestamp", "timegap", "editor"])

In [None]:
# %%time
# change_object_dir =  "../data/change objects/"
# change_dataframe_path = os.path.join(change_object_dir, article_name+"_change.pkl")
# a=change_df.to_pickle(change_dataframe_path)

In [None]:
# %%time
# change_object_dir =  "../data/change objects/"
# change_dataframe_path = os.path.join(change_object_dir, article_name+"_change.pkl")
# a=pd.read_pickle(change_dataframe_path)

In [13]:
try:
    os.makedirs('../data/change objects')
except:
    pass

change_dataframe_path = os.path.join(change_object_dir, article_name+"_change.h5")
change_df.to_hdf(change_dataframe_path, key="data")

  check_attribute_name(name)
  check_attribute_name(name)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['ins_tokens', 'del_tokens', 'left_neigh_slice', 'right_neigh_slice', 'left_token', 'right_token']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [None]:
change_df.head(50)