In [1]:
import requests
import numpy as np
import pandas as pd
import os
import traceback

In [2]:
def get_contents(baseurl, content, start_rev_id, end_rev_id=""):
    """Fetch the content of one revision, or of a range of revisions, of an article.

    A GET request is sent to ``<baseurl>/rev_content/<content>/<start_rev_id>/``;
    when ``end_rev_id`` is given, ``<end_rev_id>/`` is appended to that path.
    The query parameters switch ``token_id`` on and ``o_rev_id``, ``editor``,
    ``in`` and ``out`` off.

    Parameters
    ----------
    baseurl : str
        Root URL of the API (with or without a trailing slash).
    content : str
        Article title, used as a path segment of the URL.
    start_rev_id : int or str
        Revision id placed first in the URL path.
    end_rev_id : int or str, optional
        Revision id placed second in the URL path; omitted when falsy.

    Returns
    -------
    list or None
        The value stored under ``"revisions"`` in the JSON response, or
        ``None`` when the request failed, the status code was not OK, or the
        response had no ``"revisions"`` key. In those cases the traceback is
        printed instead of being raised.
    """
    # URLs always use "/" as separator; os.path.join would insert "\\" on Windows.
    segments = [baseurl.rstrip("/"), "rev_content", content, str(start_rev_id)]
    if end_rev_id:
        segments.append(str(end_rev_id))
    content_url = "/".join(segments) + "/"

    params = { "o_rev_id": "false", "editor": "false", "token_id": "true", "in": "false", "out": "false" }
    try:
        response = requests.get(content_url, params=params)
        if response.status_code != requests.codes.ok:
            print(content_url)
            # status_code is an int: convert it explicitly, otherwise building
            # the message itself fails with a TypeError.
            raise AttributeError("Server returned bad code\t" + str(response.status_code))
        payload = response.json()
        if "revisions" not in payload:
            raise AttributeError(
                "Server did not return revisions key it returned \t" + str(list(payload.keys()))
            )
        return payload["revisions"]
    except Exception:
        # Best effort: report the problem and let the caller decide what to do
        # with the missing result. KeyboardInterrupt/SystemExit still propagate.
        print(traceback.format_exc())
        return None

In [3]:
def tokens_to_df(tokens):
    """Build a DataFrame of tokens framed by a start and an end sentinel row.

    Parameters
    ----------
    tokens : iterable of dict
        Token records; each becomes one row of the result.

    Returns
    -------
    pandas.DataFrame
        One row per token, preceded by ``{'token_id': -1, 'str': "{st@rt}"}``
        and followed by ``{'token_id': -2, 'str': "{$nd}"}``.
    """
    start_marker = {'token_id': -1, 'str': "{st@rt}"}
    end_marker = {'token_id': -2, 'str': "{$nd}"}
    # Build a new list instead of inserting into `tokens`: the caller's list
    # stays untouched, so converting the same list twice gives the same frame.
    return pd.DataFrame([start_marker, *tokens, end_marker])

In [4]:
def save_content(revison_series, filename, content, step=200, baseurl="https://api.wikiwho.net/en/api/v1.0.0-beta/"):
    """Download the tokens of the given revisions in chunks and store them in an HDF5 file.

    The revision ids are walked ``step`` positions at a time. For every chunk
    ``get_contents`` is called with the ids at both ends of the chunk; after
    the ranges, the last revision id is requested once more on its own. Every
    revision returned is converted with ``tokens_to_df`` and written to the
    store under the key ``"r" + <revision id>``.

    Failures are not raised: the traceback is printed (prefixed with
    ``"problem "``) and the function stops; revisions written before the
    failure stay in the file.

    Parameters
    ----------
    revison_series : pandas.Series
        Revision ids, in the order they should be requested.
    filename : str
        Path of the HDF5 file, opened in append mode.
    content : str
        Article title passed on to ``get_contents``.
    step : int, optional
        Number of positions between two chunk boundaries.
    baseurl : str, optional
        API root passed on to ``get_contents``.
    """
    # Work on positions, not index labels, so a Series with a non-default
    # index (e.g. a filtered one) is handled as well.
    rev_ids = [str(rev_id) for rev_id in revison_series.tolist()]
    if not rev_ids:
        return  # nothing to download

    with pd.HDFStore(filename, 'a') as store:
        try:
            last = len(rev_ids) - 1
            stops = list(range(0, len(rev_ids), step))
            # (from, to) positions of every range request, ending with the
            # remainder chunk that reaches the last revision.
            # NOTE(review): the first span is (0, 0), i.e. the first id is
            # requested as a one-element range before the real chunks start.
            # Kept as is because the API's range semantics are not visible
            # here - confirm whether it is needed.
            spans = list(zip([0] + stops[:-1], stops)) + [(stops[-1], last)]
            id_args_list = [(rev_ids[lo], rev_ids[hi]) for lo, hi in spans]
            id_args_list.append((rev_ids[last],))  # last revision on its own

            for id_args in id_args_list:
                rev_contents = get_contents(baseurl, content, *id_args)
                if rev_contents is None:
                    # get_contents already printed the cause; fail with a
                    # message naming the request rather than a TypeError.
                    raise RuntimeError("no content returned for revision(s) " + " - ".join(id_args))
                for rev_content in rev_contents:
                    # Only the first key/value pair of each entry is used.
                    rev_id, payload = next(iter(rev_content.items()))
                    # format="fixed" is the current spelling of the legacy table=False.
                    store.put("r" + rev_id, tokens_to_df(payload["tokens"]), format="fixed")
        except Exception:
            print("problem ", traceback.format_exc())

In [5]:
def save_article(article_name, baseurl="https://api.wikiwho.net/en/api/v1.0.0-beta/", save_dir = "../data/content", step=200):
    """Download one article's revision list, token table and per-revision content into an HDF5 file.

    The file is ``<save_dir>/<article_name>.h5``. It receives

    * ``rev_list``: the ``"revisions"`` entry of the ``rev_ids`` endpoint,
    * ``all_tokens``: the ``"all_tokens"`` entry of the ``all_content`` endpoint,
    * one entry per revision, written by ``save_content``.

    Parameters
    ----------
    article_name : str
        Article title, used in the request URLs and as the file name stem.
    baseurl : str, optional
        API root (with or without a trailing slash).
    save_dir : str, optional
        Directory of the output file; created when missing.
    step : int, optional
        Chunk size handed to ``save_content``.

    Raises
    ------
    requests.HTTPError
        When one of the two requests made here returns an error status.
    KeyError
        When a response lacks the expected ``"revisions"`` / ``"all_tokens"``
        key, or the revision list has no ``"id"`` column.
    """
    # URLs always use "/" as separator; os.path.join would insert "\\" on Windows.
    api_root = baseurl.rstrip("/")
    save_path = os.path.join(save_dir, article_name + ".h5")

    revisions_url = "/".join([api_root, "rev_ids", article_name]) + "/"
    response = requests.get(revisions_url, params={"editor": "true", "timestamp": "true"})
    response.raise_for_status()  # fail here instead of on an opaque JSON/KeyError below
    rev_list_df = pd.DataFrame(response.json()["revisions"])

    all_content_url = "/".join([api_root, "all_content", article_name]) + "/"
    params = { "o_rev_id": "true", "editor": "false", "token_id": "true", "in": "true", "out": "true" }
    all_rev_data = requests.get(all_content_url, params=params)
    all_rev_data.raise_for_status()
    all_tokens_df = pd.DataFrame(all_rev_data.json()["all_tokens"])

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    with pd.HDFStore(save_path, 'a') as store:
        # format="fixed" is the current spelling of the legacy table=False.
        store.put("rev_list", rev_list_df, format="fixed")
        store.put("all_tokens", all_tokens_df, format="fixed")

    # Forward baseurl: otherwise the revision content would always be fetched
    # from save_content's default API root, whatever the caller asked for.
    save_content(rev_list_df["id"], save_path, article_name, step=step, baseurl=baseurl)

In [6]:
def tokens_to_list(tokens):
    """Collect the ``token_id`` of every token, framed by the sentinels -1 and -2.

    Despite the name, the result is a single-column ``pandas.DataFrame``
    holding ``-1``, the token ids in input order, then ``-2``.
    """
    framed_ids = [-1]
    for tok in tokens:
        framed_ids.append(tok["token_id"])
    framed_ids.append(-2)
    return pd.DataFrame(framed_ids)

In [7]:
def save_content_list(revison_series, filename, content, step=200, baseurl="https://api.wikiwho.net/en/api/v1.0.0-beta/"):
    """Download the token ids of the given revisions in chunks and store them in an HDF5 file.

    Same procedure as ``save_content``, except that every revision is
    converted with ``tokens_to_list`` (token ids only). The revision ids are
    walked ``step`` positions at a time; ``get_contents`` is called with the
    ids at both ends of each chunk, and finally with the last revision id on
    its own. Each revision is written under the key ``"r" + <revision id>``.

    Failures are not raised: the traceback is printed (prefixed with
    ``"problem "``) and the function stops; revisions written before the
    failure stay in the file.

    Parameters
    ----------
    revison_series : pandas.Series
        Revision ids, in the order they should be requested.
    filename : str
        Path of the HDF5 file, opened in append mode.
    content : str
        Article title passed on to ``get_contents``.
    step : int, optional
        Number of positions between two chunk boundaries.
    baseurl : str, optional
        API root passed on to ``get_contents``.
    """
    # Work on positions, not index labels, so a Series with a non-default
    # index (e.g. a filtered one) is handled as well.
    rev_ids = [str(rev_id) for rev_id in revison_series.tolist()]
    if not rev_ids:
        return  # nothing to download

    with pd.HDFStore(filename, 'a') as store:
        try:
            last = len(rev_ids) - 1
            stops = list(range(0, len(rev_ids), step))
            # (from, to) positions of every range request, ending with the
            # remainder chunk that reaches the last revision.
            # NOTE(review): the first span is (0, 0), i.e. the first id is
            # requested as a one-element range before the real chunks start.
            # Kept as is because the API's range semantics are not visible
            # here - confirm whether it is needed.
            spans = list(zip([0] + stops[:-1], stops)) + [(stops[-1], last)]
            id_args_list = [(rev_ids[lo], rev_ids[hi]) for lo, hi in spans]
            id_args_list.append((rev_ids[last],))  # last revision on its own

            for id_args in id_args_list:
                rev_contents = get_contents(baseurl, content, *id_args)
                if rev_contents is None:
                    # get_contents already printed the cause; fail with a
                    # message naming the request rather than a TypeError.
                    raise RuntimeError("no content returned for revision(s) " + " - ".join(id_args))
                for rev_content in rev_contents:
                    # Only the first key/value pair of each entry is used.
                    rev_id, payload = next(iter(rev_content.items()))
                    # format="fixed" is the current spelling of the legacy table=False.
                    store.put("r" + rev_id, tokens_to_list(payload["tokens"]), format="fixed")
        except Exception:
            print("problem ", traceback.format_exc())

In [8]:
%%time
baseurl = "https://api.wikiwho.net/en/api/v1.0.0-beta/"
content = "Bioglass"
save_dir = "../data/content"
params = {"editor": "true", "timestamp": "true"}
filename = content + ".h5"
save_path = os.path.join(save_dir, filename)
revisions_url = os.path.join( baseurl, "rev_ids", content+"/")
response = requests.get(revisions_url, params= params)
revisons_list = response.json()["revisions"]
rev_list_df = pd.DataFrame(revisons_list)
# print(save_path)
# save_content_list(rev_list_df["id"], save_path, content, step=200)

CPU times: user 19.7 ms, sys: 4.04 ms, total: 23.7 ms
Wall time: 455 ms


In [9]:
%%time
# Scratch cell: only the empty dict below is executed; everything else is
# commented-out experimentation.
rev_contents = {}
# (disabled) Fetch the revisions at list positions 50-159 one at a time, keep
# only their token ids, then write the revision list to the HDF5 file.
# for revision in revisons_list[50:160]:
#     rev_id = str(revision["id"])
#     response = get_contents(baseurl, content, rev_id)
#     rev_contents[revision["id"]] = [ token["token_id"] for token in response[0][ rev_id ]["tokens"] ]
# with pd.HDFStore(filename, 'a') as store:
#     store.put("rev_list", rev_list_df, table=False)

# (disabled) Timing runs of save_content with chunk sizes 200, 50 and 20.
# %time save_content(rev_list_df["id"], filename, content, step=200)

# %time save_content(rev_list_df["id"], filename, content, step=50)

# %time save_content(rev_list_df["id"], filename, content, step=20)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs


In [10]:
#article_series=pd.read_csv("../conflicted_article.csv")["articles"]

In [11]:
# Download the revision list, the all-tokens table and the per-revision
# content of one article into <save_dir>/bioglass.h5 (save_article defaults).
# NOTE(review): the title is lower-case here but "Bioglass" in the exploratory
# cell above; the output file name follows this spelling - confirm intended.
save_article("bioglass")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['in', 'out', 'str']]

  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
# %%time
# for article in article_series[-3:]:
#     print(article)
#     save_article(article)