In [None]:
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Positional column names for the header-less TSV files read below
# (passed as `names=` to pd.read_table).
# NOTE(review): presumably the Wikimedia "mediawiki_history" dump schema - confirm
# against the dump documentation; the order must match the file exactly.
column_names = ['wiki_db', 'event_entity', 'event_type', 'event_timestamp', 
                'event_comment_escaped', 'event_user_id', 'event_user_text_historical_escaped', 
                'event_user_text_escaped', 'event_user_blocks_historical_string', 'event_user_blocks_string', 
                'event_user_groups_historical_string', 'event_user_groups_string', 
                'event_user_is_bot_by_historical_string', 'event_user_is_bot_by_string', 
                'event_user_is_created_by_self', 'event_user_is_created_by_system', 
                'event_user_is_created_by_peer', 'event_user_is_anonymous', 
                'event_user_registration_timestamp', 'event_user_creation_timestamp',
                'event_user_first_edit_timestamp', 'event_user_revision_count', 
                'event_user_seconds_since_previous_revision', 'page_id', 
                'page_title_historical_escaped', 'page_title_escaped', 
                'page_namespace_historical', 'page_namespace_is_content_historical',
                'page_namespace', 'page_namespace_is_content', 'page_is_redirect', 
                'page_is_deleted', 'page_creation_timestamp', 'page_first_edit_timestamp', 
                'page_revision_count', 'page_seconds_since_previous_revision', 'user_id', 
                'user_text_historical_escaped', 'user_text_escaped', 
                'user_blocks_historical_string', 'user_blocks_string', 
                'user_groups_historical_string', 'user_groups_string', 
                'user_is_bot_by_historical_string', 'user_is_bot_by_string', 
                'user_is_created_by_self', 'user_is_created_by_system', 
                'user_is_created_by_peer', 'user_is_anonymous', 'user_registration_timestamp', 
                'user_creation_timestamp', 'user_first_edit_timestamp', 'revision_id', 
                'revision_parent_id', 'revision_minor_edit', 'revision_deleted_parts_string',
                'revision_deleted_parts_are_suppressed', 'revision_text_bytes', 'revision_text_bytes_diff', 
                'revision_text_sha1', 'revision_content_model', 'revision_content_format', 
                'revision_is_deleted_by_page_deletion', 'revision_deleted_by_page_deletion_timestamp', 
                'revision_is_identity_reverted', 'revision_first_identity_reverting_revision_id', 
                'revision_seconds_to_identity_revert', 'revision_is_identity_revert', 
                'revision_is_from_before_page_creation', 'revision_tags_string']

# The only columns kept once a chunk has been filtered; everything else is
# dropped per chunk before the chunks are accumulated.
columns_to_leave = [
    "event_timestamp", "event_user_text_escaped", "event_user_is_anonymous",
    "page_title_escaped", 'page_seconds_since_previous_revision', "revision_is_identity_reverted",
]

# Research questions: 
1. Revert rate distribution
2. Anon rate
3. Number of revisions per page
4. Number of page views

In [None]:
chunksize = 10 ** 5
# NOTE(review): these paths point into a personal Downloads folder; consider a
# configurable data directory so the notebook runs on other machines.
filename = "../../../../Downloads/2023-09.ruwiki.2022.tsv"
filename_2 = "../../../../Downloads/2023-09.ruwiki.2023.tsv"
dfs = []


def _select_revision_rows(chunk):
    """Filter one raw chunk down to the rows and columns used by the analysis.

    Keeps rows where, in this order:
      * ``page_namespace`` equals 0,
      * ``page_namespace_is_content`` is truthy,
      * ``event_entity`` is ``"revision"``,
      * ``event_user_is_bot_by_historical_string`` is missing
        (presumably: the editor was not flagged as a bot - confirm).

    The filters are applied one after another, exactly as before, so each
    mask is only evaluated on rows that survived the previous step.

    Returns a frame restricted to ``columns_to_leave``.
    """
    chunk = chunk[chunk["page_namespace"] == 0]
    chunk = chunk[chunk.page_namespace_is_content]
    chunk = chunk[chunk["event_entity"] == "revision"]
    chunk = chunk[chunk['event_user_is_bot_by_historical_string'].isna()]
    return chunk[columns_to_leave]


# Both files get identical treatment, so iterate over the paths instead of
# repeating the loop body. Chunked reading keeps only `chunksize` raw rows in
# memory at a time.
for path in (filename, filename_2):
    for df in tqdm(pd.read_table(path, chunksize=chunksize, names=column_names)):
        df = _select_revision_rows(df)
        dfs.append(df)

In [4]:
# Stack every filtered chunk into a single frame. The original row labels are
# kept (no ignore_index), so the index is non-contiguous and labels may repeat
# across the two input files - TODO confirm if label-based indexing is needed.
df_full = pd.concat(dfs)

In [5]:
%%time
import glob
import joblib
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import altair as alt
import json

# Register the tqdm progress_* helpers on pandas objects.
tqdm.pandas()

# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code - only load artefacts produced by a trusted pipeline.
extracted_data = joblib.load("../data/ru_wiki_extracted_pages.data")
error_data = joblib.load("../data/ru_wiki_error_pages.data")
final_data = pd.DataFrame(joblib.load("../data/ru_wiki_final_dataset.data"))
with open('../data/ru_reveal_wiki_location.json') as user_file:
    ru_reveal_wiki_location = json.load(user_file)

# Each JSON entry is read as a one-item mapping: take its first (key, values)
# pair and store the values sorted and joined with "_".
locations_dict = {}
for location in tqdm(ru_reveal_wiki_location):
    location_key, raw_values = list(location.items())[0]
    location_values = np.sort(raw_values)
    locations_dict[location_key] = "_".join(location_values)

# Split page names into three groups by `status` code:
#   3 or 4 -> changed, 1 (plus everything in error_data) -> error,
#   2 (plus extracted pages flagged is_duplicate) -> unchanged.
page_status = final_data.status
changed_pages = set(final_data.loc[page_status.isin([3, 4]), "page_name"].to_list())

failed_names = final_data.loc[page_status.isin([1]), "page_name"].to_list()
error_pages = set(failed_names + [a["page_name"] for a in error_data])

duplicate_names = [a["page_name"] for a in extracted_data if a["is_duplicate"]]
unchanged_pages = set(duplicate_names + final_data.loc[page_status.isin([2]), "page_name"].to_list())

100%|██████████| 1035086/1035086 [00:03<00:00, 308965.78it/s]


CPU times: user 1min 10s, sys: 16.2 s, total: 1min 26s
Wall time: 1min 31s


In [6]:
# Same "changed" group as above, but kept separately per status code (3 vs 4).
changed_pages_3 = set(final_data.loc[final_data.status.isin([3]), "page_name"].to_list())
changed_pages_4 = set(final_data.loc[final_data.status.isin([4]), "page_name"].to_list())

In [7]:
import glob
import joblib
import pandas as pd
from tqdm.auto import tqdm
import altair as alt
# glob returns paths in filesystem-dependent order. Sort them so the row order
# of the combined table is reproducible: the title -> mean_views dict built
# from it keeps the *last* row for a repeated page name, so order can matter.
files = sorted(glob.glob('../data/stats/*'))
all_stats = []
for file in tqdm(files):
    # Each file is expected to hold a list of per-page records.
    # NOTE(review): joblib.load unpickles its input - trusted files only.
    all_stats.extend(joblib.load(file))

df_stats = pd.DataFrame(all_stats)

100%|██████████| 193/193 [00:38<00:00,  5.01it/s]


In [8]:
# page_name -> mean_views lookup; a repeated page_name keeps its last value.
df_stats_dict = dict(zip(df_stats.page_name, df_stats.mean_views))

## Calculating stats

In [9]:
# Quick look at the first rows of the filtered revision table.
df_full.head()

Unnamed: 0,event_timestamp,event_user_text_escaped,event_user_is_anonymous,page_title_escaped,page_seconds_since_previous_revision,revision_is_identity_reverted
0,2022-01-01 00:00:24.0,Dmitri Lytov,False,"Вишневская,_Галина_Павловна",3505092.0,False
5,2022-01-01 00:00:37.0,Криворучка,False,Профессионалы.ру,292.0,False
6,2022-01-01 00:00:44.0,Bogdanov-62,False,"Хёрд,_Уильям_Теодор",118402068.0,False
8,2022-01-01 00:00:51.0,Михаил Тум,False,Электровозы_российских_железных_дорог_и_железн...,158914.0,False
10,2022-01-01 00:00:58.0,Bogdanov-62,False,"Фумасони_Бьонди,_Пьетро",111484529.0,False


In [13]:
# Normalise the two flag columns before aggregating. The mapping accepts real
# booleans as well as "true"/"false" in either capitalisation; missing values
# are filled with "False" first and therefore count as False.
bool_dict = {True: True, False: False, "true": True, "false": False, "True": True, "False": False}
df_full["event_user_is_anonymous"] = df_full["event_user_is_anonymous"].fillna("False").map(bool_dict)
reverted_flags = df_full["revision_is_identity_reverted"].fillna("False").map(bool_dict)
df_full["revision_is_identity_reverted"] = reverted_flags.astype(int)

# One grouping by page title feeds every per-page metric.
revisions_by_page = df_full.groupby("page_title_escaped")

# Number of revision rows with a timestamp per page.
revisions_count = revisions_by_page["event_timestamp"].count().to_dict()
# Mean of the flags = share of anonymous / reverted revisions per page.
anon_rate = revisions_by_page["event_user_is_anonymous"].mean().to_dict()
revert_rate = revisions_by_page["revision_is_identity_reverted"].mean().to_dict()
# Mean seconds between consecutive revisions of the page.
change_time = revisions_by_page["page_seconds_since_previous_revision"].mean().to_dict()

In [14]:
def _title_key(title):
    """Turn a page name into the key format of the per-page dicts (spaces -> underscores)."""
    return str(title).replace(" ", "_")


def _lookup_metric(metric_by_title, pages, default=None):
    """Return one value per page, in order, looked up by normalised title.

    Pages absent from ``metric_by_title`` yield ``default``.
    """
    return [metric_by_title.get(_title_key(t), default) for t in pages]


# Freeze each set into a list so every metric list below shares the same order.
changed_pages_list = list(changed_pages)
error_pages_list = list(error_pages)
unchanged_pages_list = list(unchanged_pages)
changed_pages_list_3 = list(changed_pages_3)
changed_pages_list_4 = list(changed_pages_4)

# Revision counts: a page with no matching revisions counts as 0 edits.
changed_pages_revcount = _lookup_metric(revisions_count, changed_pages_list, 0)
error_pages_revcount = _lookup_metric(revisions_count, error_pages_list, 0)
unchanged_pages_revcount = _lookup_metric(revisions_count, unchanged_pages_list, 0)
changed_pages_revcount_3 = _lookup_metric(revisions_count, changed_pages_list_3, 0)
changed_pages_revcount_4 = _lookup_metric(revisions_count, changed_pages_list_4, 0)

# Rates and views: a page without data stays None (becomes NaN in a DataFrame).
changed_pages_anon = _lookup_metric(anon_rate, changed_pages_list)
error_pages_anon = _lookup_metric(anon_rate, error_pages_list)
unchanged_pages_anon = _lookup_metric(anon_rate, unchanged_pages_list)
changed_pages_anon_3 = _lookup_metric(anon_rate, changed_pages_list_3)
changed_pages_anon_4 = _lookup_metric(anon_rate, changed_pages_list_4)

changed_pages_revert = _lookup_metric(revert_rate, changed_pages_list)
error_pages_revert = _lookup_metric(revert_rate, error_pages_list)
unchanged_pages_revert = _lookup_metric(revert_rate, unchanged_pages_list)
changed_pages_revert_3 = _lookup_metric(revert_rate, changed_pages_list_3)
changed_pages_revert_4 = _lookup_metric(revert_rate, changed_pages_list_4)

changed_pages_views = _lookup_metric(df_stats_dict, changed_pages_list)
error_pages_views = _lookup_metric(df_stats_dict, error_pages_list)
unchanged_pages_views = _lookup_metric(df_stats_dict, unchanged_pages_list)
changed_pages_views_3 = _lookup_metric(df_stats_dict, changed_pages_list_3)
changed_pages_views_4 = _lookup_metric(df_stats_dict, changed_pages_list_4)


In [34]:
# NOTE(review): this rebinds `df_stats`, which an earlier cell used for the
# raw page-view table; re-running the cell that builds `df_stats_dict` after
# this one will fail. A distinct name would be safer - kept for compatibility.

# Group labels, repeated once per page in the same order as the title lists.
labels_changed = ["changed"] * len(changed_pages_list)
labels_changed_3 = ["changed_3"] * len(changed_pages_list_3)
labels_changed_4 = ["changed_4"] * len(changed_pages_list_4)
labels_missing = ["missing"] * len(error_pages_list)
labels_duplicated = ["duplicated"] * len(unchanged_pages_list)

# One row per page: changed / missing / duplicated.
coarse_columns = {
    "page_title": changed_pages_list + error_pages_list + unchanged_pages_list,
    "page_type": labels_changed + labels_missing + labels_duplicated,
    "edit_count": changed_pages_revcount + error_pages_revcount + unchanged_pages_revcount,
    "anon_rate": changed_pages_anon + error_pages_anon + unchanged_pages_anon,
    "revert_rate": changed_pages_revert + error_pages_revert + unchanged_pages_revert,
    "page_views": changed_pages_views + error_pages_views + unchanged_pages_views,
}
df_stats = pd.DataFrame(coarse_columns)

# Same table with the changed group split by status code (3 vs 4).
detailed_columns = {
    "page_title": changed_pages_list_3 + changed_pages_list_4 + error_pages_list + unchanged_pages_list,
    "page_type": labels_changed_3 + labels_changed_4 + labels_missing + labels_duplicated,
    "edit_count": changed_pages_revcount_3 + changed_pages_revcount_4 + error_pages_revcount + unchanged_pages_revcount,
    "anon_rate": changed_pages_anon_3 + changed_pages_anon_4 + error_pages_anon + unchanged_pages_anon,
    "revert_rate": changed_pages_revert_3 + changed_pages_revert_4 + error_pages_revert + unchanged_pages_revert,
    "page_views": changed_pages_views_3 + changed_pages_views_4 + error_pages_views + unchanged_pages_views,
}
df_stats_detailed = pd.DataFrame(detailed_columns)

In [35]:
# Flag pages with no matching revision rows (edit_count defaulted to 0).
df_stats["unedited"] = df_stats["edit_count"].eq(0)

In [36]:
# Share of pages in each group with zero recorded edits.
df_stats.groupby("page_type").unedited.mean()

page_type
changed       0.192758
duplicated    0.422698
missing       0.018146
Name: unedited, dtype: float64

In [37]:
# Edit-count distribution per group, restricted to pages with at least one edit.
df_stats[~df_stats.unedited].groupby("page_type").edit_count.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
page_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
changed,27175.0,25.237203,91.630736,1.0,2.0,6.0,20.0,10537.0
duplicated,1081889.0,6.007449,21.934085,1.0,1.0,2.0,5.0,13484.0
missing,17423.0,12.866957,96.247009,1.0,2.0,4.0,8.0,9991.0


In [188]:
# Anonymous-edit share per group, restricted to pages with at least one edit.
df_stats[~df_stats.unedited].groupby("page_type").anon_rate.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
page_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
changed,27175.0,0.209835,0.258025,0.0,0.0,0.111111,0.357143,1.0
duplicated,1081889.0,0.16278,0.288923,0.0,0.0,0.0,0.25,1.0
missing,17423.0,0.060875,0.171231,0.0,0.0,0.0,0.0,1.0


In [189]:
# Revert rate per group, restricted to pages with at least one edit.
df_stats[~df_stats.unedited].groupby("page_type").revert_rate.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
page_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
changed,27175.0,0.103244,0.174092,0.0,0.0,0.0,0.163934,1.0
duplicated,1081889.0,0.040488,0.123778,0.0,0.0,0.0,0.0,1.0
missing,17423.0,0.014747,0.063371,0.0,0.0,0.0,0.0,0.964286


In [190]:
# Same summary without the `unedited` filter. The numbers match the filtered
# version because unedited pages have no revert_rate (NaN), and describe()
# ignores NaN values.
df_stats.groupby("page_type").revert_rate.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
page_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
changed,27175.0,0.103244,0.174092,0.0,0.0,0.0,0.163934,1.0
duplicated,1081889.0,0.040488,0.123778,0.0,0.0,0.0,0.0,1.0
missing,17423.0,0.014747,0.063371,0.0,0.0,0.0,0.0,0.964286


In [229]:
# Page-view distribution per group. Pages without an entry in the view
# statistics are NaN and are not counted, so `count` is smaller than the
# group size.
df_stats.groupby("page_type").page_views.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
page_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
changed,6744.0,4190.71634,25951.17235,0.0,19.4,108.7,1338.35,1171779.2
duplicated,402408.0,421.87016,2645.309912,0.0,4.8,16.6,94.4,738782.6
missing,2038.0,410.158677,3238.590114,0.6,6.25,11.0,41.475,82795.4


In [40]:
# Fraction of all recorded page views that falls on pages in the "changed" group
# (pages without view data contribute nothing to either sum).
df_stats[df_stats.page_type.isin(["changed"])].page_views.sum() / df_stats.page_views.sum()

0.14211959997359686

In [41]:
# Fraction of all recorded page views that falls on "changed_3" pages (status 3).
df_stats_detailed[df_stats_detailed.page_type.isin(["changed_3"])].page_views.sum() / df_stats_detailed.page_views.sum()

0.04631556751144511

In [42]:
# Fraction of all recorded page views that falls on "changed_4" pages (status 4).
df_stats_detailed[df_stats_detailed.page_type.isin(["changed_4"])].page_views.sum() / df_stats_detailed.page_views.sum()

0.09580403246215173

In [44]:
# Fraction of all pages (rows of df_stats) that belong to the "changed" group.
len(df_stats[df_stats.page_type.isin(["changed"])]) / len(df_stats)

0.01748368694727264

# Building bootstrap stats: 

In [236]:
def draw_bs_replicates(data, func, size, sample_size=10**3, seed=None):
    """Compute bootstrap replicates of a statistic.

    Parameters
    ----------
    data : sequence or 1-D array
        Observations to resample from.
    func : callable
        Statistic applied to every resample (e.g. ``np.mean``); it receives a
        1-D array of ``sample_size`` values and must return a scalar.
    size : int
        Number of replicates to draw.
    sample_size : int, default 1000
        Number of observations drawn (with replacement) for each replicate.
        The default keeps the previously hard-coded 1,000.
        NOTE(review): intervals derived from the replicates reflect samples
        of this size, not of ``len(data)`` - confirm that this is intended.
    seed : int, optional
        If given, a dedicated ``np.random.default_rng(seed)`` is used so the
        replicates are reproducible. If omitted, the global NumPy random
        state is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(size,)`` with one value of ``func`` per resample.
    """
    # Convert once: passing a long Python list to `choice` on every iteration
    # would re-convert it to an array each time.
    values = np.asarray(data)

    # Both the legacy module and a Generator expose `choice(a, size=...)`.
    rng = np.random if seed is None else np.random.default_rng(seed)

    bs_replicates = np.empty(size)
    for i in tqdm(range(size)):
        # Draw one resample with replacement and store its statistic.
        bs_sample = rng.choice(values, size=sample_size)
        bs_replicates[i] = func(bs_sample)

    return bs_replicates

In [244]:
# Parallel result lists: one entry per (statistic, page group) combination.
stats, types, means = [], [], []
x_top, x_bottom = [], []

# NOTE(review): no random seed is set, so the intervals change on every run.
for stat in tqdm(["edit_count", "anon_rate", "revert_rate", "page_views"]):
    for type_ in ["changed", "duplicated", "missing"]:
        # dropna() here is row-wise over *all* columns, so a page lacking any
        # one metric (e.g. page_views) is excluded from every statistic.
        # NOTE(review): confirm this is intended rather than a per-column dropna.
        complete_rows = df_stats[df_stats.page_type == type_].dropna()
        observations = complete_rows[stat].to_list()
        bs_stats = draw_bs_replicates(observations, np.mean, 10000)

        stats.append(stat)
        types.append(type_)
        means.append(np.mean(bs_stats))
        # Percentile bounds of the 95% interval; each is a 1-element array.
        lower_bound = np.percentile(bs_stats, [2.5])
        upper_bound = np.percentile(bs_stats, [97.5])
        x_bottom.append(lower_bound)
        x_top.append(upper_bound)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [245]:
# Collect the bootstrap results into one tidy table for plotting.
source = pd.DataFrame(
    {"stats": stats, "types": types, "means": means, "x_top": x_top, "x_bottom": x_bottom}
)
# The interval bounds were stored as 1-element arrays; unwrap them to scalars.
for bound in ("x_top", "x_bottom"):
    source[bound] = source[bound].apply(lambda interval: interval[0])

In [None]:
import altair as alt
import pandas as pd

def make_plot_local(ss_local, title="Edits count"):
    """Build one small panel: an interval bar per page group plus its mean.

    ``ss_local`` must provide the columns ``x_top``, ``x_bottom``, ``means``
    and ``types``. ``title`` becomes the x-axis title. Returns the layered
    chart (error bar + point).
    """
    panel_size = {"width": 75, "height": 75}

    # Horizontal interval from x_bottom to x_top; the axis need not start at zero.
    interval_x = alt.X("x_top:Q").scale(zero=False).title(title)
    bar = (
        alt.Chart(ss_local)
        .mark_errorbar(ticks=True)
        .encode(
            interval_x,
            alt.X2("x_bottom:Q"),
            alt.Y("types:N", title=""),
            color="types:N",
        )
        .properties(**panel_size)
    )

    # Filled marker at the bootstrap mean of each group.
    point = (
        alt.Chart(ss_local)
        .mark_point(filled=True, color="black")
        .encode(
            alt.X("means:Q"),
            alt.Y("types:N"),
            color=alt.Color("types:N", title="Group:"),
        )
        .properties(**panel_size)
    )

    return bar + point

# Statistic -> panel title, in the order plot_1 .. plot_4.
panel_titles = {
    "edit_count": "(b) Edits count",
    "anon_rate": "(c) IP edits rate",
    "revert_rate": "(d) Revert rate",
    "page_views": "(a) Page views",
}

panels = {}
for stat_name, panel_title in panel_titles.items():
    source_local = source[source.stats == stat_name]
    panels[stat_name] = make_plot_local(source_local, title=panel_title)

plot_1, plot_2, plot_3, plot_4 = (panels[stat_name] for stat_name in panel_titles)

# 2x2 grid: (a) page views | (b) edits count over (c) IP edits rate | (d) revert rate.
(plot_4 | plot_1) & (plot_2 | plot_3)

In [257]:
# Bootstrap mean and 2.5th / 97.5th percentile bounds per statistic and page group.
source

Unnamed: 0,stats,types,means,x_top,x_bottom
0,edit_count,changed,25.108996,29.266,21.37595
1,edit_count,duplicated,5.733034,6.899,4.848975
2,edit_count,missing,7.175653,9.080025,5.623975
3,anon_rate,changed,0.219022,0.236069,0.202127
4,anon_rate,duplicated,0.177994,0.197183,0.159415
5,anon_rate,missing,0.043944,0.053702,0.03465
6,revert_rate,changed,0.11174,0.123578,0.100203
7,revert_rate,duplicated,0.057591,0.066928,0.048768
8,revert_rate,missing,0.009514,0.0127,0.006582
9,page_views,changed,5533.719292,7765.3986,4115.06136
