In [143]:
%%time
import glob
import joblib
import pandas as pd
from tqdm.auto import tqdm
import altair as alt

# Collect every dump found under ../data/history/ into one flat list.
# NOTE(review): joblib.load unpickles its input, so these files must come
# from a trusted source.
files = glob.glob('../data/history/*')
all_data = []
for file in tqdm(files):
    all_data.extend(joblib.load(file))

print("Number of samples:", len(all_data))

# Build the frame, parse timestamps as UTC, and add an hourly bucket column
# (timestamp floored to the hour) used by the aggregations below.
df = pd.DataFrame(all_data)
df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)
df["timestamp_hours"] = df["timestamp"].dt.floor("h")

  0%|          | 0/3 [00:00<?, ?it/s]

Number of samples: 150000
CPU times: user 2.6 s, sys: 257 ms, total: 2.85 s
Wall time: 3.47 s


In [144]:
# Flag Wiki revisions: rows whose user name contains the "imported>" marker.
df["is_wiki"] = df["user"].astype(str).str.contains("imported>", regex=False)

# Keep only rows that are neither Wiki revisions nor one of the listed bot accounts.
df = df[~df["is_wiki"] & ~df["user"].isin(["Rwbcontent", "RUbot"])]

In [145]:
# Number of revisions per hourly bucket, restricted to the study window.
# Both bounds are strict, so the bucket starting exactly at
# 2023-08-01 00:00 UTC is excluded.
df_plot = (
    df.groupby('timestamp_hours')['revid'].count().reset_index()
    .loc[lambda d: (d.timestamp_hours < "2023-09-20 00:00:00+00:00")
                   & (d.timestamp_hours > "2023-08-01 00:00:00+00:00")]
    # Independent copy: later cells add columns to df_plot, which would
    # otherwise raise SettingWithCopyWarning on a filtered slice.
    .copy()
)
len(df_plot)

1182

In [169]:
# Hour of day (0-23, UTC) as an integer; .assign returns a new frame, so no
# column is written onto a filtered slice.
df_plot = df_plot.assign(time_str=df_plot['timestamp_hours'].dt.hour.astype(int))

# Heatmap of edit counts: hour of day (x) vs. day of week (y).
alt.Chart(df_plot, title=alt.Title("RWFork", dx=-220)).mark_rect().encode(
    x=alt.X(
        'time_str:O',
        title='hour of day (UTC)',
        axis=alt.Axis(
            labelAngle=0,
            # Show a label only for even hours; odd hours get an empty label.
            labelExpr="(toNumber(substring(datum.value, 0, 2)) % 2 === 0) ? datum.value : ''"
        )
    ),
    # utcday (not day): the plain `day` time unit is evaluated in the viewer's
    # local time zone, which would disagree with the UTC hour on the x axis.
    y=alt.Y('utcday(timestamp_hours):O', title='day of week'),
    color=alt.Color(
        'revid:Q',
        title='Number of edits',
        scale=alt.Scale(scheme='greenblue', domainMax=250),
    ),
).configure_axis(
    labelFontSize=20,
    titleFontSize=20
).configure_title(
    fontSize=20
).configure_legend(
    labelFontSize=18,
    titleFontSize=18
)

In [165]:
## Ruwiki stats
# Hourly revision counts inside the study window (strict bounds on both
# sides), with hour-of-day and weekday-name columns. Built with
# .loc + .assign so the new columns land on a fresh frame instead of a
# filtered slice (avoids SettingWithCopyWarning).
df_plot = (
    df.groupby('timestamp_hours')['revid'].count().reset_index()
    .loc[lambda d: (d.timestamp_hours < "2023-09-20 00:00:00+00:00")
                   & (d.timestamp_hours > "2023-08-01 00:00:00+00:00")]
    .assign(
        hour=lambda d: d.timestamp_hours.dt.hour,
        weekday=lambda d: d.timestamp_hours.dt.day_name(),
    )
)

# Share of all edits made Monday-Friday during hours 8..17 (UTC), i.e.
# hour > 7 and hour < 18.
df_plot.loc[
    ~df_plot.weekday.isin(["Sunday", "Saturday"]) & df_plot.hour.between(8, 17),
    "revid",
].sum() / df_plot.revid.sum()

0.5324787245502531

In [174]:
# Collect every dump found under ../data/history_wiki/ into one flat list.
# NOTE(review): joblib.load unpickles its input, so these files must come
# from a trusted source.
files = glob.glob('../data/history_wiki/*')
all_data = []

for file in tqdm(files):
    all_data.extend(joblib.load(file))

print("Number of samples:", len(all_data))
df_wiki = pd.DataFrame(all_data)
# utc=True matches the RWFork loader: the later window filters compare against
# "+00:00" bounds and the charts label hours as UTC, so the column must be
# UTC-aware regardless of how the raw timestamps are written.
df_wiki["timestamp"] = pd.to_datetime(df_wiki["timestamp"], utc=True)
df_wiki["timestamp_hours"] = df_wiki["timestamp"].dt.floor("h")

  0%|          | 0/22 [00:00<?, ?it/s]

Number of samples: 1080693


In [175]:
# Filtering bots: drop any user whose name contains "bot" (case-insensitive),
# plus the explicitly listed accounts.
name_has_bot = df_wiki["user"].astype(str).str.lower().str.contains("bot", regex=False)
listed_bot = df_wiki["user"].isin(["Alex NB OT", "RUbot"])
df_wiki = df_wiki[~(name_has_bot | listed_bot)]

In [179]:
# Hourly revision counts inside the study window (strict bounds on both
# sides), plus the hour of day (0-23, UTC) as an integer column for the x axis.
df_plot_wiki = (
    df_wiki.groupby('timestamp_hours')['revid'].count().reset_index()
    .loc[lambda d: (d.timestamp_hours < "2023-09-20 00:00:00+00:00")
                   & (d.timestamp_hours > "2023-08-01 00:00:00+00:00")]
    .assign(time_str=lambda d: d['timestamp_hours'].dt.hour.astype(int))
)

# Heatmap of edit counts: hour of day (x) vs. day of week (y).
alt.Chart(df_plot_wiki, title=alt.Title("Russian Wikipedia", dx=-180)).mark_rect().encode(
    x=alt.X(
        'time_str:O',
        title='hour of day (UTC)',
        axis=alt.Axis(
            labelAngle=0,
            # Show a label only for even hours; odd hours get an empty label.
            labelExpr="(toNumber(substring(datum.value, 0, 2)) % 2 === 0) ? datum.value : ''"
        )
    ),
    # utcday (not day): the plain `day` time unit is evaluated in the viewer's
    # local time zone, which would disagree with the UTC hour on the x axis.
    y=alt.Y('utcday(timestamp_hours):O', title='day of week'),
    color=alt.Color(
        'revid:Q',
        title='Number of edits',
        scale=alt.Scale(scheme='goldred', domainMax=1000, domainMin=1),
    ),
).configure_axis(
    labelFontSize=20,
    titleFontSize=20
).configure_title(
    fontSize=20
).configure_legend(
    labelFontSize=18,
    titleFontSize=18
)

In [180]:
## Russian wikipedia stats

# Hourly revision counts inside the study window (strict bounds on both
# sides), with hour-of-day and weekday-name columns. Built with
# .loc + .assign so the new columns land on a fresh frame instead of a
# filtered slice (avoids SettingWithCopyWarning).
df_plot_wiki = (
    df_wiki.groupby('timestamp_hours')['revid'].count().reset_index()
    .loc[lambda d: (d.timestamp_hours < "2023-09-20 00:00:00+00:00")
                   & (d.timestamp_hours > "2023-08-01 00:00:00+00:00")]
    .assign(
        hour=lambda d: d.timestamp_hours.dt.hour,
        weekday=lambda d: d.timestamp_hours.dt.day_name(),
    )
)

# Share of all edits made Monday-Friday during hours 8..17 (UTC), i.e.
# hour > 7 and hour < 18.
df_plot_wiki.loc[
    ~df_plot_wiki.weekday.isin(["Sunday", "Saturday"]) & df_plot_wiki.hour.between(8, 17),
    "revid",
].sum() / df_plot_wiki.revid.sum()

0.4006335828318542

In [181]:
# Add hour-of-day and calendar-date columns and sort rows by hour.
# .assign returns a new frame, so nothing is written onto a filtered slice.
df_plot_wiki = df_plot_wiki.assign(
    hour=df_plot_wiki.timestamp_hours.dt.hour,
    date=df_plot_wiki.timestamp_hours.dt.date,
).sort_values("hour")

# NOTE(review): this cell previously passed an `order` name that no visible
# cell defines (NameError on a fresh kernel). Ascending hours 0..23 is
# assumed here -- confirm this matches the intended ordering.
hour_order = list(range(24))

# Mean number of edits per hour of day ...
line = alt.Chart(df_plot_wiki[["revid", "hour"]]).mark_line().encode(
    x=alt.X('hour', sort=hour_order),
    y='mean(revid)'
)

# ... with a confidence-interval band around it.
band = alt.Chart(df_plot_wiki[["revid", "hour"]]).mark_errorband(extent='ci').encode(
    x=alt.X('hour', sort=hour_order),
    y=alt.Y('revid'),
)

plot_2 = band + line
plot_2

In [182]:
# Add weekday-name, hour-of-day and calendar-date columns and sort rows by
# hour. .assign returns a new frame, so nothing is written onto a filtered slice.
df_plot = df_plot.assign(
    day_of_week=df_plot.timestamp_hours.dt.day_name(),
    hour=df_plot.timestamp_hours.dt.hour,
    date=df_plot.timestamp_hours.dt.date,
).sort_values("hour")

# NOTE(review): this cell previously passed an `order` name that no visible
# cell defines (NameError on a fresh kernel). Ascending hours 0..23 is
# assumed here -- confirm this matches the intended ordering.
hour_order = list(range(24))

# Mean number of edits per hour of day ...
line = alt.Chart(df_plot[["revid", "hour"]]).mark_line().encode(
    x=alt.X('hour', sort=hour_order),
    y='mean(revid)'
)

# ... with a confidence-interval band around it.
band = alt.Chart(df_plot[["revid", "hour"]]).mark_errorband(extent='ci').encode(
    x=alt.X('hour', sort=hour_order),
    y=alt.Y('revid'),
)

plot_2 = band + line
plot_2