In [1]:
%%time
import glob
import joblib
import pandas as pd
from tqdm.auto import tqdm
import altair as alt

# Load every RWK history dump into a single DataFrame.
# glob.glob() returns paths in arbitrary order, so sort them to make the row
# order of `df` (and anything that depends on it, e.g. tie ordering in
# value_counts) reproducible between runs and machines.
files = sorted(glob.glob('../data/history/*'))
if not files:
    # Without this guard the failure would surface later as a confusing
    # missing-column error on an empty DataFrame.
    raise FileNotFoundError("No history dumps found under '../data/history/'")

all_data = []

# NOTE(review): joblib.load unpickles its input and can execute arbitrary
# code — only point this at dumps you produced yourself.
for file in tqdm(files):
    # Each dump is expected to hold a list of records; concatenate them.
    all_data.extend(joblib.load(file))

print("Number of samples:", len(all_data))
df = pd.DataFrame(all_data)
# Bracket assignment is unambiguous; attribute assignment silently creates a
# plain Python attribute if the column name is ever misspelled.
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Hour bucket used by every aggregation below.
df['timestamp_hours'] = df['timestamp'].dt.floor('h')

  0%|          | 0/3 [00:00<?, ?it/s]

Number of samples: 150000
CPU times: user 3.43 s, sys: 1.07 s, total: 4.51 s
Wall time: 3.62 s


In [2]:
# Filtering Wiki revisions: rows whose `user` value contains "imported>" are
# flagged in `is_wiki` and dropped.
# .assign() returns a new frame, so re-running this cell never writes a column
# into an already-filtered slice (which would raise SettingWithCopyWarning).
df = df.assign(is_wiki=df["user"].apply(lambda x: "imported>" in str(x)))
df = df[~df["is_wiki"]]

# Filtering Bots: drop the explicitly listed bot accounts.
df = df[~df["user"].isin(["Rwbcontent", "RUbot"])]

In [11]:
# Number of revisions per hour bucket, kept only for buckets strictly between
# 2023-06-01 00:00 and 2023-09-20 00:00 (+00:00).
df_plot = df.groupby('timestamp_hours')['revid'].count().reset_index()
df_plot = df_plot[
    df_plot['timestamp_hours'].between(
        "2023-06-01 00:00:00+00:00",
        "2023-09-20 00:00:00+00:00",
        inclusive="neither",
    )
]

# Calendar heatmap: one rectangle per (date, hour) bucket, coloured by the
# number of edits in that hour.
alt.Chart(df_plot).mark_rect().encode(
    alt.X('hoursminutes(timestamp_hours):O', title='hour of day'),
    alt.Y('monthdate(timestamp_hours):O', title='date'),
    alt.Color('revid:Q', title='Number of edits'),
)

In [4]:
# Weekday x hour-of-day heatmap for RWK.
# `df_plot` holds one row per hour bucket over several weeks, so many rows map
# to the same (weekday, hour) cell. Encoding raw `revid` would draw one
# rectangle per row on top of each other and show only the last one drawn;
# aggregate with mean() so each cell shows the average hourly edit count.
# The utc-prefixed time units keep both axes in UTC, as the axis title states;
# the plain `hoursminutes`/`day` units are evaluated in the viewer's local
# time zone.
# NOTE(review): assumes `timestamp_hours` is timezone-aware UTC — confirm.
alt.Chart(df_plot, title=alt.Title("RWK", dx=-220)).mark_rect().encode(
    x=alt.X('utchoursminutes(timestamp_hours):O', title='hour of day (UTC)'),
    y=alt.Y('utcday(timestamp_hours):O', title='day of week'),
    color=alt.Color(
        'mean(revid):Q',
        title='Number of edits',
        scale=alt.Scale(scheme='greenblue', domainMax=250),
    ),
)

In [5]:
## Ruwiki stats
# Share of all edits in the window that were made on Monday–Friday during
# hours 8..17 (hour taken from `timestamp_hours`).

df_plot = df.groupby('timestamp_hours').count().revid.reset_index()
# .copy() makes the filtered frame own its data, so the column assignments
# below do not trigger SettingWithCopyWarning.
df_plot = df_plot[
    (df_plot.timestamp_hours < "2023-09-20 00:00:00+00:00") &
    (df_plot.timestamp_hours > "2023-08-01 00:00:00+00:00")
].copy()

df_plot["hour"] = df_plot.timestamp_hours.dt.hour
df_plot["weekday"] = df_plot.timestamp_hours.dt.day_name()

# Numerator: edits in weekday working hours; denominator: all edits in window.
df_plot.loc[
    ~df_plot.weekday.isin(["Sunday", "Saturday"]) & (df_plot.hour > 7) & (df_plot.hour < 18),
    "revid",
].sum() / df_plot.revid.sum()

0.5324787245502531

In [6]:
# Load every Russian Wikipedia history dump into a single DataFrame.
# glob.glob() returns paths in arbitrary order, so sort them to make the row
# order of `df_wiki` reproducible between runs and machines.
files = sorted(glob.glob('../data/history_wiki/*'))
if not files:
    # Without this guard the failure would surface later as a confusing
    # missing-column error on an empty DataFrame.
    raise FileNotFoundError("No history dumps found under '../data/history_wiki/'")

all_data = []

# NOTE(review): joblib.load unpickles its input and can execute arbitrary
# code — only point this at dumps you produced yourself.
for file in tqdm(files):
    # Each dump is expected to hold a list of records; concatenate them.
    all_data.extend(joblib.load(file))

print("Number of samples:", len(all_data))
df_wiki = pd.DataFrame(all_data)
# Bracket assignment is unambiguous; attribute assignment silently creates a
# plain Python attribute if the column name is ever misspelled.
df_wiki['timestamp'] = pd.to_datetime(df_wiki['timestamp'])
# Hour bucket used by every aggregation below.
df_wiki['timestamp_hours'] = df_wiki['timestamp'].dt.floor('h')

  0%|          | 0/22 [00:00<?, ?it/s]

Number of samples: 1080693


In [7]:
# Filtering Bots: drop every account whose name contains "bot"
# (case-insensitive) plus the explicitly listed accounts.
# "Alex NB OT" has to be listed because its lower-cased name does not contain
# the substring "bot"; "RUbot" is already caught by the substring rule and is
# kept in the list only for explicitness.
df_wiki = df_wiki[
    ~(
        df_wiki["user"].apply(lambda x: "bot" in str(x).lower())
        | df_wiki["user"].isin(["Alex NB OT", "RUbot"])
    )
]

In [8]:
# Number of revisions per hour bucket for Russian Wikipedia, kept only for
# buckets strictly between 2023-08-01 00:00 and 2023-09-20 00:00 (+00:00).
df_plot_wiki = df_wiki.groupby('timestamp_hours').count().revid.reset_index()
df_plot_wiki = \
    df_plot_wiki[
    (df_plot_wiki.timestamp_hours < "2023-09-20 00:00:00+00:00") &
    (df_plot_wiki.timestamp_hours > "2023-08-01 00:00:00+00:00")]

# Weekday x hour-of-day heatmap.
# The window spans several weeks, so many rows map to the same (weekday, hour)
# cell. Encoding raw `revid` would draw one rectangle per row on top of each
# other and show only the last one drawn; aggregate with mean() so each cell
# shows the average hourly edit count.
# The utc-prefixed time units keep both axes in UTC, as the axis title states;
# the plain `hoursminutes`/`day` units are evaluated in the viewer's local
# time zone.
# NOTE(review): assumes `timestamp_hours` is timezone-aware UTC — confirm.
alt.Chart(df_plot_wiki, title=alt.Title("Russian Wikipedia", dx=-180)).mark_rect().encode(
    x=alt.X('utchoursminutes(timestamp_hours):O', title='hour of day (UTC)'),
    y=alt.Y('utcday(timestamp_hours):O', title='day of week'),
    color=alt.Color(
        'mean(revid):Q',
        title='Number of edits',
        scale=alt.Scale(scheme='goldred', domainMax=1000, domainMin=1),
    ),
)

In [9]:
## Russian wikipedia stats
# Share of all edits in the window that were made on Monday–Friday during
# hours 8..17 (hour taken from `timestamp_hours`).

df_plot_wiki = df_wiki.groupby('timestamp_hours').count().revid.reset_index()
# .copy() makes the filtered frame own its data, so the column assignments
# below do not trigger SettingWithCopyWarning.
df_plot_wiki = df_plot_wiki[
    (df_plot_wiki.timestamp_hours < "2023-09-20 00:00:00+00:00") &
    (df_plot_wiki.timestamp_hours > "2023-08-01 00:00:00+00:00")
].copy()

df_plot_wiki["hour"] = df_plot_wiki.timestamp_hours.dt.hour
df_plot_wiki["weekday"] = df_plot_wiki.timestamp_hours.dt.day_name()

# Numerator: edits in weekday working hours; denominator: all edits in window.
df_plot_wiki.loc[
    ~df_plot_wiki.weekday.isin(["Sunday", "Saturday"]) & (df_plot_wiki.hour > 7) & (df_plot_wiki.hour < 18),
    "revid",
].sum() / df_plot_wiki.revid.sum()

0.4006335828318542

In [None]:
# Calendar heatmap for Russian Wikipedia: one rectangle per (date, hour)
# bucket, coloured by the number of edits in that hour.
alt.Chart(df_plot_wiki).mark_rect().encode(
    alt.X('hoursminutes(timestamp_hours):O', title='hour of day'),
    alt.Y('monthdate(timestamp_hours):O', title='date'),
    alt.Color('revid:Q', title='Number of edits'),
)

In [9]:
# Edit counts per page for hour buckets strictly after 2023-08-07 20:00 and
# strictly before 2023-08-08 10:00 (+00:00), most-edited pages first.
df_wiki.loc[
    (df_wiki["timestamp_hours"] > "2023-08-07 20:00:00+00:00")
    & (df_wiki["timestamp_hours"] < "2023-08-08 10:00:00+00:00"),
    "page_name",
].value_counts()

page_name
Тринадцатый выпуск стандартных марок СССР    54
Теорема Померанчука                          50
Список умерших в 2023 году                   40
Кубок мира по шахматам 2023                  38
Метро (фильм, 2003)                          36
                                             ..
Мейтленд-Найлз, Энзли                         2
Фортепианный квартет «Anno Domini»            2
Jucha squalea                                 2
Флаг сельского поселения Часцовское           2
Сестрорецкий инструментальный завод           2
Name: count, Length: 2580, dtype: int64

In [10]:
# Mean and standard deviation of the daily edit total, per day of the week
# (Russian Wikipedia).
order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# .assign() returns a new frame instead of writing columns into the filtered
# slice held in `df_plot_wiki`, which would raise SettingWithCopyWarning.
df_plot_wiki = df_plot_wiki.assign(
    day_of_week=df_plot_wiki.timestamp_hours.dt.day_name(),
    date=df_plot_wiki.timestamp_hours.dt.date,
)

# Step 1: hourly counts -> one total per calendar date.
# Step 2: dates -> mean/std per weekday. Named aggregation yields flat column
# names directly, so no positional column renaming is needed.
res_wiki = (
    df_plot_wiki.groupby(["date", "day_of_week"])["revid"].sum().reset_index()
    .groupby("day_of_week")["revid"].agg(n_revisions="mean", std="std")
    .reset_index()
)

# Categorical with explicit categories so sorting follows Monday..Sunday
# rather than alphabetical order.
res_wiki['day_of_week'] = pd.Categorical(res_wiki.day_of_week, order)
res_wiki.sort_values("day_of_week")

Unnamed: 0,day_of_week,n_revisions,std
1,Monday,11165.571429,1205.765989
5,Tuesday,11154.75,1699.040212
6,Wednesday,10717.857143,837.290756
4,Thursday,10535.285714,901.663964
0,Friday,10749.857143,1024.570061
2,Saturday,10793.285714,643.068611
3,Sunday,10988.571429,577.079098


In [11]:
# Mean and standard deviation of the daily edit total, per day of the week
# (RWK).
order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# .assign() returns a new frame instead of writing columns into the filtered
# slice held in `df_plot`, which would raise SettingWithCopyWarning.
df_plot = df_plot.assign(
    day_of_week=df_plot.timestamp_hours.dt.day_name(),
    date=df_plot.timestamp_hours.dt.date,
)

# Step 1: hourly counts -> one total per calendar date.
# Step 2: dates -> mean/std per weekday. Named aggregation yields flat column
# names directly, so no positional column renaming is needed.
res_ruwiki = (
    df_plot.groupby(["date", "day_of_week"])["revid"].sum().reset_index()
    .groupby("day_of_week")["revid"].agg(n_revisions="mean", std="std")
    .reset_index()
)

# Categorical with explicit categories so sorting follows Monday..Sunday
# rather than alphabetical order.
res_ruwiki['day_of_week'] = pd.Categorical(res_ruwiki.day_of_week, order)
res_ruwiki.sort_values("day_of_week")

Unnamed: 0,day_of_week,n_revisions,std
1,Monday,1240.0,228.181215
5,Tuesday,1325.125,255.532183
6,Wednesday,1327.714286,153.549899
4,Thursday,1542.285714,433.576489
0,Friday,1253.857143,338.251104
2,Saturday,535.0,106.713011
3,Sunday,543.571429,212.75875


In [12]:
# Daily edit totals for Russian Wikipedia, one row per calendar date, tagged
# with the weekday name and ordered Monday..Sunday for plotting.
order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# .assign() returns a new frame instead of writing columns into the filtered
# slice held in `df_plot_wiki`, which would raise SettingWithCopyWarning.
df_plot_wiki = df_plot_wiki.assign(
    day_of_week=df_plot_wiki.timestamp_hours.dt.day_name(),
    date=df_plot_wiki.timestamp_hours.dt.date,
)

res_wiki = df_plot_wiki.groupby(["date", "day_of_week"]).revid.sum().reset_index()
# Dates are turned into plain strings before the frame is handed to the chart.
res_wiki["date"] = res_wiki["date"].astype(str)
res_wiki['day_of_week'] = pd.Categorical(res_wiki.day_of_week, order)
res_wiki = res_wiki.sort_values("day_of_week").reset_index(drop=True)

In [None]:
# Mean daily edit total per weekday, drawn as a line ...
line = alt.Chart(res_wiki).mark_line().encode(
    alt.X('day_of_week', sort=order),
    alt.Y('mean(revid)'),
)

# ... with a confidence-interval band computed from the per-date totals.
band = alt.Chart(res_wiki).mark_errorband(extent='ci').encode(
    alt.X('day_of_week', sort=order),
    alt.Y('revid'),
)

# Band first so the line is drawn on top of it.
plot_1 = alt.layer(band, line)
plot_1

In [None]:
# Daily edit totals for RWK, one row per calendar date, tagged with the
# weekday name and ordered Monday..Sunday for plotting.
order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# .assign() returns a new frame instead of writing columns into the filtered
# slice held in `df_plot`, which would raise SettingWithCopyWarning.
df_plot = df_plot.assign(
    day_of_week=df_plot.timestamp_hours.dt.day_name(),
    date=df_plot.timestamp_hours.dt.date,
)

res_ruwiki = df_plot.groupby(["date", "day_of_week"]).revid.sum().reset_index()
# Dates are turned into plain strings before the frame is handed to the chart.
res_ruwiki["date"] = res_ruwiki["date"].astype(str)
res_ruwiki['day_of_week'] = pd.Categorical(res_ruwiki.day_of_week, order)
res_ruwiki = res_ruwiki.sort_values("day_of_week").reset_index(drop=True)

# Mean daily edit total per weekday, drawn as a line ...
line = alt.Chart(res_ruwiki).mark_line().encode(
    x=alt.X('day_of_week', sort=order),
    y='mean(revid)'
)

# ... with a confidence-interval band computed from the per-date totals.
band = alt.Chart(res_ruwiki).mark_errorband(extent='ci').encode(
    x=alt.X('day_of_week', sort=order),
    y=alt.Y('revid'),
)

plot_2 = band + line
plot_2

In [None]:
# Overlay the two charts; each keeps its own y scale so that the difference in
# magnitude between them does not flatten either curve.
alt.layer(plot_2, plot_1).resolve_scale(y="independent")

In [16]:
# Hour-of-day profile for Russian Wikipedia: mean hourly edit count with a
# confidence-interval band across the hour buckets in the window.

# .assign() returns a new frame instead of writing columns into the filtered
# slice held in `df_plot_wiki`, which would raise SettingWithCopyWarning.
df_plot_wiki = df_plot_wiki.assign(
    hour=df_plot_wiki.timestamp_hours.dt.hour,
    date=df_plot_wiki.timestamp_hours.dt.date,
).sort_values("hour")

# The x axis is the numeric hour, so it needs no custom sort. The previous
# `sort=order` passed the list of weekday names (copied from the weekday
# charts), which has no meaning for this field.
line = alt.Chart(df_plot_wiki[["revid", "hour"]]).mark_line().encode(
    x=alt.X('hour'),
    y='mean(revid)'
)

band = alt.Chart(df_plot_wiki[["revid", "hour"]]).mark_errorband(extent='ci').encode(
    x=alt.X('hour'),
    y=alt.Y('revid'),
)

plot_2 = band + line
plot_2

In [17]:
# Hour-of-day profile for RWK: mean hourly edit count with a
# confidence-interval band across the hour buckets in the window.

# .assign() returns a new frame instead of writing columns into the filtered
# slice held in `df_plot`, which would raise SettingWithCopyWarning.
df_plot = df_plot.assign(
    day_of_week=df_plot.timestamp_hours.dt.day_name(),
    hour=df_plot.timestamp_hours.dt.hour,
    date=df_plot.timestamp_hours.dt.date,
)
# Optional weekday-only variant (disabled):
# df_plot = df_plot[~df_plot.day_of_week.isin(["Sunday", "Saturday"])]
df_plot = df_plot.sort_values("hour")

# The x axis is the numeric hour, so it needs no custom sort. The previous
# `sort=order` passed the list of weekday names (copied from the weekday
# charts), which has no meaning for this field.
line = alt.Chart(df_plot[["revid", "hour"]]).mark_line().encode(
    x=alt.X('hour'),
    y='mean(revid)'
)

band = alt.Chart(df_plot[["revid", "hour"]]).mark_errorband(extent='ci').encode(
    x=alt.X('hour'),
    y=alt.Y('revid'),
)

plot_2 = band + line
plot_2