# Czech Wikipedia in the covid era - explore data

This is a notebook prepared by Martin Urbanec, Wikimedia Czech Republic, to explore data related to usage of Wikipedia during the covid era.

This is a WIP notebook, queries are not finetuned and data may be unaccurate. Do not rely on it at all.

In [1]:
from wmfdata import spark
import pandas as pd

pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Config

In [2]:
SNAPSHOT = '2021-05'
PROJECT='cs.wikipedia'
DBNAME = 'cswiki'

## Helper methods

In [3]:
def merge_dataframes(file_2019, file_2020, suffixes=('_2019', '_2020')):
    df_2019 = pd.read_csv(file_2019, sep='\t')
    df_2019['date'] = df_2019.date.str.replace('2019-', 'year-')
    df_2019.set_index('date', inplace=True)

    df_2020 = pd.read_csv(file_2020, sep='\t')
    df_2020['date'] = df_2020.date.str.replace('2020-', 'year-')
    df_2020.set_index('date', inplace=True)

    df = df_2019.merge(df_2020, left_index=True, right_index=True, suffixes=suffixes)
    df.reset_index(inplace=True)
    df['date'] = df.date.str.replace('year-', '2020-')
    df.set_index('date', inplace=True)
    
    return df

## Important dates

In [4]:
pd.read_csv('data/nouzove_stavy.tsv', sep='\t')

Unnamed: 0,date,event
0,2020-03-12,Vyhlášení prvního nouzového stavu
1,2020-05-17,Konec prvního nouzového stavu
2,2020-10-05,Vyhlášení druhého nouzového stavu
3,2021-02-14,Konec druhého nouzového stavu


## Daily edits

In [5]:
def get_total_edits_daily(start_ts, end_ts):
    return spark.run('''
    WITH refined_edit_hourly AS (
        SELECT
            TO_DATE(ts) AS `date`,
            CASE user_tenure_bucket
                WHEN 'Under 1 day' THEN 'newcomer'
                WHEN '1 to 7 days' THEN 'newcomer'
                WHEN '7 to 30 days' THEN 'newcomer'
                ELSE 'experienced'
            END AS experienced_newcomer,
            edit_count
        FROM wmf.edit_hourly
        WHERE
                snapshot="{snapshot}"
            AND project="{project}"

            -- we're interested in time-limited data
            AND ts >= '{start_ts}'
            AND ts <= '{end_ts}'

            -- exclude known bots
            AND user_is_bot=false

            -- filter for content edits only
            AND namespace_is_content=true
    )

    SELECT
        `date`,
        experienced_newcomer,
        SUM(edit_count) AS edits
    FROM refined_edit_hourly
    GROUP BY
        `date`,
        experienced_newcomer
    ORDER BY `date`
    '''.format(
        snapshot=SNAPSHOT,
        project=PROJECT,
        start_ts=start_ts,
        end_ts=end_ts
    ))

In [6]:
df = get_total_edits_daily('2020-01-01T00:00:00', '2020-12-31T23:59:59')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [7]:
dfAgg = df.pivot_table(index='date', columns=['experienced_newcomer'], values='edits', fill_value=0, aggfunc=sum)
dfAgg.to_csv('data/edits_hourly_refined_2020.tsv', sep='\t')
dfAgg.head(10)

experienced_newcomer,experienced,newcomer
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02,2014,80
2020-01-03,1987,100
2020-01-04,2245,75
2020-01-05,2170,58
2020-01-06,2506,37
2020-01-07,2661,53
2020-01-08,2682,45
2020-01-09,2681,79
2020-01-10,2871,50
2020-01-11,2803,87


In [8]:
df = get_total_edits_daily('2019-01-01T00:00:00', '2019-12-31T23:59:59')

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [9]:
dfAgg = df.pivot_table(index='date', columns=['experienced_newcomer'], values='edits', fill_value=0, aggfunc=sum)
dfAgg.to_csv('data/edits_hourly_refined_2019.tsv', sep='\t')
dfAgg.head(10)

experienced_newcomer,experienced,newcomer
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-02,1928,47
2019-01-03,1843,64
2019-01-04,1691,35
2019-01-05,1873,57
2019-01-06,1527,61
2019-01-07,1391,73
2019-01-08,1783,44
2019-01-09,1731,62
2019-01-10,1639,71
2019-01-11,1472,48


In [10]:
df = merge_dataframes('data/edits_hourly_refined_2019.tsv', 'data/edits_hourly_refined_2020.tsv')
df.to_csv('data/edits_hourly_refined_2019_2020.tsv', sep='\t')
df.head()

Unnamed: 0_level_0,experienced_2019,newcomer_2019,experienced_2020,newcomer_2020
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-02,1928,47,2014,80
2020-01-03,1843,64,1987,100
2020-01-04,1691,35,2245,75
2020-01-05,1873,57,2170,58
2020-01-06,1527,61,2506,37


## Daily unique editors

In [17]:
def get_unique_editors_daily(starting_date, ending_date):
    return spark.run('''
    SELECT
        dt AS `date`,
        value AS unique_editors
    FROM wmf.mediawiki_metrics
    WHERE
        snapshot='{snapshot}'
        AND wiki_db='{dbname}'
        AND metric='daily_unique_editors'

        -- filter for given dataframe
        AND dt > '{starting_date}'
        AND dt < '{ending_date}'
    ORDER BY dt
    '''.format(
        snapshot=SNAPSHOT,
        dbname=DBNAME,
        starting_date=starting_date,
        ending_date=ending_date
    ))

In [18]:
df = get_unique_editors_daily('2020-01-01', '2020-12-31')
df.to_csv('data/unique_editors_2020.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,unique_editors
0,2020-01-02,513
1,2020-01-03,525
2,2020-01-04,475
3,2020-01-05,501
4,2020-01-06,586


In [19]:
df = get_unique_editors_daily('2019-01-01', '2019-12-31')
df.to_csv('data/unique_editors_2019.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,unique_editors
0,2019-01-02,453
1,2019-01-03,469
2,2019-01-04,431
3,2019-01-05,461
4,2019-01-06,469


In [20]:
df = merge_dataframes('data/unique_editors_2019.tsv', 'data/unique_editors_2020.tsv')
df.to_csv('data/unique_editors_2019_2020.tsv', sep='\t')
df.head()

Unnamed: 0_level_0,unique_editors_2019,unique_editors_2020
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02,453,513
2020-01-03,469,525
2020-01-04,431,475
2020-01-05,461,501
2020-01-06,469,586


## Daily unique page creators

In [21]:
def get_unique_page_creators_daily(starting_date, ending_date):
    return spark.run('''
    SELECT
        dt AS `date`,
        value AS unique_page_creators
    FROM wmf.mediawiki_metrics
    WHERE
        snapshot='{snapshot}'
        AND wiki_db='{dbname}'
        AND metric='daily_unique_page_creators'

        -- filter for given dataframe
        AND dt > '{starting_date}'
        AND dt < '{ending_date}'
    ORDER BY dt
    '''.format(
        snapshot=SNAPSHOT,
        dbname=DBNAME,
        starting_date=starting_date,
        ending_date=ending_date
    ))

In [26]:
df = get_unique_page_creators_daily('2020-01-01', '2020-12-31')
df.to_csv('data/unique_page_creators_2020.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,unique_page_creators
0,2020-01-02,106
1,2020-01-03,105
2,2020-01-04,104
3,2020-01-05,119
4,2020-01-06,110


In [27]:
df = get_unique_page_creators_daily('2019-01-01', '2019-12-31')
df.to_csv('data/unique_page_creators_2019.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,unique_page_creators
0,2019-01-02,88
1,2019-01-03,88
2,2019-01-04,85
3,2019-01-05,82
4,2019-01-06,75


In [28]:
df = merge_dataframes('data/unique_page_creators_2019.tsv', 'data/unique_page_creators_2020.tsv')
df.to_csv('data/unique_page_creators_2019_2020.tsv', sep='\t')
df.head()

Unnamed: 0_level_0,unique_page_creators_2019,unique_page_creators_2020
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02,88,106
2020-01-03,88,105
2020-01-04,85,104
2020-01-05,82,119
2020-01-06,75,110
