# Czech Wikipedia in the covid era: New pages

This notebook shows data about pages created at Czech Wikipedia during years 2020 and 2019.

Created by Martin Urbanec, Wikimedia Czech Republic. WIP notebook, do not rely on (yet).

In [1]:
from wmfdata import spark
import pandas as pd

pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Configuration

In [2]:
SNAPSHOT = '2021-05'
PROJECT='cs.wikipedia'
DBNAME = 'cswiki'

## Helper methods

In [3]:
def merge_dataframes(file_2019, file_2020, suffixes=('_2019', '_2020')):
    df_2019 = pd.read_csv(file_2019, sep='\t')
    df_2019['date'] = df_2019.date.str.replace('2019-', 'year-')
    df_2019.set_index('date', inplace=True)

    df_2020 = pd.read_csv(file_2020, sep='\t')
    df_2020['date'] = df_2020.date.str.replace('2020-', 'year-')
    df_2020.set_index('date', inplace=True)

    df = df_2019.merge(df_2020, left_index=True, right_index=True, suffixes=suffixes)
    df.reset_index(inplace=True)
    df['date'] = df.date.str.replace('year-', '2020-')
    df.set_index('date', inplace=True)
    
    return df

In [12]:
def merge_multiple_dataframes(dfs, main_year='2020', values='new_pages'):
    formattedDfs = []
    for df in dfs:
        year = df.date[0].split('-')[0]
        df['year'] = [year] * len(df.index)
        df['date'] = df.date.str.replace('%s-' % year, '%s-' % main_year)
        formattedDfs.append(df)
    
    res = pd.concat(formattedDfs)
    return res.pivot_table(index='date', columns=['year'], values=values, fill_value=0, aggfunc=sum)

## New survived pages

In [4]:
def get_new_pages_daily(start_ts, end_ts):
    df = spark.run('''
    SELECT
        TO_DATE(ts) AS `date`,
        SUM(edit_count) AS new_pages
    FROM wmf.edit_hourly
    WHERE
            snapshot="{snapshot}"
        AND project="{project}"

        -- we're interested in data from given timeframe
        AND ts >= '{start_ts}'
        AND ts <= '{end_ts}'

        -- exclude known bots
        AND user_is_bot=false

        -- filter for content edits only
        AND namespace_is_content=true
        
        -- and only for new pages...
        AND creates_new_page=true
        
        -- and only for survived pages
        AND is_deleted=false
    
    GROUP BY
        `date`
    ORDER BY `date`
    '''.format(
        snapshot=SNAPSHOT,
        project=PROJECT,
        start_ts=start_ts,
        end_ts=end_ts
    ))
    return df

In [5]:
df = get_new_pages_daily('2020-01-01T00:00:00', '2020-12-31T23:59:59')
df.to_csv('data/new_survived_articles_daily_2020.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,new_pages
0,2020-01-02,124
1,2020-01-03,96
2,2020-01-04,150
3,2020-01-05,157
4,2020-01-06,125


In [7]:
df = get_new_pages_daily('2019-01-01T00:00:00', '2019-12-31T23:59:59')
df.to_csv('data/new_survived_articles_daily_2019.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,new_pages
0,2019-01-02,121
1,2019-01-03,82
2,2019-01-04,119
3,2019-01-05,96
4,2019-01-06,84


In [8]:
df = get_new_pages_daily('2018-01-01T00:00:00', '2018-12-31T23:59:59')
df.to_csv('data/new_survived_articles_daily_2018.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,new_pages
0,2018-01-02,132
1,2018-01-03,181
2,2018-01-04,105
3,2018-01-05,191
4,2018-01-06,252


In [9]:
df = get_new_pages_daily('2017-01-01T00:00:00', '2017-12-31T23:59:59')
df.to_csv('data/new_survived_articles_daily_2017.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,new_pages
0,2017-01-02,150
1,2017-01-03,138
2,2017-01-04,129
3,2017-01-05,165
4,2017-01-06,126


In [11]:
df = merge_multiple_dataframes([pd.read_csv(x, sep='\t') for x in [
    'data/new_survived_articles_daily_2017.tsv',
    'data/new_survived_articles_daily_2018.tsv',
    'data/new_survived_articles_daily_2019.tsv',
    'data/new_survived_articles_daily_2020.tsv'
]], values='new_pages')
df.to_csv('data/new_survived_articles_daily_2016_2020.tsv', sep='\t')
df.head()

year,2017,2018,2019,2020
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-02,150,132,121,124
2020-01-03,138,181,82,96
2020-01-04,129,105,119,150
2020-01-05,165,191,96,157
2020-01-06,126,252,84,125


## Unique page creators

Non-redirect pages created by human users that survived (were not deleted by an admin).

In [None]:
def get_unique_page_creators(start_ts, end_ts):
    return spark.run('''
    SELECT
        TO_DATE(event_timestamp) AS `date`,
        COUNT(DISTINCT coalesce(event_user_id, event_user_text)) AS unique_page_creators
    FROM wmf.mediawiki_history
    WHERE
            snapshot='{snapshot}'

        -- we're interested in dbname's event
        AND wiki_db='{dbname}'

        -- we're interested in new pages created by human users, excluding redirects
        AND event_entity='page'
        AND event_type='create'
        AND user_is_bot_by_historical IS NULL
        AND page_is_redirect=false

        -- in given date
        AND event_timestamp >= '{start_ts}'
        AND event_timestamp <= '{end_ts}'

        -- and only pages that survived
        AND page_is_deleted=false

    GROUP BY `date`
    ORDER BY `date`
    '''.format(
        snapshot=SNAPSHOT,
        dbname=DBNAME,
        start_ts=start_ts,
        end_ts=end_ts
    ))

In [None]:
df = get_unique_page_creators('2020-01-01T00:00:00', '2020-12-31T23:59:59')
df.to_csv('data/unique_page_creators_2020.tsv', sep='\t', index=False)
df.head()

In [None]:
df = get_unique_page_creators('2019-01-01T00:00:00', '2019-12-31T23:59:59')
df.to_csv('data/unique_page_creators_2019.tsv', sep='\t', index=False)
df.head()

In [None]:
df = merge_dataframes('data/unique_page_creators_2019.tsv', 'data/unique_page_creators_2020.tsv')
df.to_csv('data/unique_page_creators_2019_2020.tsv', sep='\t')
df.head()