# Czech Wikipedia in the covid era: New pages

This notebook shows data about pages created at Czech Wikipedia during years 2020 and 2019.

Created by Martin Urbanec, Wikimedia Czech Republic. WIP notebook, do not rely on (yet).

In [1]:
from wmfdata import spark
import pandas as pd

pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Configuration

In [2]:
SNAPSHOT = '2021-05'
PROJECT='cs.wikipedia'
DBNAME = 'cswiki'

## Helper methods

In [4]:
def merge_dataframes(file_2019, file_2020, suffixes=('_2019', '_2020')):
    df_2019 = pd.read_csv(file_2019, sep='\t')
    df_2019['date'] = df_2019.date.str.replace('2019-', 'year-')
    df_2019.set_index('date', inplace=True)

    df_2020 = pd.read_csv(file_2020, sep='\t')
    df_2020['date'] = df_2020.date.str.replace('2020-', 'year-')
    df_2020.set_index('date', inplace=True)

    df = df_2019.merge(df_2020, left_index=True, right_index=True, suffixes=suffixes)
    df.reset_index(inplace=True)
    df['date'] = df.date.str.replace('year-', '2020-')
    df.set_index('date', inplace=True)
    
    return df

## Newly created pages

In [3]:
def get_new_pages_daily(start_ts, end_ts):
    df = spark.run('''
    SELECT
        TO_DATE(ts) AS `date`,
        SUM(edit_count) AS new_pages
    FROM wmf.edit_hourly
    WHERE
            snapshot="{snapshot}"
        AND project="{project}"

        -- we're interested in data from given timeframe
        AND ts >= '{start_ts}'
        AND ts <= '{end_ts}'

        -- exclude known bots
        AND user_is_bot=false

        -- filter for content edits only
        AND namespace_is_content=true
        
        -- and only for new pages...
        AND creates_new_page=true
    
    GROUP BY
        `date`
    ORDER BY `date`
    '''.format(
        snapshot=SNAPSHOT,
        project=PROJECT,
        start_ts=start_ts,
        end_ts=end_ts
    ))
    return df

In [5]:
df = get_new_pages_daily('2020-01-01T00:00:00', '2020-12-31T23:59:59')
df.to_csv('data/new_articles_daily_2020.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,new_pages
0,2020-01-02,147
1,2020-01-03,125
2,2020-01-04,169
3,2020-01-05,193
4,2020-01-06,159


In [6]:
df = get_new_pages_daily('2019-01-01T00:00:00', '2019-12-31T23:59:59')
df.to_csv('data/new_articles_daily_2019.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,new_pages
0,2019-01-02,144
1,2019-01-03,100
2,2019-01-04,129
3,2019-01-05,111
4,2019-01-06,94


In [7]:
df = merge_dataframes('data/new_articles_daily_2019.tsv', 'data/new_articles_daily_2020.tsv')
df.to_csv('data/new_articles_daily_2019_2020.tsv', sep='\t')
df.head()

Unnamed: 0_level_0,new_pages_2019,new_pages_2020
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-02,144,147
2020-01-03,100,125
2020-01-04,129,169
2020-01-05,111,193
2020-01-06,94,159
