# Czech Wikipedia in the covid era: Pageviews

This notebook shows data about pageviews for Czech Wikipedia pages during years 2020 and 2019.

Created by Martin Urbanec, Wikimedia Czech Republic. WIP notebook, do not rely on (yet).

In [2]:
from wmfdata import spark
import pandas as pd

# Lift every pandas display limit (value None) so rendered frames are never
# truncated, neither in cell width nor in the number of rows/columns shown.
for _display_option in (
    'display.max_colwidth',
    'display.width',
    'display.max_rows',
    'display.max_columns',
):
    pd.set_option(_display_option, None)
del _display_option  # keep the notebook namespace free of the loop variable

## Configuration

In [6]:
# Snapshot identifier -- not referenced by the cells visible here; presumably
# selects a data-lake snapshot further down (TODO confirm).
SNAPSHOT = '2021-05'

# Value matched against the `project` column of wmf.projectview_hourly and
# wmf.pageview_hourly in the queries below.
PROJECT = 'cs.wikipedia'

# Database name of the wiki -- not referenced by the cells visible here.
DBNAME = 'cswiki'

## Helper methods

In [5]:
def merge_dataframes(file_2019, file_2020, suffixes=('_2019', '_2020')):
    """Join a 2019 and a 2020 tab-separated file side by side on calendar day.

    Both files must contain a ``date`` column of ``YYYY-MM-DD`` strings. The
    year prefix of each date is replaced by a shared placeholder so that the
    same day of both years lands on one row; the remaining columns get
    ``suffixes`` appended where their names clash.

    The merge is pandas' default inner join, so a day that exists in only one
    of the files (e.g. 29 February 2020) does not appear in the result.

    Returns a DataFrame indexed by ``date``, labelled with 2020 dates.
    """
    def _load_year(source, year_prefix):
        # Read one year's file and index it by its year-neutral date.
        frame = pd.read_csv(source, sep='\t')
        frame['date'] = frame['date'].str.replace(year_prefix, 'year-')
        return frame.set_index('date')

    merged = _load_year(file_2019, '2019-').merge(
        _load_year(file_2020, '2020-'),
        left_index=True,
        right_index=True,
        suffixes=suffixes,
    ).reset_index()

    # Turn the placeholder back into real dates, using 2020 as the label year.
    merged['date'] = merged['date'].str.replace('year-', '2020-')
    return merged.set_index('date')

## Total pageviews at Czech Wikipedia
Daily pageviews of all Czech Wikipedia pages during 2020 and 2019.

In [17]:
def get_views_per_year(year):
    """Return the daily view totals of ``PROJECT`` for one calendar year.

    Sums ``view_count`` from ``wmf.projectview_hourly`` per day. The query
    filters on year and project only -- no ``agent_type`` condition is
    applied, so every agent type present in the table is counted.

    The result of ``spark.run`` is returned unchanged; it has the columns
    ``date`` (zero-padded ``YYYY-MM-DD`` string) and ``views``, ordered by date.
    """
    query_template = '''
    SELECT
        CONCAT(year, '-', LPAD(month, 2, '0'), '-', LPAD(day, 2, '0')) AS `date`,
        SUM(view_count) AS views
    FROM wmf.projectview_hourly
    WHERE
            year={year}
        AND project="{project}"
    GROUP BY `date`
    ORDER BY `date`
    '''
    daily_views_query = query_template.format(year=year, project=PROJECT)
    return spark.run(daily_views_query)

In [18]:
# Daily 2020 totals: run the Spark query, persist the result as TSV
# (without the positional index), and preview the first rows.
df = get_views_per_year(2020)
df.to_csv('data/cswiki_pageviews_2020.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,views
0,2020-01-01,3586942
1,2020-01-02,3534903
2,2020-01-03,3483584
3,2020-01-04,3783583
4,2020-01-05,4026342


In [19]:
# Daily 2019 totals: same query and file layout as for 2020, so that the two
# TSV files can be merged by calendar day afterwards.
df = get_views_per_year(2019)
df.to_csv('data/cswiki_pageviews_2019.tsv', sep='\t', index=False)
df.head()

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,date,views
0,2019-01-01,3338366
1,2019-01-02,3454958
2,2019-01-03,3420861
3,2019-01-04,3221445
4,2019-01-05,3802748


In [20]:
# Put the 2019 and 2020 daily totals side by side, one row per calendar day
# (rows are labelled with 2020 dates). The index is written to the TSV here,
# because it carries the `date` column.
df = merge_dataframes('data/cswiki_pageviews_2019.tsv', 'data/cswiki_pageviews_2020.tsv')
df.to_csv('data/cswiki_pageviews_2019_2020.tsv', sep='\t')
df.head()

Unnamed: 0_level_0,views_2019,views_2020
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01,3338366,3586942
2020-01-02,3454958,3534903
2020-01-03,3420861,3483584
2020-01-04,3221445,3783583
2020-01-05,3802748,4026342


## Pageviews difference -- helper methods

This section has helper methods to calculate pageview differences below.

In [19]:
def calculate_pageviews_difference(month, day, include_new_pages=False):
    """Return the 20 pages whose views on one calendar day grew most from 2019 to 2020.

    Rebuilds the scratch table ``urbanecm.blogpost_covid_and_cswiki_pageviews``
    (dropped and recreated on every call) with the per-page user pageviews of
    ``PROJECT`` for the given day in 2019 and in 2020, then joins both years on
    month, day and page title and ranks pages by ``views_2020 - views_2019``.
    The page ``Venuše_(planeta)`` is excluded as a known false positive.

    Parameters
    ----------
    month, day : int
        Calendar month and day of month, interpolated into the SQL filter.
    include_new_pages : bool, default False
        With the default, the two years are inner-joined, so a page without any
        2019 row for that day (for example an article created in 2020) is
        silently left out of the ranking. Pass True to keep such pages and
        count their 2019 views as 0.

    Returns
    -------
    The result of ``spark.run`` for the ranking query, with the columns
    ``month``, ``day``, ``page_title``, ``views_2020``, ``views_2019`` and
    ``views_difference``; at most 20 rows, largest difference first.
    """
    table = 'urbanecm.blogpost_covid_and_cswiki_pageviews'

    # INSERT ... SELECT fills the table by column position, so the alias of the
    # aggregated column does not have to match the table's `views` column.
    fill_query_template = '''
    INSERT INTO {table}

    SELECT
        year,
        month,
        day,
        page_title,
        SUM(view_count) AS views_{year}
    FROM wmf.pageview_hourly
    WHERE
            year = {year}
        AND month = {month}
        AND day = {day}
        AND project = '{project}'
        
        -- user traffic only
        AND agent_type='user'
        
        -- exclude false positive
        AND page_title != 'Venuše_(planeta)'
    GROUP BY
        year,
        month,
        day,
        page_title
    '''

    queries = [
        'DROP TABLE IF EXISTS {table}'.format(table=table),
        '''
        CREATE TABLE {table}(
            year bigint,
            month bigint,
            day bigint,
            page_title string,
            views bigint
        )
        '''.format(table=table),
    ]
    queries.extend(
        fill_query_template.format(
            table=table, project=PROJECT, year=year, month=month, day=day
        )
        for year in (2019, 2020)
    )
    # Per-day totals of what was just inserted; the result is not used by this
    # function (kept from the original as the final statement of the batch).
    queries.append(
        '''
        SELECT
            year,
            month,
            day,
            SUM(views)
        FROM {table}
        GROUP BY
            year,
            month,
            day
        '''.format(table=table)
    )

    # fill temp pageviews table
    spark.run(queries)

    # An inner join keeps only pages seen in both years; a left join from the
    # 2020 side also keeps pages that have no 2019 row.
    join_type = 'LEFT JOIN' if include_new_pages else 'JOIN'

    # return data -- COALESCE only has an effect for the left join, where a
    # missing 2019 row would otherwise yield NULL views and a NULL difference.
    return spark.run('''
    WITH pv_2020 AS (
        SELECT
            month, day, page_title, views AS views_2020
        FROM {table}
        WHERE
            year=2020
    ),
    pv_2019 AS (
        SELECT
            month, day, page_title, views AS views_2019
        FROM {table}
        WHERE
            year=2019
    )

    SELECT
        pv_2020.month,
        pv_2020.day,
        pv_2020.page_title,
        pv_2020.views_2020,
        COALESCE(pv_2019.views_2019, 0) AS views_2019,
        pv_2020.views_2020 - COALESCE(pv_2019.views_2019, 0) AS views_difference
    FROM pv_2020
    {join_type} pv_2019 ON
        pv_2020.month=pv_2019.month
        AND pv_2020.day=pv_2019.day
        AND pv_2020.page_title=pv_2019.page_title
    ORDER BY views_difference DESC
    LIMIT 20
    '''.format(table=table, join_type=join_type))

In [20]:
def calculate_pageviews_difference_month(month, include_new_pages=False):
    """Return the 20 pages whose views in one month grew most from 2019 to 2020.

    Rebuilds the scratch table ``urbanecm.blogpost_covid_and_cswiki_pageviews``
    (dropped and recreated on every call) with the per-page user pageviews of
    ``PROJECT`` for the given month in 2019 and in 2020, then joins both years
    on month and page title and ranks pages by ``views_2020 - views_2019``.
    The page ``Venuše_(planeta)`` is excluded as a known false positive.

    Parameters
    ----------
    month : int
        Calendar month, interpolated into the SQL filter.
    include_new_pages : bool, default False
        With the default, the two years are inner-joined, so a page without any
        2019 row for that month (for example an article created in 2020) is
        silently left out of the ranking. Pass True to keep such pages and
        count their 2019 views as 0.

    Returns
    -------
    The result of ``spark.run`` for the ranking query, with the columns
    ``month``, ``page_title``, ``views_2020``, ``views_2019`` and
    ``views_difference``; at most 20 rows, largest difference first.
    """
    table = 'urbanecm.blogpost_covid_and_cswiki_pageviews'

    # INSERT ... SELECT fills the table by column position, so the alias of the
    # aggregated column does not have to match the table's `views` column.
    fill_query_template = '''
    INSERT INTO {table}

    SELECT
        year,
        month,
        page_title,
        SUM(view_count) AS views_{year}
    FROM wmf.pageview_hourly
    WHERE
            year = {year}
        AND month = {month}
        AND project = '{project}'
        
        -- user traffic only
        AND agent_type='user'
        
        -- exclude false positive articles
        AND page_title != 'Venuše_(planeta)'
    GROUP BY
        year,
        month,
        page_title
    '''

    queries = [
        'DROP TABLE IF EXISTS {table}'.format(table=table),
        '''
        CREATE TABLE {table}(
            year bigint,
            month bigint,
            page_title string,
            views bigint
        )
        '''.format(table=table),
    ]
    queries.extend(
        fill_query_template.format(
            table=table, project=PROJECT, year=year, month=month
        )
        for year in (2019, 2020)
    )
    # Per-month totals of what was just inserted; the result is not used by
    # this function (kept from the original as the final statement of the batch).
    queries.append(
        '''
        SELECT
            year,
            month,
            SUM(views)
        FROM {table}
        GROUP BY
            year,
            month
        '''.format(table=table)
    )

    # fill temp pageviews table
    spark.run(queries)

    # An inner join keeps only pages seen in both years; a left join from the
    # 2020 side also keeps pages that have no 2019 row.
    join_type = 'LEFT JOIN' if include_new_pages else 'JOIN'

    # return data -- COALESCE only has an effect for the left join, where a
    # missing 2019 row would otherwise yield NULL views and a NULL difference.
    return spark.run('''
    WITH pv_2020 AS (
        SELECT
            month, page_title, views AS views_2020
        FROM {table}
        WHERE
            year=2020
    ),
    pv_2019 AS (
        SELECT
            month, page_title, views AS views_2019
        FROM {table}
        WHERE
            year=2019
    )

    SELECT
        pv_2020.month,
        pv_2020.page_title,
        pv_2020.views_2020,
        COALESCE(pv_2019.views_2019, 0) AS views_2019,
        pv_2020.views_2020 - COALESCE(pv_2019.views_2019, 0) AS views_difference
    FROM pv_2020
    {join_type} pv_2019 ON
        pv_2020.month=pv_2019.month
        AND pv_2020.page_title=pv_2019.page_title
    ORDER BY views_difference DESC
    LIMIT 20
    '''.format(table=table, join_type=join_type))

## Pageviews -- biggest difference

In this section, I at months with higher projectviews in 2020 than in 2019 in more details, to figure out if the pandemic is the cause of the increased page views. For each of the months, I calculate pageviews of all articles in 2020 and 2019 (for that month only) and show 20 with the top difference.

An alternative could be to use [Topviews](https://pageviews.toolforge.org/topviews), but that tool looks at top visited articles in any given month, without any regard to the previous year(s).

### March

In [16]:
# March: pages with the largest 2019 -> 2020 increase in monthly views
# (the helper returns at most 20 rows); saved as TSV and displayed in full.
df = calculate_pageviews_difference_month(month=3)
df.to_csv('data/cswiki_pageviews_difference_2019_2020_March.tsv', sep='\t', index=False)
df

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,month,page_title,views_2020,views_2019,views_difference
0,3,Hlavní_strana,3239383,2277621,961762
1,3,Koronavirus,348317,297,348020
2,3,Španělská_chřipka,241003,1888,239115
3,3,Speciální:Hledání,1193172,962198,230974
4,3,Adam_Vojtěch,164863,3259,161604
5,3,Morfologie_květu,162691,1380,161311
6,3,SARS,153297,1640,151657
7,3,Pandemie,124315,732,123583
8,3,Roman_Prymula,99698,277,99421
9,3,Itálie,108232,15568,92664


### April

In [18]:
# April: pages with the largest 2019 -> 2020 increase in monthly views
# (the helper returns at most 20 rows); saved as TSV and displayed in full.
df = calculate_pageviews_difference_month(month=4)
df.to_csv('data/cswiki_pageviews_difference_2019_2020_April.tsv', sep='\t', index=False)
df

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,month,page_title,views_2020,views_2019,views_difference
0,4,Hlavní_strana,2978429,1806557,1171872
1,4,Speciální:Hledání,1249382,887297,362085
2,4,-,769123,620484,148639
3,4,Emanuel_Moravec,135808,2325,133483
4,4,Adam_Vojtěch,73634,1775,71859
5,4,Španělská_chřipka,65865,1221,64644
6,4,Koronavirus,54880,272,54608
7,4,Kim_Čong-un,52369,4186,48183
8,4,Spojené_státy_americké,71005,24343,46662
9,4,Seznam_států_světa_podle_počtu_obyvatel,58777,12624,46153


### July

In [17]:
# July: pages with the largest 2019 -> 2020 increase in monthly views
# (the helper returns at most 20 rows); saved as TSV and displayed in full.
df = calculate_pageviews_difference_month(month=7)
df.to_csv('data/cswiki_pageviews_difference_2019_2020_July.tsv', sep='\t', index=False)
df

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


Unnamed: 0,month,page_title,views_2020,views_2019,views_difference
0,7,Hlavní_strana,1772466,1639013,133453
1,7,Kateřina_II._Veliká,106685,2347,104338
2,7,Česká_příjmení,50829,412,50417
3,7,Mistrovství_Evropy_ve_fotbale_2004,35085,1799,33286
4,7,Miloš_Jakeš,42289,10472,31817
5,7,Jiří_Procházka_(sportovec),29115,1329,27786
6,7,Ludvík_XIV.,30949,3758,27191
7,7,Petr_III._Ruský,27705,812,26893
8,7,Dýmějový_mor,28392,1848,26544
9,7,Jan_Skopeček,27487,2668,24819


### November