# imports

In [1]:
import numpy as np
import pandas as pd
import wmfdata as wmf

pd.options.display.max_columns = None

import re
import requests
import time

import os
import json
import warnings
import importlib


# Remove any proxy settings (upper- and lower-case spellings) from this
# process's environment; variables that are not set are silently skipped.
for _proxy_var in ('HTTP_PROXY', 'HTTPS_PROXY', 'http_proxy', 'https_proxy'):
    os.environ.pop(_proxy_var, None)

import data_gathering_functions as dg




You are using Wmfdata v2.0.0, but v2.0.1 is available.

To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release`.

To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md.


In [2]:
def rdg():
    """Reload the ``dg`` helper module (imported from data_gathering_functions)."""
    importlib.reload(dg)
def ig_warn():
    """Install a global warnings filter that ignores every warning."""
    warnings.filterwarnings(action='ignore')

In [3]:
# Re-import the data_gathering_functions module (dg) via importlib.reload.
rdg()

# 2015 & 2016

In [4]:
# Silence all warnings from here on.
ig_warn()

# SQL template that lists the category subpages of the 2015 or 2016 survey.
# Placeholders (filled with str.format before the query is run):
#   {YEAR}           - 2015 or 2016; selects the title pattern and the prefix to strip
#   {SURVEY_TITLE}   - title prefix shared by the survey's pages
#   {CATEGORY_TITLE} - category (cl_to) the pages must be linked to
# 2015: keeps only direct subpages '{SURVEY_TITLE}/<category>' (no deeper nesting).
# 2016: keeps pages under '{SURVEY_TITLE}/Categories/'.
# Any other {YEAR} value matches no rows.
# The outer SELECT drops rows whose stripped category name contains
# 'report', 'Result', 'Archive' or 'Translations'.
# NOTE(review): the values are interpolated straight into the SQL text, so they
# must come from trusted code, never from user input.
categories_201516_query = """
WITH base AS (
    SELECT
        CASE 
            WHEN {YEAR} = 2015 THEN REPLACE(page_title, '{SURVEY_TITLE}/', '')
            WHEN {YEAR} = 2016 THEN REPLACE(page_title, '{SURVEY_TITLE}/Categories/', '')
        END AS category,
        page_title AS category_title
    FROM
        categorylinks cl
        JOIN page p
        ON cl.cl_from = p.page_id
    WHERE
        cl_to = '{CATEGORY_TITLE}' 
        AND (
            ({YEAR} = 2015 AND page_title LIKE '{SURVEY_TITLE}/%' AND NOT page_title LIKE '{SURVEY_TITLE}/%/%')
            OR
            ({YEAR} = 2016 AND page_title LIKE '{SURVEY_TITLE}/Categories/%')
        )
)

SELECT category, category_title
FROM base
WHERE 
    NOT (
        category LIKE '%report%'
        OR category LIKE '%Result%'
        OR category LIKE '%Archive%'
        OR category LIKE '%Translations%'
    )
"""

In [64]:
# Extracted results, nested as year -> category -> wish/proposal title -> fields.
data = dict()
# Failures, nested as year -> wish/proposal title -> {'category', 'error'}.
error_log = dict()

In [65]:
%%time

rdg()

for year in [2015, 2016]:
    start_time = time.time()
    
    categories = wmf.mariadb.run(categories_201516_query
                                 .format(YEAR=year, 
                                         CATEGORY_TITLE=dg.get_title(year, 'category_subpages'), 
                                         SURVEY_TITLE=dg.get_title(year, 'main_page')), 
                                 'metawiki')
    
    categories.sort_values('category', inplace=True, ignore_index=True)
    
    data[year] = {}
    error_log[year] = {}

    for i in categories.index:
        category_name = categories.loc[i, 'category']
        category_title = categories.loc[i, 'category_title']
        wishes = dg.process_wishes_201516(category_title, year=year)

        data[year][category_name] = {}

        for wish_index, wish in wishes.items():
            try:
                wish_text = dg.get_wikitext(category_title, section_index=wish_index)['parse']['wikitext']['*']
                
                if year == 2015:
                    proposal, discussion = dg.split_proposal_2015(wish_text)
                    proposer = dg.extract_usernames_from_text(proposal)
                    discussion_participants = dg.extract_usernames_from_text(discussion)
                elif year == 2016:
                    proposal = dg.split_sections_l2(wish_text)[0]
                    proposer = dg.extract_proposer_username(proposal)
                    discussion_participants = dg.extract_usernames_from_parser(dg.parse_iwlinks(category_title, section_index=wish['discussion_index']))

                phab_tickets = dg.extract_phab_tickets(proposal)
                voters = dg.extract_usernames_from_parser(dg.parse_iwlinks(category_title, section_index=wish['votes_index']))

                data[year][category_name][wish['title']] = {
                    'proposer': proposer,
                    'phab_tickets': phab_tickets,
                    'discussion_participants': discussion_participants,
                    'voters': voters
                }
                
            except Exception as e:
                error_log[year][wish['title']] = {
                    'category': category_name,
                    'error': repr(e)
                }
    
    end_time = time.time()
    elapsed_time = round((end_time - start_time)/60, 2)
    print(f"{year} data was extracted in {elapsed_time} minutes")

2015 data was extracted in 0.65 minutes
2016 data was extracted in 1.9 minutes
CPU times: user 22.1 s, sys: 761 ms, total: 22.9 s
Wall time: 2min 33s


In [66]:
# Write the current contents of `data` to disk. json.dump serialises the
# integer year keys as JSON strings.
with open("data/01-cws_proposals_data.json", "w") as proposals_file:
    json.dump(data, proposals_file)

# 2017 & beyond

In [69]:
%%time

ig_warn()
rdg()

for year in range(2017, 2023+1):
    
    start_time = time.time()
    
    if year == 2018:
        continue
    
    data[year] = {}
    error_log[year] = {}
    
    categories = dg.get_categories_std(dg.get_title(year, 'category_subpages'), dg.get_title(year, 'main_page'), year)
    
    for i in categories.index:
        category = categories.loc[i, 'category']
        category_title = categories.loc[i, 'category_title']
        proposals_list = dg.extract_proposals(dg.get_wikitext(category_title)['parse']['wikitext']['*'])
        
        if category != 'Archive':
            data[year][category] = {}

            for proposal in proposals_list:
                proposal_name = proposal.replace(category_title.replace('_', ' '), '').strip('/')
                
                try:
                    if year >= 2022:
                        if (year == 2023) & (category == 'Larger_suggestions'):
                            page_wikitext = dg.get_wikitext(proposal)['parse']['wikitext']['*']
                        else:
                            page_wikitext = dg.get_wikitext(f'{proposal}/Proposal')['parse']['wikitext']['*']
                    else:
                        page_wikitext = dg.get_wikitext(proposal)['parse']['wikitext']['*']
                    
                    page_sections = dg.parse_page_sections(proposal)['parse']['sections']

                    
                    data[year][category][proposal_name] = {
                        'phab_tickets': dg.extract_phab_tickets(page_wikitext),
                        'discussion_participants': dg.extract_usernames_from_parser(dg.parse_iwlinks(proposal, section_index=dg.get_section_index('discussion', page_sections))),
                        'voters': dg.extract_usernames_from_parser(dg.parse_iwlinks(proposal, section_index=dg.get_section_index('voting', page_sections)))
                    }
                    
                    if year >= 2022:
                        if (year == 2023) & (category == 'Larger_suggestions'):
                            data[year][category][proposal_name]['proposer'] = dg.extract_proposer_username(page_wikitext)
                        else:
                            data[year][category][proposal_name]['proposer'] = dg.extract_usernames_from_parser(dg.parse_iwlinks(f'{proposal}/Proposal'))[0]
                    else:
                        data[year][category][proposal_name]['proposer'] = dg.extract_proposer_username(page_wikitext)
                        
                    
                except Exception as e:
                    error_log[year][proposal_name] = {
                        'category': category,
                        'error': e
                    }
        
        elif category == 'Archive':
            for proposal in proposals_list:
                try:
                    if year == 2017:
                        proposal = dg.get_redirect_target(proposal)
                        
                    proposal_initial_title = dg.get_ar_category(proposal, year)

                    initial_category = proposal_initial_title.split('/')[1]
                    proposal_name = proposal_initial_title.split('/')[2]

                    page_wikitext = dg.get_wikitext(proposal)['parse']['wikitext']['*']
                        
                    page_sections = dg.parse_page_sections(proposal)['parse']['sections']
                    
                    try:
                        data[year][initial_category]
                    except:
                        data[year][initial_category] = {}
                
                    data[year][initial_category][proposal_name] = {
                        'proposer': dg.extract_proposer_username(page_wikitext),
                        'phab_tickets': dg.extract_phab_tickets(page_wikitext),
                        'discussion_participants': dg.extract_usernames_from_parser(dg.parse_iwlinks(proposal, section_index=dg.get_section_index('discussion', page_sections))),
                    }

                    try:
                        data[year][initial_category][proposal_name]['reject_reason'] = dg.extract_reject_reason(page_wikitext)
                    except Exception as e:
                        data[year][initial_category][proposal_name]['reject_reason'] = 'archived_no_reason'
                        
                except Exception as e:
                    try:
                        error_log[year][proposal.split('/')[2]] = {
                            'category': category,
                            'error': repr(e)
                        }
                    except:
                        error_log[year][proposal] = {
                            'category': category,
                            'error': 'error_logging_failed'
                        }
                    
    end_time = time.time()
    elapsed_time = round((end_time - start_time)/60, 2)
    print(f"{year} data was extracted in {elapsed_time} minutes")

2017 data was extracted in 2.66 minutes
2019 data was extracted in 2.69 minutes
2020 data was extracted in 0.91 minutes
2021 data was extracted in 3.49 minutes
2022 data was extracted in 3.92 minutes
2023 data was extracted in 2.28 minutes
CPU times: user 2min 51s, sys: 5.11 s, total: 2min 56s
Wall time: 15min 57s


In [None]:
# Persist the collected proposals data.
with open("data/01-cws_proposals_data.json", "w") as proposals_file:
    json.dump(data, proposals_file)

# Run the error log through the helper's string conversion (presumably so that
# it is JSON-serialisable), then persist it as well.
error_log = dg.convert_errors_to_strings(error_log)
with open("data/02-cws_proposals_error_log.json", "w") as errors_file:
    json.dump(error_log, errors_file)