# Data-Gathering

## imports

In [None]:
import pandas as pd
import wmfdata as wmf

pd.options.display.max_columns = None

In [None]:
import numpy as np

In [None]:
import warnings
import re

In [None]:
from bs4 import BeautifulSoup

In [None]:
import os

# Drop any proxy settings inherited from the environment, in both the
# upper-case and lower-case spellings. Missing variables are ignored.
# NOTE(review): presumably so that later HTTP requests are sent directly
# rather than through a proxy — confirm.
for proxy_variable in ('HTTP_PROXY', 'HTTPS_PROXY', 'http_proxy', 'https_proxy'):
    os.environ.pop(proxy_variable, None)

In [None]:
import requests

## spark_session

In [None]:
# Look up the currently active Spark session (if any) and stop it; when
# there is none, just report that.
spark_session = wmf.spark.get_active_session()

# Identity comparison is the idiomatic (and equivalent) way to test for None.
if spark_session is not None:
    spark_session.stop()
else:
    print('no active session')

In [None]:
# Spark settings for this notebook's session, kept in one place so they
# can be reviewed separately from the session-creation call.
cws_spark_config = {
    "spark.driver.memory": "4g",
    "spark.dynamicAllocation.maxExecutors": 64,
    "spark.executor.memory": "16g",
    "spark.executor.cores": 4,
    "spark.sql.shuffle.partitions": 256,
    "spark.driver.maxResultSize": "2g",
}

# Create a custom session named 'cws-data' with "yarn" as the master.
spark_session = wmf.spark.create_custom_session(
    master="yarn",
    app_name='cws-data',
    spark_config=cws_spark_config,
)

In [None]:
# Bare expression: shows the session object as this cell's output.
spark_session

In [None]:
# Set the Spark context's log level to "ERROR".
spark_session.sparkContext.setLogLevel("ERROR")

## query

In [None]:

# Bare expression: shows `cws_links` as this cell's output.
# NOTE(review): `cws_links` is not defined in any cell visible above this
# one — confirm it is created before this cell runs, otherwise this raises
# NameError.
cws_links

In [None]:
# Snapshot label. Nothing in the cells visible here reads it — presumably
# it is consumed by queries defined elsewhere (TODO confirm).
mediawiki_snapshot = '2023-07'

# 2015

In [None]:
# Silence every Python warning from this point on.
warnings.filterwarnings('ignore')

# Template for the 2015 category listing. The `base` CTE selects pages that
# are members of {CATEGORY_TITLE} and sit exactly one level below
# {SURVEY_TITLE}; the outer query then drops rows whose category name
# contains "report", "Result" or "Archive".
# NOTE(review): the titles are interpolated into LIKE patterns verbatim, so
# any `_` or `%` they contain acts as a wildcard — confirm that is acceptable.
categories_2015_query = """
WITH base AS (
    SELECT
        REPLACE(page_title, '{SURVEY_TITLE}/', '') AS category,
        page_title AS category_title
    FROM 
        categorylinks cl
        JOIN page p
        ON cl.cl_from = p.page_id
    WHERE
        cl_to = '{CATEGORY_TITLE}'
        AND cl_type = 'page'
        AND page_title LIKE '{SURVEY_TITLE}/%'
        AND NOT page_title LIKE '{SURVEY_TITLE}/%/%'
)
    
SELECT 
    *
FROM 
    base
WHERE 
    NOT (
        category LIKE '%report%'
        OR category LIKE '%Result%'
        OR category LIKE '%Archive%'
    )
"""

# Fill in the two 2015 titles, then run the finished SQL against 'metawiki'.
filled_query_2015 = categories_2015_query.format(
    CATEGORY_TITLE=get_title(2015, 'category_subpages'),
    SURVEY_TITLE=get_title(2015, 'main_page'),
)
categories_2015 = wmf.mariadb.run(filled_query_2015, 'metawiki')

# Bare expression: shows the result as the cell output.
categories_2015

In [None]:
# API URL and a Host header mapping. Neither name is read in the cells
# visible here — presumably the API helper functions use them (TODO confirm).
# NOTE(review): looks like the Host header selects meta.wikimedia.org on a
# shared endpoint — confirm.
api_endpoint = 'https://api-ro.discovery.wmnet/w/api.php'
host_wiki = {'Host': 'meta.wikimedia.org'}

In [None]:
%%time

# Probe run over the 2015 categories: every category gets an (empty) entry
# in `data`, and only the first wish's section text is fetched.
# NOTE(review): process_wishes is called without `year` here, whereas the
# following cell passes year=2015 — confirm this is intended.
data = {}

category_columns = zip(categories_2015['category'], categories_2015['category_title'])
for category_name, category_title in category_columns:
    wishes = process_wishes(category_title)
    data[category_name] = {}

    # Take the first (index, wish) pair only; skip categories with no wishes.
    first_wish = next(iter(wishes.items()), None)
    if first_wish is not None:
        wish_index, wish = first_wish
        wish_text = get_section_text(category_title, wish_index)

In [None]:
%%time

# Build `data` for 2015 as {category name: {wish title: details}}, where
# details holds the proposers, Phabricator tickets, discussion participants
# and voters extracted for that wish.
data = {}

category_columns = zip(categories_2015['category'], categories_2015['category_title'])
for category_name, category_title in category_columns:
    wishes = process_wishes(category_title, year=2015)

    # Register the category before its wishes are processed, so it is
    # present in `data` even when it has no wishes.
    category_data = data[category_name] = {}

    for wish_index, wish in wishes.items():
        # Wikitext of the wish's section, pulled out of the parse response.
        section = get_section_text(category_title, wish_index)
        wish_text = section['parse']['wikitext']['*']
        votes = get_voters_api(category_title, wish)

        proposal, discussion = split_proposal_parts(wish_text)

        category_data[wish['title']] = {
            'proposers': extract_usernames(proposal),
            'phab_tickets': extract_phab_tickets(proposal),
            'discussion_participants': extract_usernames(discussion),
            'voters': usernames_from_parser(votes),
        }

# 2016

In [None]:
# Silence every Python warning from this point on.
warnings.filterwarnings('ignore')

# Template for the 2016 category listing: pages that are members of
# {CATEGORY_TITLE} and whose title starts with '{SURVEY_TITLE}/Categories/'.
# That prefix is stripped to give the short category name.
# NOTE(review): the title is interpolated into a LIKE pattern verbatim, so
# any `_` or `%` it contains acts as a wildcard — confirm that is acceptable.
categories_2016_query = """
SELECT
    REPLACE(page_title, '{SURVEY_TITLE}/Categories/', '') AS category,
    page_title AS category_title
FROM
    categorylinks cl
    JOIN page p
    ON cl.cl_from = p.page_id
WHERE
    cl_to = '{CATEGORY_TITLE}' 
    AND page_title LIKE '{SURVEY_TITLE}/Categories/%'
"""

# Fill in the two 2016 titles, then run the finished SQL against 'metawiki'.
filled_query_2016 = categories_2016_query.format(
    CATEGORY_TITLE=get_title(2016, 'category_subpages'),
    SURVEY_TITLE=get_title(2016, 'main_page'),
)
categories_2016 = wmf.mariadb.run(filled_query_2016, 'metawiki')

# Bare expression: shows the result as the cell output.
categories_2016