# Data-Gathering

## imports

In [None]:
import pandas as pd
import wmfdata as wmf

pd.options.display.max_columns = None

import numpy as np

import warnings
import re

import os
import requests
# Remove proxy settings (both upper- and lower-case spellings) from this
# process's environment; a missing variable is silently ignored.
for _proxy_var in ('HTTP_PROXY', 'HTTPS_PROXY', 'http_proxy', 'https_proxy'):
    os.environ.pop(_proxy_var, None)

In [None]:
# Snapshot identifier in YYYY-MM form.
# NOTE(review): not referenced by any cell visible here — presumably selects
# a monthly data-lake snapshot in later queries; confirm before changing.
mediawiki_snapshot = '2023-08'

## spark_session

In [None]:
# Stop any Spark session that is still active so the next cell can create
# one with its own configuration; otherwise just report that none exists.
spark_session = wmf.spark.get_active_session()

# Compare against None by identity rather than by comparing types.
if spark_session is not None:
    spark_session.stop()
else:
    print('no active session')

In [None]:
# Create the Spark session used by this notebook: a YARN-managed application
# named 'cws-data' with explicit driver/executor sizing.
spark_session = wmf.spark.create_custom_session(
    master="yarn",
    app_name='cws-data',
    spark_config={
        # Driver-side limits: heap size and the cap on collected results.
        "spark.driver.memory": "4g",
        # Upper bound on executors under dynamic allocation.
        "spark.dynamicAllocation.maxExecutors": 64,
        # Per-executor resources.
        "spark.executor.memory": "16g",
        "spark.executor.cores": 4,
        # Number of partitions used for SQL shuffles.
        "spark.sql.shuffle.partitions": 256,
        "spark.driver.maxResultSize": "2g"
        
    }
)

In [None]:
# Bare expression: shows the session object as the cell's output.
spark_session

In [None]:
# Restrict Spark logging to ERROR-level messages to keep cell output readable.
spark_session.sparkContext.setLogLevel("ERROR")

In [None]:
# MediaWiki action-API endpoint and the Host header naming the target wiki.
# NOTE(review): neither name is used in the visible cells — presumably the
# API helper functions (get_wikitext, get_iwlinks, ...) send `host_wiki` as
# request headers to `api_endpoint`; confirm against their definitions.
api_endpoint = 'https://api-ro.discovery.wmnet/w/api.php'
host_wiki = {'Host': 'meta.wikimedia.org'}

## query

In [None]:
# Load the tab-separated table of survey page links from the working
# directory. get_title() filters it on a `year` column and reads page-title
# columns such as `archive_page` from it.
cws_links = pd.read_csv('cws_page_links.tsv', sep='\t')
# Bare expression: shows the table as the cell's output.
cws_links

In [None]:
def get_title(year, page, replace_space=True, df=cws_links):
    """Return the title stored in column ``page`` for the given survey year.

    Parameters:
        year: value matched against the ``year`` column of ``df``.
        page: name of the column holding the wanted page title.
        replace_space: when true, spaces in the title are replaced with
            underscores; when false the title is returned as stored.
        df: links table to search; defaults to ``cws_links`` as it was bound
            when this function was defined.

    Returns:
        The title from the first row whose ``year`` matches.

    Raises:
        IndexError: if no row matches ``year``.
    """
    matching_titles = df.query("year == @year")[page].values
    raw_title = matching_titles[0]

    if not replace_space:
        return raw_title
    return raw_title.replace(' ', '_')

In [None]:
def get_cateogries(category_subpages, survey_title):
    """List the survey's category subpages from the metawiki database.

    Selects pages that belong to the category ``category_subpages`` and whose
    title starts with ``survey_title`` followed by ``/``. Each result row has
    ``category`` (the title with the ``survey_title/`` text removed) and
    ``category_title`` (the full page title), ordered by ``category_title``.

    Both arguments are interpolated directly into the SQL text, so they must
    come from a trusted source.

    Returns whatever ``wmf.mariadb.run`` returns for the query.
    (The misspelled function name is kept because callers use it.)
    """
    sql_template = """
    SELECT
        REPLACE(page_title, '{SURVEY_TITLE}/', '') AS category,
        page_title AS category_title
    FROM
        categorylinks cl
        JOIN page p
        ON cl.cl_from = p.page_id
    WHERE
        cl_to = '{CATEGORY_TITLE}' 
        AND page_title LIKE '{SURVEY_TITLE}/%'
    ORDER BY
        category_title
    """

    sql = sql_template.format(
        SURVEY_TITLE=survey_title,
        CATEGORY_TITLE=category_subpages,
    )
    return wmf.mariadb.run(sql, dbs='metawiki')

In [None]:
# Fetch the category subpages of the survey from metawiki.
# NOTE(review): `category_subpages` and `survey_title` are not assigned in
# any cell visible here — they must be defined before this cell runs.
proposal_categories = get_cateogries(category_subpages, survey_title)
# Bare expression: shows the result as the cell's output.
proposal_categories

In [None]:
# One-row frame describing the archive page, with the same two columns as
# `proposal_categories` so the two can be concatenated.
# NOTE(review): `year` is not assigned in any cell visible here — it must be
# defined before this cell runs.
archive_category = pd.DataFrame({
    'category': 'Archive',
    'category_title': get_title(year, 'archive_page')
}, index=pd.Index([0]))

In [None]:
# Append the archive row after the queried categories and renumber the index
# from 0. Re-running this cell appends the archive row again.
proposal_categories = pd.concat([proposal_categories, archive_category], ignore_index=True)

In [None]:
# Bare expression: shows the combined category table as the cell's output.
proposal_categories

In [None]:
# Build `data` as {category: {proposal_name: details}} by reading every
# category page, extracting the proposals listed on it, and then reading
# each proposal page.
#
# NOTE: this suppresses every warning for the rest of the session, not only
# those raised inside the loop below.
warnings.filterwarnings('ignore')
data = {}

# Walk all category rows. (A leftover `i = 15` override previously forced
# every iteration onto the same row.)
for category, category_title in zip(
    proposal_categories['category'],
    proposal_categories['category_title'],
):
    proposals_list = extract_proposals(get_wikitext(category_title)['parse']['wikitext']['*'])

    # Create the per-category dict up front; assigning into data[category]
    # without it raised KeyError on the first proposal.
    category_data = data.setdefault(category, {})

    # Proposal titles use spaces where the category title uses underscores;
    # this prefix is the same for every proposal of the category.
    title_prefix = category_title.replace('_', ' ')

    for proposal in proposals_list:
        # Strip the category-page title and surrounding slashes so only the
        # proposal's own name remains.
        proposal_name = proposal.replace(title_prefix, '').strip('/')

        page_wikitext = get_wikitext(proposal)['parse']['wikitext']['*']
        # NOTE(review): `page_sections` is not passed to anything below; it
        # is a module-level name here, so get_section_index() presumably
        # reads it as a global — confirm before removing this call.
        page_sections = get_page_sections(proposal)['parse']['sections']

        category_data[proposal_name] = {
            'proposer': extract_proposer(page_wikitext),
            'phab_tickets': extract_phab_tickets(page_wikitext),
            'discussion_participants': usernames_from_parser(get_iwlinks(proposal, get_section_index('discussion'))),
            'voters': usernames_from_parser(get_iwlinks(proposal, get_section_index('voting')))
        }
        
    