# Step 2: gather user data
For each username, gather:
* Home-wiki
    + wiki with highest number of edits during the prceeding two years
* Edit bucket on home-wiki
* Edit bucket on Meta Wiki
* User rights on home-wiki
* User account age

## Setup

In [None]:
import numpy as np
import pandas as pd
import wmfdata as wmf

pd.options.display.max_columns = None
from IPython.display import clear_output

import warnings
import json
import time




You are using Wmfdata v2.0.0, but v2.0.1 is available.

To update, run `pip install --upgrade git+https://github.com/wikimedia/wmfdata-python.git@release`.

To see the changes, refer to https://github.com/wikimedia/wmfdata-python/blob/release/CHANGELOG.md.


In [2]:
# mediawiki snapshot
snapshot = '2023-09'

In [4]:
spark_session = wmf.spark.get_active_session()

if type(spark_session) == type(None):
    spark_session = wmf.spark.create_custom_session(
        master="yarn",
        app_name='cws-user-data',
        spark_config={
            "spark.driver.memory": "4g",
            "spark.dynamicAllocation.maxExecutors": 64,
            "spark.executor.memory": "16g",
            "spark.executor.cores": 4,
            "spark.sql.shuffle.partitions": 256,
            "spark.driver.maxResultSize": "2g"
        }
    )

spark_session.sparkContext.setLogLevel("ERROR")

clear_output()

spark_session

## query

In [None]:
# manually curated file with information related to varioys iterations of the survey
# such as links to category page, results page, category structure etc.

cws_links = pd.read_csv('data/cws_page_links.tsv', sep='\t')

# load proposals data gathered in step 1
with open('data/01-cws_proposals_data.json') as file:
    cws_data = json.load(file)

In [15]:
user_data_query = """
WITH
    yearly_edits AS (
        SELECT
            event_user_text AS username,
            wiki_db,
            SUM(IF(wiki_db = 'wikidatawiki', 0.1, 1)) AS edit_count,
            MAX(event_timestamp) AS last_edit
        FROM 
            wmf.mediawiki_history
        WHERE 
            snapshot='{HIVE_SNAPSHOT}' 
            AND NOT event_user_is_anonymous
            AND event_type = 'create'
            AND event_entity = 'revision'
            AND event_user_text IN {USERS_LIST}
            AND DATE(event_timestamp) BETWEEN DATE_SUB('{END_OF_DATA}', 365*2) AND DATE('{END_OF_DATA}')
        GROUP BY 
            event_user_text, 
            wiki_db
    ),

    home_wiki_ranked AS (
        SELECT 
            *,
            ROW_NUMBER() OVER (
                PARTITION BY username 
                ORDER BY edit_count DESC, last_edit DESC) AS rank
        FROM yearly_edits
    ),

    -- homewiki is considered as the wiki where the user had highest number of edits during the two preceeding years
    -- due to the granular nature of Wikidata edits, they were considered at the ratio of 10:1
    home_wiki AS (
        SELECT username, wiki_db AS home_wiki
        FROM home_wiki_ranked
        WHERE rank = 1
    ),

    edit_bucket_data AS (
        SELECT
            mwh.revision_id,
            mwh.event_user_text,
            mwh.event_user_revision_count AS edit_count,
            CASE
                WHEN mwh.event_user_revision_count < 100 THEN '0-99'
                WHEN mwh.event_user_revision_count BETWEEN 100 AND 999 THEN '100-999'
                WHEN mwh.event_user_revision_count BETWEEN 1000 AND 4999 THEN '1000-4999'
                ELSE '5000+'
            END AS edit_bucket,
            mwh.event_timestamp,
            ROW_NUMBER() OVER (
                PARTITION BY mwh.event_user_text, mwh.wiki_db 
                ORDER BY mwh.event_timestamp DESC) AS rank,
            CASE 
                WHEN mwh.wiki_db = 'metawiki' THEN NULL 
                ELSE ARRAY_DISTINCT(ARRAY_UNION(mwh.event_user_groups, mwh.event_user_groups_historical)) 
            END AS user_groups,
            mwh.wiki_db,
            hw.home_wiki
        FROM 
            wmf.mediawiki_history mwh
        JOIN 
            home_wiki hw 
            ON mwh.event_user_text = hw.username
        WHERE 
            mwh.snapshot = '{HIVE_SNAPSHOT}'
            AND mwh.event_user_text IN {USERS_LIST}
            AND DATE(mwh.event_timestamp) <= DATE('{END_OF_DATA}')
            AND (mwh.wiki_db = hw.home_wiki 
                OR mwh.wiki_db = 'metawiki')
    ),

    home_wiki_activity AS (
        SELECT 
            * 
        FROM 
            edit_bucket_data
        WHERE 
            wiki_db = home_wiki 
            AND rank = 1
    ),
    
    meta_wiki_activity AS (
        SELECT 
            * 
        FROM 
            edit_bucket_data
        WHERE 
            wiki_db = 'metawiki' 
            AND rank = 1
    )

SELECT
    hw.event_user_text AS username,
    hw.home_wiki,
    hw.edit_count AS hw_edit_count,
    hw.edit_bucket AS hw_edit_bucket,
    hw.user_groups AS hw_user_groups,
    mw.edit_count AS mw_edit_count,
    mw.edit_bucket AS mw_bucket
FROM 
    home_wiki_activity hw
JOIN 
    meta_wiki_activity mw 
    ON hw.event_user_text = mw.event_user_text
"""

guc_query = """
SELECT
    gu_name AS username,
    gu_registration AS reg_ts
FROM
    globaluser
WHERE
    gu_name IN {USERS_LIST}
"""

## Processing 

In [8]:
# categorize users by year

def users_by_year(data):
    
    # input: proposals data as inputs
    # returns dict years as keys and list of usernames as values
    
    users_by_year = {}
    
    for year, categories in data.items():
        
        users = set()
        
        for category in categories.values():
            for proposal in category.values():
                for key in ['proposer', 'discussion_participants', 'voters']:
                    participants = proposal.get(key, [])
                    if participants != None:
                        users.update(participants)
        
        users_by_year[year] = list(users)
        
    return users_by_year

cws_users_by_year = users_by_year(cws_data)

In [47]:
# processes the data for each users as a dataframe
# processing include datatypes conversion, age calculation from timestamps etc.

def get_user_data(survey_year, cws_info=cws_links, usernames=cws_users_by_year, user_data_query=user_data_query, guc_query=guc_query):
    
    data_end = cws_info.query("""year == @survey_year""")['data_end'].values[0]
    participants = wmf.utils.sql_tuple(usernames[str(survey_year)])

    users_data = wmf.spark.run(user_data_query.format(HIVE_SNAPSHOT=snapshot, END_OF_DATA=data_end, USERS_LIST=participants))
    guc_data = wmf.mariadb.run(guc_query.format(USERS_LIST=participants), dbs='centralauth', use_x1=True)
    
    guc_data['reg_ts'] = pd.to_datetime(guc_data.reg_ts)
    guc_data['reg_dt'] = guc_data['reg_ts'].apply(lambda x:x.date())
    
    guc_data['account_age'] = pd.to_datetime(data_end) - pd.to_datetime(guc_data['reg_dt'])
    
    guc_data = guc_data.assign(
        account_age_days=lambda x: x['account_age'].dt.days,
        account_age_months=lambda x: round(x['account_age_days'] / 30, 2),
        account_age_years=lambda x: round(x['account_age_days'] / 365, 2)
    )
    
    users_data = pd.merge(users_data, guc_data[['username', 'account_age_months', 'account_age_years']], how='left', on='username')

    users_data['survey_year'] = survey_year
    
    return users_data

In [53]:
%%time
warnings.filterwarnings('ignore')
users_data = pd.DataFrame()

for year in range(2015, 2023+1):
    if year != 2018:
        
        start_time = time.time()
        
        user_data_yearly = get_user_data(year)
        user_data_yearly.to_csv(f'secrets/cws_user_data_{year}.tsv', sep='\t')
        
        users_data = pd.concat([users_data, user_data_yearly], ignore_index=True)
        users_data.to_csv(f'secrets/cws_user_data_merged.tsv', sep='\t')
        
        end_time = time.time()
        elapsed_time = round((end_time - start_time)/60, 2)
        print(f"{year} data was extracted in {elapsed_time} minutes.")

                                                                                ]]]]

2015 data was extracted in 3.27 minutes.


                                                                                92]]]

2016 data was extracted in 3.11 minutes.


                                                                                56]]]

2017 data was extracted in 2.72 minutes.


                                                                                92]]

2019 data was extracted in 4.09 minutes.


                                                                                92]]]

2020 data was extracted in 4.29 minutes.


                                                                                192]]

2021 data was extracted in 5.6 minutes.


                                                                                 256]]]

2022 data was extracted in 5.49 minutes.


                                                                                192]]]]

2023 data was extracted in 4.49 minutes.
CPU times: user 6.43 s, sys: 1.07 s, total: 7.49 s
Wall time: 33min 3s


In [54]:
users_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9308 entries, 0 to 9307
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   username            9308 non-null   object 
 1   home_wiki           9308 non-null   object 
 2   hw_edit_count       8676 non-null   float64
 3   hw_edit_bucket      9308 non-null   object 
 4   hw_user_groups      8986 non-null   object 
 5   mw_edit_count       8920 non-null   float64
 6   mw_bucket           9308 non-null   object 
 7   account_age_months  9308 non-null   float64
 8   account_age_years   9308 non-null   float64
 9   survey_year         9308 non-null   int64  
dtypes: float64(4), int64(1), object(5)
memory usage: 727.3+ KB
