In [2]:
import pandas as pd
import duckdb
import json
import warnings

In [3]:
# Scraped Community Wishlist Survey proposals.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# rather than inherited from the platform's locale default.
proposals_data_path = 'data/01-cws_proposals_data.json'
with open(proposals_data_path, encoding='utf-8') as proposals_file:
    proposals = json.load(proposals_file)

# Per-proposal errors recorded for the proposals data.
error_log_path = 'data/02-cws_proposals_error_log.json'
with open(error_log_path, encoding='utf-8') as error_log_file:
    errors = json.load(error_log_file)

# Tab-separated user attributes; the first column is used as the index.
user_data = pd.read_csv('secrets/cws_user_data_merged.tsv', sep='\t', index_col=0)

# Processing functions

In [4]:
def process_proposals(data):
    """Flatten the nested ``year -> category -> proposals`` mapping.

    Args:
        data: Mapping of year to a mapping of category to an iterable of
            proposals (iterating a dict yields its keys).

    Returns:
        pandas.DataFrame with one row per proposal and the columns
        ``year``, ``category`` and ``proposal``.
    """
    records = [
        {'year': year, 'category': category, 'proposal': name}
        for year, by_category in data.items()
        for category, entries in by_category.items()
        for name in entries
    ]
    return pd.DataFrame(records)

def process_participant_roles(data):
    """Build a long table of who took part in each proposal, and in which role.

    For every proposal one entry is collected per role (proposer, discussant,
    voter). List-valued entries are then exploded into one row per element;
    a role that is absent from a proposal's details (or has an empty list)
    yields a single row whose ``username`` is missing.

    Args:
        data: Mapping of year -> category -> proposal name -> details dict.
            The details dict is read for the keys ``proposer``,
            ``discussion_participants`` and ``voters``.

    Returns:
        pandas.DataFrame with the columns ``year``, ``wish``, ``role`` and
        ``username`` and a fresh 0..n-1 index. Empty input returns an empty
        frame that still carries these columns.
    """
    # Key in the source data -> singular role label stored in the table.
    roles_mapping = {
        'proposer': 'proposer',
        'discussion_participants': 'discussant',
        'voters': 'voter'
    }

    rows = [
        {
            'year': year,
            'wish': proposal,
            'role': role_singular,
            'username': details.get(role_key, []),
        }
        for year, categories in data.items()
        for proposals in categories.values()
        for proposal, details in proposals.items()
        for role_key, role_singular in roles_mapping.items()
    ]

    # Naming the columns explicitly keeps `username` present when no rows were
    # collected; without it, explode() raises KeyError on a column-less frame.
    df = pd.DataFrame(rows, columns=['year', 'wish', 'role', 'username'])

    return df.explode('username', ignore_index=True)

def process_phab_tickets(data):
    """Collect the ``phab_tickets`` entry recorded for every proposal.

    Args:
        data: Mapping of year -> category -> proposal name -> details dict.

    Returns:
        pandas.DataFrame with one row per proposal and the columns ``year``,
        ``proposal`` and ``phab_tickets``; proposals without a
        ``phab_tickets`` key get an empty list.
    """
    records = []

    for year, by_category in data.items():
        # The category name is not part of this table, only its proposals.
        for wishes in by_category.values():
            records.extend(
                {
                    'year': year,
                    'proposal': title,
                    'phab_tickets': info.get('phab_tickets', []),
                }
                for title, info in wishes.items()
            )

    return pd.DataFrame(records)

def process_user_data(df):
    """Fold the flat per-wiki user columns into three dict-valued columns.

    The columns ``hw_edit_count``/``mw_edit_count``, ``hw_edit_bucket``/
    ``mw_bucket`` and ``account_age_months``/``account_age_years`` are
    replaced by ``edit_count`` (keys ``hw``, ``mw``), ``edit_bucket`` (keys
    ``hw``, ``mw``) and ``account_age`` (keys ``months``, ``years``), each
    holding one dict per row.

    The input frame is left untouched, so the function can be re-run on the
    same frame, and a frame with zero rows is handled.

    Args:
        df: pandas.DataFrame containing the six flat columns named above.

    Returns:
        A new pandas.DataFrame with the three dict columns appended and the
        six flat columns removed.

    Raises:
        KeyError: if any of the six flat columns is missing from ``df``.
    """
    # Target column -> {key inside the dict: flat source column}.
    nested_layout = {
        'edit_count': {'hw': 'hw_edit_count', 'mw': 'mw_edit_count'},
        'edit_bucket': {'hw': 'hw_edit_bucket', 'mw': 'mw_bucket'},
        'account_age': {'months': 'account_age_months', 'years': 'account_age_years'},
    }

    nested_columns = {}
    flat_columns = []
    for target, parts in nested_layout.items():
        keys = list(parts)
        sources = [df[source] for source in parts.values()]
        flat_columns.extend(parts.values())
        # Zipping the source columns yields one tuple per row. Sharing the
        # input's index keeps the new column aligned row-for-row, and
        # dtype=object keeps the dicts as-is (also for a zero-row frame).
        nested_columns[target] = pd.Series(
            [dict(zip(keys, values)) for values in zip(*sources)],
            index=df.index,
            dtype=object,
        )

    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    return df.assign(**nested_columns).drop(columns=flat_columns)

def process_rejects(data):
    """List the proposals that carry a rejection reason.

    A proposal is included when its details contain a ``reject_reason`` whose
    value is not equal to ``0``; proposals without that key are skipped.

    Args:
        data: Mapping of year -> category -> proposal name -> details dict.

    Returns:
        pandas.DataFrame with the columns ``year``, ``proposal`` and
        ``reason`` and a 0..n-1 index. When nothing was rejected (or ``data``
        is empty) an empty frame with these columns is returned.
    """
    rows = []

    for year, categories in data.items():
        for proposals in categories.values():
            for proposal, details in proposals.items():
                # Entries that are not dicts carry no reason to read; skip
                # them explicitly instead of hiding every error behind a
                # bare `except`.
                if not isinstance(details, dict):
                    continue

                reason = details.get('reject_reason', 0)
                if reason != 0:
                    rows.append({
                        'year': year,
                        'proposal': proposal,
                        'reason': reason
                    })

    # Explicit columns keep the schema stable even when `rows` is empty.
    return pd.DataFrame(rows, columns=['year', 'proposal', 'reason'])

def process_error_log(data):
    """Flatten the ``year -> proposal -> details`` error log into a table.

    Args:
        data: Mapping of year -> proposal name -> details dict. Every details
            dict must provide the keys ``category`` and ``error``.

    Returns:
        pandas.DataFrame with one row per logged proposal and the columns
        ``year``, ``proposal_name``, ``category`` and ``error``.

    Raises:
        KeyError: if a details dict lacks ``category`` or ``error``.
    """
    records = [
        {
            'year': year,
            'proposal_name': name,
            'category': entry['category'],
            'error': entry['error'],
        }
        for year, failures in data.items()
        for name, entry in failures.items()
    ]

    return pd.DataFrame(records)

# Write to DB

In [24]:
# Open the on-disk DuckDB database that the cells below write their tables to.
conn = duckdb.connect(database='secrets/cws_data.db')

In [12]:
# Canonical category group -> the category names that are folded into it.
_legacy_categories_by_group = {
    'Moderation_tools': [
        'Admins_and_stewards',
        'Admins_and_patrollers',
        'Moderation_and_admin_tools',
    ],
    'Multimedia_and_Commons': [
        'Commons',
        'Multimedia',
    ],
    'Communication_tools': [
        'Notifications,_Watchlists_and_Talk_Pages',
        'Notifications',
        'Talk_pages',
        'Special_pages',
    ],
    'Navigation_tools': [
        'Search_and_Categories',
        'Search',
        'Categories',
    ],
}

# Inverted lookup used with DataFrame.replace: original category -> canonical group.
categories_mapping = {
    legacy_name: group
    for group, legacy_names in _legacy_categories_by_group.items()
    for legacy_name in legacy_names
}

In [13]:
# Flatten the proposals and fold the category names into their canonical groups.
proposals_by_year = process_proposals(proposals).replace({'category': categories_mapping})

# DuckDB reads the `proposals_by_year` DataFrame directly by its variable name.
create_proposals_sql = """
CREATE OR REPLACE TABLE proposals AS 
SELECT
    year::INTEGER AS year,
    category,
    proposal
FROM
    proposals_by_year
"""
conn.execute(create_proposals_sql)

<duckdb.DuckDBPyConnection at 0x7f1091b06f30>

In [11]:
# One row per (proposal, role, username).
participant_roles = process_participant_roles(proposals)

# The `wish` column is exposed as `proposal` in the stored table.
create_roles_sql = """
CREATE OR REPLACE TABLE roles AS 
SELECT
    year::INTEGER AS year,
    wish AS proposal,
    role,
    username
FROM
    participant_roles
"""
conn.execute(create_roles_sql)

<duckdb.DuckDBPyConnection at 0x7f17457481b0>

In [7]:
phab_tickets = process_phab_tickets(proposals)

# The DataFrame shares its name with the target table. DuckDB only falls back
# to looking up a Python variable when no table or view of that name exists,
# so once `phab_tickets` has been written to the database file a re-run of
# this cell would read the stored table instead of the fresh DataFrame.
# Registering the frame under a distinct name makes the source unambiguous.
conn.register('phab_tickets_df', phab_tickets)

conn.execute("""
CREATE OR REPLACE TABLE phab_tickets AS 
SELECT
    year::INTEGER AS year,
    proposal,
    phab_tickets
FROM
    phab_tickets_df
""")

<duckdb.DuckDBPyConnection at 0x7f17457481b0>

In [17]:
# Nest the edit-count, edit-bucket and account-age columns into dict columns.
user_data_processed = process_user_data(user_data)

# `survey_year` is stored as an integer `year`, matching the other tables.
create_user_data_sql = """
CREATE OR REPLACE TABLE user_data AS 
SELECT
    survey_year::INTEGER AS year,
    username,
    home_wiki,
    hw_user_groups,
    edit_count,
    edit_bucket,
    account_age
FROM
    user_data_processed
"""
conn.execute(create_user_data_sql)

<duckdb.DuckDBPyConnection at 0x7f17457481b0>

In [58]:
# Proposals that carry a rejection reason.
rejected_proposals = process_rejects(proposals)

create_rejects_sql = """
CREATE OR REPLACE TABLE rejects AS 
SELECT
    year::INTEGER AS year,
    proposal,
    reason
FROM
    rejected_proposals
"""
conn.execute(create_rejects_sql)

<duckdb.DuckDBPyConnection at 0x7f170cbd9bb0>

In [21]:
# Flatten the error log and apply the same category grouping as for proposals.
error_log = process_error_log(errors).replace({'category': categories_mapping})

# `proposal_name` is stored as `proposal`, matching the other tables.
create_errors_sql = """
CREATE OR REPLACE TABLE errors AS 
SELECT
    year::INTEGER AS year,
    category,
    proposal_name AS proposal,
    error
FROM
    error_log
"""
conn.execute(create_errors_sql)

<duckdb.DuckDBPyConnection at 0x7f10886d00b0>

In [29]:
# Preview the first rows of the wiki comparison snapshot.
# NOTE(review): in the visible notebook `wiki_comparision` is only assigned in
# a later cell, so on a fresh kernel (Restart & Run All) this line raises
# NameError; this preview should be moved below the cell that loads the frame.
wiki_comparision.head()

Unnamed: 0,overall size rank,monthly unique devices,mobile unique devices,monthly pageviews,mobile web pageviews,mobile app pageviews,unique devices per editor,monthly editors,majority mobile editors,monthly active editors,...,content pages,all time content edits,all time edits per content page,script direction,wiki_db,project code,language code,language name,domain name,wiki name
0,1,806906000.0,0.7015,7244414000.0,0.615,0.023,6377.9096,126515.75,0.2669,31491.75,...,6614510,696533897,105.3039,left-to-right,enwiki,wikipedia,en,English,en.wikipedia.org,English Wikipedia
1,2,145458600.0,0.7365,842508000.0,0.6895,0.0074,9270.8329,15689.9167,0.3233,4319.1667,...,1837386,101324297,55.1459,left-to-right,eswiki,wikipedia,es,Spanish,es.wikipedia.org,Spanish Wikipedia
2,3,102381500.0,0.7442,994408300.0,0.6627,0.0101,6680.9011,15324.5,0.3004,5330.25,...,1361481,71951024,52.8476,left-to-right,jawiki,wikipedia,ja,Japanese,ja.wikipedia.org,Japanese Wikipedia
3,4,97089160.0,0.6266,816103700.0,0.5496,0.0491,5272.5254,18414.1667,0.1619,5134.9167,...,2769985,141779143,51.1841,left-to-right,dewiki,wikipedia,de,German,de.wikipedia.org,German Wikipedia
4,5,93493820.0,0.6784,754624100.0,0.5366,0.0156,5045.47,18530.25,0.2023,4933.5,...,2492047,117511241,47.1545,left-to-right,frwiki,wikipedia,fr,French,fr.wikipedia.org,French Wikipedia


In [41]:
# Remote TSV sources: the canonical wiki list and the January 2023 wiki comparison snapshot.
canonical_wikis_url = 'https://raw.githubusercontent.com/wikimedia-research/canonical-data/master/wiki/wikis.tsv'
wiki_comparison_url = 'https://raw.githubusercontent.com/wikimedia-research/wiki-comparison/main/data-collection/snapshots/Jan_2023.tsv'

cd_wikis = pd.read_csv(canonical_wikis_url, sep='\t')

# Rename the join key so it can be referenced in SQL without quoting.
wiki_comparision = pd.read_csv(wiki_comparison_url, sep='\t').rename(columns={'database code': 'wiki_db'})

# Every wiki from the canonical list is kept; the size rank is NULL for wikis
# that have no match in the comparison snapshot (LEFT JOIN).
create_cdw_sql = """
CREATE OR REPLACE TABLE cdw AS 
SELECT
    cd_wikis.database_code AS wiki_db,
    database_group AS db_group,
    language_name AS lang_name,
    english_name AS wiki_name,
    "overall size rank" AS size_rank
FROM
    cd_wikis 
    LEFT JOIN wiki_comparision wc ON cd_wikis.database_code = wc.wiki_db
"""
conn.execute(create_cdw_sql)

<duckdb.DuckDBPyConnection at 0x7f0f0f6facf0>

# Check data types

In [42]:
# Print the column names and types of every table created above.
for table_name in ('proposals', 'roles', 'phab_tickets', 'user_data', 'rejects', 'errors', 'cdw'):
    conn.sql(f"""DESCRIBE {table_name}""").show()

┌─────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│ column_name │ column_type │  null   │   key   │ default │ extra │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ int32 │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ year        │ INTEGER     │ YES     │ NULL    │ NULL    │  NULL │
│ category    │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ proposal    │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
└─────────────┴─────────────┴─────────┴─────────┴─────────┴───────┘

┌─────────────┬─────────────┬─────────┬─────────┬─────────┬───────┐
│ column_name │ column_type │  null   │   key   │ default │ extra │
│   varchar   │   varchar   │ varchar │ varchar │ varchar │ int32 │
├─────────────┼─────────────┼─────────┼─────────┼─────────┼───────┤
│ year        │ INTEGER     │ YES     │ NULL    │ NULL    │  NULL │
│ proposal    │ VARCHAR     │ YES     │ NULL    │ NULL    │  NULL │
│ role        │ VARCHAR     │ YES     │ NULL   

In [23]:
# Close the DuckDB connection now that all tables have been written.
conn.close()