In addition to the general stratified random sampling process in 2022, we oversample by continent so that we can conduct breakout analyses of contributors to large wikis by geographic region. These samples will be integrated into the larger dataset but will be weighted separately, because they do not meet the criteria for random (and thus representative) selection into the sample. As an example of why this is necessary: even though there is a substantial active community of enwiki editors in India, the enwiki random sample is mostly made up of editors from English-speaking countries (which reflects the actual population of enwiki editors). The random sample can thus inform us broadly about the global enwiki community, while the additional oversample is used to understand whether contributors from particular regions report certain experiences differently.

In [29]:
import wmfdata as wmf
import pandas as pd

# Seed passed as `random_state` when drawing the samples, so the draw is reproducible.
RANDOM_STATE = 55

In [12]:
# Load the sampling frame (reported below as the population).
sampling_frame = pd.read_parquet('secrets/sampling-frame.parquet')
print('population', sampling_frame.shape)

# Stack the users drawn in the initial and the follow-up sample into one table
# and key it by user name.
sampled_users = pd.concat(
    [pd.read_table(path) for path in ('secrets/sampled_users.tsv',
                                      'secrets/followup-sampled-users.tsv')]
).set_index('user_name')
print('sampled users', sampled_users.shape)

population (34144, 5)
sampled users (25792, 5)


In [11]:
# Unsampled users = sampling frame minus every user that was already sampled.
# drop() matches on index labels; sampled_users is indexed by user name, so
# this presumably relies on sampling_frame being indexed by user name as well.
unsampled_users = sampling_frame.drop(index=sampled_users.index)
print('unsampled users', unsampled_users.shape)

unsampled users (8352, 5)


## Unsampled users by continent

In [6]:
def make_sql_tuple(i):
    """Render an iterable of values as a parenthesised, comma-separated list.

    Intended for interpolation into a SQL ``IN (...)`` clause.

    Args:
        i: Any iterable of values (list, tuple, pandas Index, generator, ...).

    Returns:
        A string of the form ``"(elem, elem, ...)"`` where every element is
        rendered with Python's ``repr``. An empty iterable yields ``"()"``.
    """
    # Always materialise into a plain list: this accepts any iterable without
    # an exact-type check and guarantees that repr() produces "[...]", whose
    # brackets are swapped for parentheses below.
    # NOTE(review): quoting and escaping follow Python's repr rules, not SQL's;
    # confirm this is acceptable for the values being interpolated (e.g. names
    # that contain quote characters).
    list_repr = repr(list(i))

    return "(" + list_repr[1:-1] + ")"

# Index labels of the unsampled users, rendered for interpolation into a SQL IN (...) clause.
sql_usernames = make_sql_tuple(unsampled_users.index)

In [None]:
# Geolocation query.
#   * The `users` CTE restricts wmf_raw.mediawiki_user (snapshot 2022-06) to the
#     unsampled user names that have more than 10 edits.
#   * The outer SELECT joins those users to wmf.editors_daily on user id and
#     wiki, keeping non-anonymous, non-bot rows with a usable country code.
# NOTE(review): because of SELECT DISTINCT, each (user, id, wiki, country)
# combination is returned at most once, no matter how many editors_daily rows
# match. Confirm this is the granularity intended for the per-country counts
# that are computed from this result.
users_geo_query = f"""
WITH users AS (
    SELECT 
        user_name,
        user_id, 
        wiki_db
    FROM 
        wmf_raw.mediawiki_user
    WHERE
        user_editcount > 10
        AND user_name in {sql_usernames}
        AND snapshot = '2022-06'
)

SELECT 
    DISTINCT users.user_name,
    edaily.user_fingerprint_or_id, 
    edaily.wiki_db, 
    edaily.country_code
FROM 
    wmf.editors_daily edaily
JOIN users 
    ON edaily.user_fingerprint_or_id = users.user_id AND edaily.wiki_db = users.wiki_db
WHERE 
    user_is_anonymous = false
    AND country_code IS NOT NULL
    AND country_code <> '--'
    AND SIZE(user_is_bot_by) = 0
"""

# Run the query through the wmfdata Spark helper.
users_geodata = wmf.spark.run(users_geo_query)

In [8]:
# Number of times each user has a location logged, by country.
#   * A user whose location is logged in only one country is assigned that
#     country (its share is 1.0).
#   * A user logged in two or more countries is assigned the country that
#     accounts for more than 50% of their logged rows; if no country exceeds
#     50% (e.g. an even split), the user gets no country.
# `count` is the number of users_geodata rows per (user, country) pair.
n_pings = (
    users_geodata
    .groupby(['user_name', 'country_code'])['user_fingerprint_or_id']
    .count()
    .rename('count')
    .reset_index()
)

# Per-user total, aligned row-by-row with n_pings. Only the `count` column is
# summed: aggregating the whole frame would also "sum" the string-valued
# country_code on pandas versions that do not drop non-numeric columns.
n_pings['total'] = n_pings.groupby('user_name')['count'].transform('sum')

# Share of a user's rows that fall in each country; keep the majority country.
n_pings['perc'] = n_pings['count'] / n_pings['total']
cleaned_geodata = n_pings[n_pings['perc'] > 0.5]

In [9]:
# Attach the assigned country to every unsampled user. The left join keeps
# users without an assigned country, with country_code set to NaN.
unsampled_users = unsampled_users.merge(
    cleaned_geodata[['user_name', 'country_code']],
    how='left',
    on='user_name',
)
# Number of users left without a country (the value is neither stored nor shown).
unsampled_users.country_code.isna().sum()
# NOTE(review): dropna() removes rows with a missing value in ANY column, not
# only country_code; confirm the other columns never contain NaN.
unsampled_users = unsampled_users.dropna()
# For 2022, the geo-oversample is only done for English Wikipedia.
en_unsampled = unsampled_users.query("""project_group == 'enwiki'""")

In [10]:
# Unsampled enwiki users per continent and edit-count bin.
# The country-to-continent mapping comes from
# https://github.com/wikimedia-research/canonical-data/
cd_countries = wmf.spark.run(""" SELECT * FROM canonical_data.countries """)
cd_countries = cd_countries.rename(columns={'iso_code': 'country_code'})

# Left join: users whose country has no entry in the mapping keep a NaN continent.
en_unsampled = en_unsampled.merge(
    cd_countries[['country_code', 'maxmind_continent']],
    how='left',
    on='country_code',
)

# Rows are continents, columns are edit bins, cells are user counts.
unsampled_population = (
    en_unsampled
    .groupby(['maxmind_continent', 'edit_bin'])
    .size()
    .unstack()
)
unsampled_population

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


edit_bin,10-29,30-149,150-599,600-1199,1200+
maxmind_continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,0,25,35,9,41
Asia,0,247,243,98,258
Europe,0,411,476,210,630
North America,0,725,851,320,964
Oceania,0,105,87,44,135
South America,0,28,39,16,27


## Sampling targets
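For 2022, the sampling targets are all unsampled enwiki users from Africa, Asia, and Oceania; no additional users are drawn from Europe, North America, or South America.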

In [16]:
# Start from the unsampled population counts and set the targets of the
# continents that are not oversampled to zero; every other continent keeps
# its full unsampled count as the target.
excluded_continents = ['Europe', 'North America', 'South America']
sampling_targets = unsampled_population.copy()
sampling_targets.loc[excluded_continents, :] = 0
sampling_targets

edit_bin,10-29,30-149,150-599,600-1199,1200+
maxmind_continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,0,25,35,9,41
Asia,0,247,243,98,258
Europe,0,0,0,0,0
North America,0,0,0,0,0
Oceania,0,105,87,44,135
South America,0,0,0,0,0


In [25]:
# Flatten the continent x edit-bin grid into one row per stratum. After
# unstack() + reset_index() the columns come out as
# (edit_bin, maxmind_continent, 0), which are renamed to
# (edit_bin, continent, sample_size).
targets = sampling_targets.unstack().reset_index()
targets = targets.rename(columns={0: 'sample_size', 'maxmind_continent': 'continent'})
targets.tail()

Unnamed: 0,edit_bin,continent,sample_size
25,1200+,Asia,258
26,1200+,Europe,0
27,1200+,North America,0
28,1200+,Oceania,135
29,1200+,South America,0


## Pull and save samples

In [76]:
def sample_stratum(edit_bin, continent, target_size, df=None):
    """Draw a seeded random sample from one (edit bin, continent) stratum.

    Args:
        edit_bin: Value of the ``edit_bin`` column that selects the stratum.
        continent: Value of the ``maxmind_continent`` column that selects
            the stratum.
        target_size: Number of rows to draw from the stratum.
        df: DataFrame to sample from; it must have ``edit_bin`` and
            ``maxmind_continent`` columns. When omitted, the module-level
            ``en_unsampled`` is used as it exists at call time.

    Returns:
        A DataFrame holding ``target_size`` rows of the stratum, drawn with
        ``RANDOM_STATE`` as the seed.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when ``target_size`` is
            larger than the number of rows in the stratum.
    """
    # Resolve the default lazily: a `df=en_unsampled` default would be bound
    # once, when the function is defined, and would keep pointing at a stale
    # frame if en_unsampled is rebuilt afterwards.
    if df is None:
        df = en_unsampled
    stratum = df[(df.edit_bin == edit_bin) & (df.maxmind_continent == continent)]
    return stratum.sample(n=target_size, random_state=RANDOM_STATE)

In [78]:
# Draw every stratum first, then concatenate once: calling pd.concat inside
# the loop re-copies the accumulated frame on every iteration.
# `*row` relies on the column order of `targets` matching the positional
# parameters of sample_stratum (edit_bin, continent, target size).
stratum_samples = [sample_stratum(*row) for row in targets.itertuples(index=False)]

# Seed the concat with an empty slice of en_unsampled so the result has the
# same columns and dtypes even when no stratum contributes any rows.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
geo_oversampled_users = pd.concat(
    [en_unsampled.iloc[:0], *stratum_samples],
    ignore_index=True,
)

# Check: sampled users per continent and edit bin.
geo_oversampled_users.groupby(['maxmind_continent', 'edit_bin']).size().unstack()

edit_bin,10-29,30-149,150-599,600-1199,1200+
maxmind_continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,0,25,35,9,41
Asia,0,247,243,98,258
Oceania,0,105,87,44,135


In [79]:
# Save the oversampled users as a tab-separated file, without the row index.
geo_oversampled_users.to_csv(
    'secrets/geo_oversampled_users.tsv',
    index=False,
    sep='\t',
)