# Creating raw frame

In [2]:
import wmfdata as wmf
import pandas as pd

In [11]:
# Snapshot (partition) of wmf.mediawiki_history that the frame query reads from.
HIVE_SNAPSHOT = "2020-07"
# Reference timestamp: the query's look-back windows (last 3 months, last year)
# are counted backwards from this moment.
END_OF_DATA = "2020-08-01 00:00:00"
# Length in days of the "last 3 months" window that ends at END_OF_DATA
# (31 + 30 + 31 days for May, June and July 2020).
DAYS_IN_LAST_3_MO = 92

In [12]:
# This selects all active editors from the past three months.
#
# The query has three parts:
#   * mo_edits: per-user, per-calendar-month edit counts over the last
#     DAYS_IN_LAST_3_MO days before END_OF_DATA (registered, non-bot users;
#     revision creations in content namespaces only).
#   * yr_proj_edits: per-user, per-wiki edit counts and latest edit timestamp over
#     the 365 days before END_OF_DATA (registered, non-bot users; no namespace filter).
#   * final select: users with at least 5 edits in at least 2 of those months and
#     whose name does not contain "WMF", left-joined to the wiki where they have the
#     most edits in the last year (home_proj) and to their edit total across all
#     wikis (global_edits).
#
# In both CTEs an edit on wikidatawiki is weighted as 0.1 of an edit.
frame_query = """
-- TABLE OF
with mo_edits as (
    select
        event_user_text as user,
        year(event_timestamp) as year,
        month(event_timestamp) as month,
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and
        
        -- NON-BOT
        size(event_user_is_bot_by) = 0 and
        not array_contains(event_user_groups, "bot") and
        
        -- CONTENT
        page_namespace_is_content_historical = true and
        
        -- EDITS
        event_entity = "revision" and
        event_type = "create" and
        
        -- FROM THE LAST 3 MONTHS
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") 
            > (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * {days_in_last_3_mo})) and

        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"    
    -- PER USER, PER MONTH
    group by event_user_text, year(event_timestamp), month(event_timestamp)
), 

-- TABLE OF
yr_proj_edits as (
    select
        event_user_text as user,
        wiki_db as proj,
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits,
        max(event_timestamp) as latest_edit
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and
        
        -- NON-BOT
        size(event_user_is_bot_by) = 0 and
        not array_contains(event_user_groups, "bot") and
        
        -- EDITS
        event_entity = "revision" and
        event_type = "create" and
        
        -- FROM THE LAST YEAR
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") 
            > (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * 365)) and
        
        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"
    
    -- PER USER, PER WIKI
    group by event_user_text, wiki_db
)

-- FINAL SELECT OF
select 
    recent_actives.user as user,
    yr_edits.proj as home_proj,
    global_edits

-- USERS ACTIVE IN 2 OF 3 MONTHS
from (
    select
        user,
        sum(if(edits >= 5, 1, 0)) as active_months
    from mo_edits
    where
        -- WHO ARE NOT WMF STAFF
        user not like "%WMF%"
    group by user
    having active_months >= 2
) recent_actives

-- JOINED TO THEIR HOME WIKI AND GLOBAL EDITS
left join (
    select
        user,
        proj,
        -- in the unlikely event that wikis are tied by edit count and latest edit, 
        -- row_number() will break it somehow
        row_number() over (partition by user order by edits desc, latest_edit desc) as rank,
        sum(edits) over (partition by user) as global_edits
    from yr_proj_edits
) yr_edits
on 
    recent_actives.user = yr_edits.user and
    rank = 1
""".format(
  hive_snapshot = HIVE_SNAPSHOT,
  end_of_data = END_OF_DATA,
  days_in_last_3_mo = DAYS_IN_LAST_3_MO
)

In [13]:
# Run the frame query through wmfdata's Spark helper; the result is handled as a
# pandas DataFrame (user, home_proj, global_edits) by the cells below.
raw_frame = wmf.spark.run(frame_query)

In [39]:
# Cast the global_edits column to float (presumably it does not arrive from Spark
# as a plain float because of the 0.1 weighting -- TODO confirm).
raw_frame = raw_frame.astype({"global_edits": "float"})

raw_frame.head()

Unnamed: 0,user,home_proj,global_edits
0,Aammiinn11,fawiki,486.3
1,Abbe98,commonswiki,901.3
2,Ahmad.aea.99,arwiki,2586.0
3,Aineireland,enwiki,643.0
4,Altanner1991,enwiki,483.4


## Limiting to emailable users and adding emails

In [222]:
def make_sql_tuple(iterable):
  """
  Render an iterable as an SQL 'tuple' string, for use in an IN clause.

  Building the string manually with `", ".join` requires a lot of messing around with
  quote marks and escaping. The string representation of a Python tuple *almost* works,
  but fails when there's just one element, because SQL doesn't accept the trailing comma
  that Python needs.

  What we really want is the string representation of a Python list, but using
  parentheses instead of brackets. This function produces exactly that.

  Parameters:
    iterable: any iterable of values (list, tuple, pandas Series, generator, ...).
      Each element is rendered with `repr`.

  Returns:
    A string such as "('a', 'b')".

  Raises:
    ValueError: if the iterable is empty, because "()" is not a valid IN list and
      would only surface later as an opaque SQL syntax error.
  """
  # Materialise once so generators and Series are handled the same way as lists.
  items = list(iterable)
  if not items:
    raise ValueError("make_sql_tuple() needs at least one element to build an IN list")

  # Strip the surrounding brackets of the list repr and wrap it in parentheses.
  return "(" + repr(items)[1:-1] + ")"

In [191]:
# Make a tuple of the usernames and render it as an SQL IN list with make_sql_tuple.
# The plain repr of a Python tuple is not used because it ends in a trailing comma
# when there is only one user, which SQL rejects. Quotes need no manual escaping:
# the list repr already quotes and escapes every name as a string literal.
users = tuple(raw_frame["user"].tolist())
users_sql = make_sql_tuple(users)

# Query centralauth.globaluser table in replicas to obtain email addresses
# This seems to be more reliable than the user tables of individual wikis
# since the global email address is what's shown to the user on every
# local preferences page; if the address is changed on an individual wiki, 
# the global address and local address will be immediately updated but the
# local addresses at other wikis may not be
#
# We also want confirmed email addresses only, since we don't want to email
# people whose addresses have been used without their consent.
email_query = f"""
SELECT 
  gu_name AS user,
  gu_email AS email
FROM centralauth.globaluser
WHERE
  gu_name IN {users_sql} AND
  gu_email_authenticated IS NOT NULL AND
  gu_email != ""
"""

user_emails = wmf.mariadb.run(
  email_query,
  dbs="centralauth"
)

In [192]:
# Row count and null counts for the users that have a confirmed, non-empty email address.
user_emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36484 entries, 0 to 36483
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   user    36484 non-null  object
 1   email   36484 non-null  object
dtypes: object(2)
memory usage: 570.2+ KB


In [112]:
# user_emails only holds users with a confirmed email address, so using it as the left
# side of a left join keeps exactly those users and attaches their raw_frame columns.
frame = user_emails.merge(raw_frame, how="left", on="user")

In [226]:
# For every home wiki, ask that wiki's database which of our users have the
# "disablemail" user property set to 1, and collect the per-wiki results.
disablemail_fragments = []
projects = frame["home_proj"].unique()
for proj in projects:
  # Usernames whose home wiki is the current project, rendered as an SQL IN list.
  on_this_wiki = frame["home_proj"] == proj
  proj_users = make_sql_tuple(frame.loc[on_this_wiki, "user"].tolist())

  disablemail_sql = f"""
  SELECT
      user_name as user
  FROM user
  LEFT JOIN user_properties
  ON user_id = up_user
  WHERE
    user_name in {proj_users} AND
    up_property = "disablemail" AND
    up_value = 1
  """
  disablemail_fragments.append(wmf.mariadb.run(disablemail_sql, dbs=proj))

# One frame with a single `user` column listing everyone found above.
disablemail_users = pd.concat(disablemail_fragments)

In [234]:
# Remove the users who have the disablemail property set. `user` is kept as a regular
# column (rather than being moved into the index) because the cells below read
# frame["user"], and the column-on-column merge with the project groups would
# discard a `user` index.
disablemail_list = disablemail_users["user"].tolist()
frame = frame[~frame["user"].isin(disablemail_list)]

# Enriching frame

In [113]:
# Add project groups
# Load the project-to-group assignments; the project_name column is not needed and is dropped.
grouped_projects = pd.read_csv(
  "definitions/project-group-assignments.tsv", sep="\t"
).drop(columns="project_name")

In [114]:
# Attach each user's project group by matching home_proj against project_key;
# the key column is redundant after the join and is dropped again.
frame = pd.merge(
  frame, grouped_projects, how="left", left_on="home_proj", right_on="project_key"
).drop(columns="project_key")

In [115]:
# Any project without an explicit assignment is in the "other" group
frame = frame.assign(project_group=frame["project_group"].fillna("other"))

In [117]:
# Number of users in each project group, largest group first.
frame["project_group"].value_counts()

enwiki       10938
commons       3218
dewiki        2887
cee_wps       2648
frwiki        2220
ruwiki        1757
meaf_wps      1619
jawiki        1500
eswiki        1471
other         1458
weur_wps      1207
zhwiki        1172
itwiki        1027
ptwiki         532
nlwiki         518
wikidata       488
sasia_wps      421
arwiki         348
malay_wps      281
asia_wps       248
kowiki         241
viwiki         197
metawiki        90
Name: project_group, dtype: int64

# Removing users

Remove any users on our opt-out list. The list combines the "2019 contributor opt-outs" and the "Dashboard Leaders by home wiki 2018-2019".

For the 2019 Sampling pull, we decided to pull additional users from each target project. We excluded users in the first sample pulled to get only new users. 

In [119]:
# These are users who are program leaders or who opted out in the past
optouts = pd.read_csv("secrets/optouts.tsv", sep="\t")

In [None]:
# Show only the structure (columns, row count) of the opt-out table. Its contents come
# from secrets/ and should not be written into the notebook's saved output.
optouts.info()

In [85]:
# Series.isin() treats a DataFrame argument as a list of its column labels, so the
# usernames have to be passed as a column rather than as the whole opt-out table.
# NOTE(review): assumes the usernames are in a column named "user", falling back to
# the first column of optouts.tsv -- confirm against the file.
optout_names = optouts["user"] if "user" in optouts.columns else optouts.iloc[:, 0]
to_remove = frame[frame["user"].isin(optout_names)].index

# How many users are we removing?
len(to_remove)

10670

In [86]:
# Remove the opted-out rows, addressed by the index labels collected in to_remove.
frame = frame.drop(index=to_remove)

# Check validity

In [121]:
# Number of rows in the frame, used as the reference count by the checks below.
frame_len = frame["user"].shape[0]

In [122]:
# Are all our users unique?
# (True when the number of distinct usernames equals the number of rows.)
frame["user"].nunique() == frame_len

True

In [123]:
# Are all our users in a group?
# (Rows with a missing project_group are left out of the groupby, so the per-group
# counts only add up to frame_len when every user has a group.)
users_per_group = frame.groupby("project_group")["user"].count()
users_per_group.sum() == frame_len

True

In [126]:
# Do we have the 23 groups we want?
len(frame["project_group"].dropna().unique()) == 23

True

In [237]:
# Note: This file now contains sensitive information (the frame includes users'
# email addresses), which is why it is written under secrets/.
frame.to_csv("secrets/sampling-frame.tsv", sep = "\t")