# Creating raw frame

In [1]:
from wmfdata import hive, mariadb
import pandas as pd


You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
# Configuration substituted into the frame query's placeholders further down.
# Snapshot partition of wmf.mediawiki_history to read from
HIVE_SNAPSHOT = "2019-07"
# Reference timestamp that the query's look-back windows are measured back from
END_OF_DATA = "2019-08-01 00:00:00"
# Length, in days, of the "last 3 months" look-back window
DAYS_IN_LAST_3_MO = 90

In [3]:
# Hive query that builds the raw sampling frame: one row per qualifying user with
# their home project and global edit count. The placeholders {hive_snapshot},
# {end_of_data} and {days_in_last_3_mo} are filled in with str.format() before running.
#
# * mo_edits: per-user, per-calendar-month edit counts in content namespaces within
#   the trailing {days_in_last_3_mo}-day window, for registered, non-bot users.
# * yr_proj_edits: per-user, per-wiki edit counts and latest edit timestamp over the
#   trailing 365 days. Unlike mo_edits, it has no content-namespace filter.
# * In both CTEs a Wikidata edit is weighted as 0.1 of an edit.
# * Final select: users with >= 5 (weighted) edits in at least 2 of those months and
#   whose name does not contain "WMF", left-joined to their top-ranked wiki (most
#   edits, ties broken by latest edit) and their edit total summed across all wikis.
frame_query = """
-- TABLE OF
with mo_edits as (
    select
        event_user_text as user,
        year(event_timestamp) as year,
        month(event_timestamp) as month,
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and
        
        -- NON-BOT
        size(event_user_is_bot_by) = 0 and
        not array_contains(event_user_groups, "bot") and
        
        -- CONTENT
        page_namespace_is_content_historical = true and
        
        -- EDITS
        event_entity = "revision" and
        event_type = "create" and
        
        -- FROM THE LAST 3 MONTHS
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") 
            > (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * {days_in_last_3_mo})) and

        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"    
    -- PER USER, PER MONTH
    group by event_user_text, year(event_timestamp), month(event_timestamp)
), 

-- TABLE OF
yr_proj_edits as (
    select
        event_user_text as user,
        wiki_db as proj,
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits,
        max(event_timestamp) as latest_edit
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and
        
        -- NON-BOT
        size(event_user_is_bot_by) = 0 and
        not array_contains(event_user_groups, "bot") and
        
        -- EDITS
        event_entity = "revision" and
        event_type = "create" and
        
        -- FROM THE LAST YEAR
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") 
            > (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * 365)) and
        
        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"
    
    -- PER USER, PER WIKI
    group by event_user_text, wiki_db
)

-- FINAL SELECT OF
select 
    recent_actives.user as user,
    yr_edits.proj as home_proj,
    global_edits

-- USERS ACTIVE IN 2 OF 3 MONTHS
from (
    select
        user,
        sum(if(edits >= 5, 1, 0)) as active_months
    from mo_edits
    where
        -- WHO ARE NOT WMF STAFF
        user not like "%WMF%"
    group by user
    having active_months >= 2
) recent_actives

-- JOINED TO THEIR HOME WIKI AND GLOBAL EDITS
left join (
    select
        user,
        proj,
        -- in the unlikely event that wikis are tied by edit count and latest edit, 
        -- row_number() will break it somehow
        row_number() over (partition by user order by edits desc, latest_edit desc) as rank,
        sum(edits) over (partition by user) as global_edits
    from yr_proj_edits
) yr_edits
on 
    recent_actives.user = yr_edits.user and
    rank = 1
"""

In [4]:
# Fill the query's placeholders from the configuration constants, then run it on Hive
frame_sql = frame_query.format(
    hive_snapshot=HIVE_SNAPSHOT,
    end_of_data=END_OF_DATA,
    days_in_last_3_mo=DAYS_IN_LAST_3_MO,
)
raw_frame = hive.run(frame_sql)

In [5]:
# Preview the first five rows of the query result
raw_frame.head(5)

Unnamed: 0,user,home_proj,global_edits
0,!Silent,ptwiki,2766.3
1,!dea4u,enwiki,400.0
2,!nnovativ,dewiki,206.0
3,"""Colorado Campeão""!",ptwiki,651.0
4,$Mathe94$,dewiki,62.0


In [6]:
# Persist the raw frame as tab-separated text, without the DataFrame index
raw_frame.to_csv(
    "data/raw/sampling-frame.tsv",
    sep="\t",
    index=False,
)

## Obtain user email addresses if available

In [7]:
# Plain Python list of the active users' names, taken from the raw frame
TARGET_USERS = raw_frame.loc[:, "user"].tolist()

In [11]:
# Query the centralauth.globaluser table in the replicas to obtain email addresses.
# Selects, per matching account: the user name, the email address, and
# gu_email_authenticated cast to a datetime.
# {target_users} is a placeholder for the comma-separated list of quoted user names.
# Note: not clear if this is more reliable or up to date than info in individual wikis
# but I did not spot check and it appeared both held the same info.
# NOTE(review): the sentence above is ambiguous about whether a spot check was done.

email_query = """
SELECT 
 gu_name as user, gu_email as user_email, 
 CAST(gu_email_authenticated as datetime) as email_verification_date
FROM centralauth.globaluser
WHERE gu_name IN ({target_users})
"""


In [12]:
# Run the target-user list through the query as a comma-separated list of quoted
# string literals. User names are arbitrary text, so each one is escaped for a
# backslash-escaped SQL string literal:
#   1. backslashes are doubled FIRST -- otherwise a name containing or ending in "\"
#      would escape the following character (including the closing quote) and
#      corrupt the whole statement;
#   2. single quotes are then backslash-escaped.
# NOTE(review): this is still string-built SQL; switch to bound parameters if
# mariadb.run supports them.
quoted_target_users = ','.join(
    "'" + str(tu).replace("\\", "\\\\").replace("'", "\\'") + "'"
    for tu in TARGET_USERS
)
user_email = mariadb.run(
    email_query.format(target_users = quoted_target_users),
    dbs = 'centralauth'
)

In [13]:
# Note: this data is private and should not be uploaded publicly. Any output
# containing email addresses has also been removed from this notebook.
user_email.to_csv(
    "data/interim/user-email.tsv",
    sep="\t",
    index=False,
)

# Enriching frame

In [14]:
# Load the project -> group mapping, discarding the human-readable name column
proj_groups = pd.read_table("data/raw/project-groups.tsv")
proj_groups = proj_groups.drop(columns="project_name")
proj_groups.head()

Unnamed: 0,project_key,project_group
0,aawiki,meaf_wps
1,abwiki,meaf_wps
2,acewiki,asia_wps
3,adywiki,cee_wps
4,afwiki,meaf_wps


In [15]:
# Attach each user's project group via their home project, then discard the
# redundant join key brought in from the mapping table
frame = (
    raw_frame
    .merge(proj_groups, how="left", left_on="home_proj", right_on="project_key")
    .drop(columns="project_key")
)
frame.head()

Unnamed: 0,user,home_proj,global_edits,project_group
0,!Silent,ptwiki,2766.3,ptwiki
1,!dea4u,enwiki,400.0,enwiki
2,!nnovativ,dewiki,206.0,dewiki
3,"""Colorado Campeão""!",ptwiki,651.0,ptwiki
4,$Mathe94$,dewiki,62.0,dewiki


In [16]:
# Load the project -> domain mapping; the first column is read as the index and
# then immediately restored as a regular column
domains = pd.read_table("data/raw/project-domains.tsv", index_col=0)
domains = domains.reset_index()
domains.head()

Unnamed: 0,project_key,project_domain
0,aawiki,aa.wikipedia.org
1,aawiktionary,aa.wiktionary.org
2,aawikibooks,aa.wikibooks.org
3,abwiki,ab.wikipedia.org
4,abwiktionary,ab.wiktionary.org


In [17]:
# Attach the home project's domain, then discard the redundant join key
frame = (
    frame
    .merge(domains, how="left", left_on="home_proj", right_on="project_key")
    .drop(columns="project_key")
)
frame.head()

Unnamed: 0,user,home_proj,global_edits,project_group,project_domain
0,!Silent,ptwiki,2766.3,ptwiki,pt.wikipedia.org
1,!dea4u,enwiki,400.0,enwiki,en.wikipedia.org
2,!nnovativ,dewiki,206.0,dewiki,de.wikipedia.org
3,"""Colorado Campeão""!",ptwiki,651.0,ptwiki,pt.wikipedia.org
4,$Mathe94$,dewiki,62.0,dewiki,de.wikipedia.org


In [18]:
# Home projects with no entry in project-groups.tsv fall into the "other" group
frame = frame.assign(project_group=frame["project_group"].fillna("other"))


## Add user emails to sampling frame

In [19]:
# Reload the saved email table; the first column is read as the index and then
# immediately restored as a regular column
emails = pd.read_table("data/interim/user-email.tsv", index_col=0)
emails = emails.reset_index()

In [None]:
# Left join keeps every user in the frame, whether or not an email row exists
frame = frame.merge(emails, how="left", on="user")

# Removing users

In [22]:
# These are users who are program leaders or who opted out in the past.
# read_table's `squeeze=True` keyword was deprecated in pandas 1.4 and removed in
# pandas 2.0. DataFrame.squeeze("columns") is the documented replacement and likewise
# yields a Series when the file has a single column.
optouts = pd.read_table("secrets/optouts.tsv").squeeze("columns")

In [23]:
# Index labels of the rows whose user appears in the opt-out list
is_opted_out = frame["user"].isin(optouts)
to_remove = frame.loc[is_opted_out].index

# Number of users about to be removed
len(to_remove)

502

In [24]:
# Remove the opted-out rows by index label
frame = frame.drop(index=to_remove)

# Check validity

In [25]:
# Row count of the frame, used as the reference value in the checks below
frame_len = frame.shape[0]

In [26]:
# Uniqueness check: the number of distinct user names must equal the row count
frame["user"].nunique() == frame_len

True

In [27]:
# Coverage check: the per-group user counts must add up to the row count
users_per_group = frame.groupby("project_group")["user"].count()
users_per_group.sum() == frame_len

True

In [28]:
# The sampling design expects exactly 22 project groups
n_groups = frame["project_group"].nunique()
n_groups == 22

True

In [None]:
# Draw a reproducible sample of 10 users to verify by hand against an edit counter
# (https://xtools.wmflabs.org/ec): their edit counts and home projects should match.
# Note: output hidden for privacy reasons.
frame.sample(10, random_state=123)

In [30]:
# The 10 sampled users check out, so write the enriched frame to a TSV.
# Note: this file now contains sensitive information.
frame.to_csv(
    "data/interim/sampling-frame.tsv",
    sep="\t",
    index=False,
)