# Create program and event users table

Create a table of Programs and Events Dashboard users from 2018-2019, with their email addresses, so they can be appended to the active editors sample.

In [21]:
from wmfdata import mariadb
import pandas as pd

In [22]:
# Load the tab-separated list of dashboard users and preview the first rows.
pe_users = pd.read_table(
    "secrets/programs-events-dashboard-users.tsv",
    sep="\t",
)
pe_users.head()

Unnamed: 0,home_project,user_name
0,Not specified,Blowpuncher
1,Not specified,Everettrsmithllc
2,Not specified,GualdimG
3,Not specified,Natharchives
4,Not specified,Realmarce


# Obtain user email addresses if available

In [23]:
# Collect every dashboard user name into a plain list for the email lookup query.
TARGET_USERS = pe_users.user_name.tolist()

In [24]:
# Query template for the centralauth.globaluser table on the replicas.
# For each matching global account it returns the user name, the email address,
# and gu_email_authenticated cast to a datetime (aliased email_verification_date).
# The {target_users} placeholder is filled in with str.format() and must be a
# comma-separated list of quoted names to match against gu_name.

email_query = """
SELECT 
 gu_name as user_name, gu_email as email_address, 
 CAST(gu_email_authenticated as datetime) as email_verification_date
FROM centralauth.globaluser
WHERE gu_name IN ({target_users})
"""

In [25]:
# Run the target users list through the query.
# The names are interpolated straight into the SQL text, so each one is turned
# into a quoted string literal with special characters escaped. Backslashes are
# escaped first (MariaDB treats "\" as an escape character inside string
# literals, so a name containing one would otherwise corrupt or terminate the
# literal), then single quotes. Doing it in the other order would double-escape
# the backslash added in front of each quote.
quoted_users = ','.join(
    "'" + str(tu).replace("\\", "\\\\").replace("'", "\\'") + "'"
    for tu in TARGET_USERS
)
pe_user_email = mariadb.run(
    email_query.format(target_users=quoted_users),
    dbs='centralauth',
)

In [26]:
# NOTE: this output is private data (it holds user email addresses).
pe_user_email.to_csv(
    "data/interim/pe_user_email.tsv",
    sep="\t",
    index=False,
)

# Enriching Frame

In [27]:
# Attach the looked-up email columns to the dashboard users. The left join keeps
# every dashboard user, including those with no matching email row.
# Note: home_projects info was pulled from the Dashboard and Program Event Leaders spreadsheet for this data.
pe_users = pe_users.merge(pe_user_email, how="left", on="user_name")

In [28]:
# Load the project-domain lookup, used to bring this table into a format similar
# to the sampling frame table so it can easily be appended.
domains = pd.read_table("data/raw/project-domains.tsv", index_col=0)
domains = domains.reset_index()
domains.head()


Unnamed: 0,project_key,project_domain
0,aawiki,aa.wikipedia.org
1,aawiktionary,aa.wiktionary.org
2,aawikibooks,aa.wikibooks.org
3,abwiki,ab.wikipedia.org
4,abwiktionary,ab.wiktionary.org


In [29]:
# Match each user's home_project against the project domains (left join), then
# discard both the original home_project and the matched project_domain columns.
pe_users = pe_users.merge(
    domains,
    how="left",
    left_on="home_project",
    right_on="project_domain",
).drop(columns=["home_project", "project_domain"])

In [None]:
# Tag every row with the same group label, "pe_dashboard_users", so these
# program and events dashboard leaders can be filtered as one group if needed.
pe_users["project_group"] = "pe_dashboard_users"

In [None]:
# Positional rename of the frame's five columns.
# NOTE(review): assumes the columns are, in order, user_name, the two email
# columns, the project key from the domains merge, and project_group - confirm.
pe_users.columns = [
    'user_name',
    'email_address',
    'email_verification_date',
    'home_project',
    'project_group',
]

In [None]:
# Put the columns in the target order.
# DataFrame.reindex returns a new frame instead of modifying in place, so the
# result must be assigned back; otherwise the reordering is discarded and the
# TSV written at the end of the notebook keeps the old column order.
column_titles = ['user_name', 'home_project', 'email_address', 'email_verification_date', 'project_group']
pe_users = pe_users.reindex(columns=column_titles)

# Check Validity

In [33]:
# Row count of the enriched table, used as the baseline for the checks below.
pe_users_len = len(pe_users.index)

In [34]:
# Uniqueness check: the number of distinct user names must equal the row count.
pe_users["user_name"].nunique() == pe_users_len

True

In [35]:
# Size check: the table should hold exactly the 1325 distinct users we want.
pe_users.user_name.nunique() == 1325

True

In [36]:
# The validity checks above pass, so write the enriched frame to a TSV.
# NOTE: this file contains sensitive information (user email addresses).
pe_users.to_csv(
    "data/interim/pe_users_table.tsv",
    sep="\t",
    index=False,
)