# Generate list of project domains

In [11]:
from wmfdata import mariadb
import pandas as pd

In [12]:
# Pull every row of the `sites` table (queried through the enwiki database):
# the site's global key plus its domain. The query reverses `site_domain`
# and trims any leading "." to yield the readable domain.
project_domains_sql = """
    SELECT
        site_global_key as project_key,
        trim(leading "." from reverse(site_domain)) as project_domain
    FROM 
     sites
"""
proj_domains = mariadb.run(project_domains_sql, dbs="enwiki")

# Preview the first few rows.
proj_domains.head()

Unnamed: 0,project_key,project_domain
0,aawiki,aa.wikipedia.org
1,aawiktionary,aa.wiktionary.org
2,aawikibooks,aa.wikibooks.org
3,abwiki,ab.wikipedia.org
4,abwiktionary,ab.wiktionary.org


In [13]:
# Save the key-to-domain lookup as a tab-separated file, without the index column.
domains_path = "data/raw/project-domains.tsv"
proj_domains.to_csv(domains_path, sep="\t", index=False)

# Add the groups of any new Wikipedias

In [14]:
# Same key/domain lookup as a query, restricted to rows whose `site_group`
# is "wikipedia", i.e. the Wikipedias currently listed in the `sites` table.
wikipedia_domains_sql = """
    select 
        site_global_key as project_key,
        trim(leading "." from reverse(site_domain)) as project_domain
    from
        sites
    where
        site_group = "wikipedia"
"""
cur_wps = mariadb.run(wikipedia_domains_sql, dbs="enwiki")

# Preview the first few rows.
cur_wps.head()

Unnamed: 0,project_key,project_domain
0,aawiki,aa.wikipedia.org
1,abwiki,ab.wikipedia.org
2,acewiki,ace.wikipedia.org
3,afwiki,af.wikipedia.org
4,akwiki,ak.wikipedia.org


In [15]:
# Load the project-group assignments saved previously (tab-separated file).
groups_in_path = "data/raw/project-groups.tsv"
prev_wps = pd.read_csv(groups_in_path, sep="\t")
prev_wps.head()

Unnamed: 0,project_name,project_key,project_group
0,Afar Wikipedia,aawiki,meaf_wps
1,Abkhazian Wikipedia,abwiki,meaf_wps
2,Acehnese Wikipedia,acewiki,asia_wps
3,Adyghe Wikipedia,adywiki,cee_wps
4,Afrikaans Wikipedia,afwiki,meaf_wps


In [16]:
# Project keys that appear in the current Wikipedia list but have no row
# in the previously saved groups file.
known_keys = set(prev_wps["project_key"])
set(cur_wps["project_key"]).difference(known_keys)

set()

In [17]:
# Hand-written rows for the newly created Wikipedias (name, key, group).
# Congratulations to the pioneer editors in these languages!
new_wp_rows = [
    {
        "project_name": "Nko Wikipedia",
        "project_key": "nqowiki",
        "project_group": "meaf_wps",
    },
]
new_wps = pd.DataFrame(
    new_wp_rows,
    columns=["project_name", "project_key", "project_group"],
)

In [18]:
# Merge the manually added Wikipedias into the existing group assignments.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# way to stack the two frames.
proj_info = (
    pd.concat([prev_wps, new_wps])
    # The groups file is both read and rewritten by this notebook, so on a
    # re-run the manual rows are already present in `prev_wps`. Keep the
    # first (existing) row per key rather than writing a duplicate.
    .drop_duplicates(subset="project_key", keep="first")
    .sort_values(by="project_key")
    # Both inputs carry their own 0-based index; renumber after sorting so
    # the combined frame has no repeated index labels.
    .reset_index(drop=True)
)
proj_info.head()

Unnamed: 0,project_name,project_key,project_group
0,Afar Wikipedia,aawiki,meaf_wps
1,Abkhazian Wikipedia,abwiki,meaf_wps
2,Acehnese Wikipedia,acewiki,asia_wps
3,Adyghe Wikipedia,adywiki,cee_wps
4,Afrikaans Wikipedia,afwiki,meaf_wps


In [19]:
# Re-check coverage: any current Wikipedia key still missing from the
# updated table shows up here, so an empty set means nothing is missing.
set(cur_wps["project_key"]).difference(proj_info["project_key"])

set()

In [20]:
# Nothing is missing, so overwrite the groups file with the updated table
# (tab-separated, without the index column).
groups_out_path = "data/raw/project-groups.tsv"
proj_info.to_csv(groups_out_path, sep="\t", index=False)