In [2]:
import wmfdata as wmf
import pandas as pd

# Load existing list of grouped projects

This file is the canonical source for which project keys belong to which groups (any keys not found in the file belong to the "other" group). That information needs to be preserved.

On the other hand, the human-readable project names are just for the convenience of anyone reading the file, so they don't need to be preserved and will just get in the way.

In [39]:
# Read the current assignments file as-is.
assignments_raw = pd.read_table("definitions/project-group-assignments.tsv")

# Key the table by `project_key` and discard the human-readable
# `project_name` column, leaving only the group assignment.
grouped_projects = (
  assignments_raw
  .drop(columns="project_name")
  .set_index("project_key")
)

grouped_projects.head()

Unnamed: 0_level_0,project_group
project_key,Unnamed: 1_level_1
aawiki,meaf_wps
abwiki,meaf_wps
acewiki,asia_wps
adywiki,cee_wps
afwiki,meaf_wps


# Update list

Let's pull an up-to-date list of all open content projects so we can update our existing list with new projects, closed projects, and updated names.

In [108]:
# It's likely necessary to update the `canonical_data.wikis` table first.
# See github.com/wikimedia-research/canonical-data/ for more info.

# Selects the key and English name of every wiki in the listed database
# groups that is open, publicly visible, and publicly editable.
open_content_projects_sql = """
  SELECT
    database_code as project_key,
    english_name as project_name
  FROM
    canonical_data.wikis
  WHERE
    database_group in (
      "commons", "incubator", "foundation", "mediawiki", "meta", "sources",
      "species","wikibooks", "wikidata", "wikinews", "wikipedia", "wikiquote",
      "wikisource", "wikiversity", "wikivoyage", "wiktionary"
    ) AND
    status = "open" AND
    visibility = "public" AND
    editability = "public"
"""

# Index by `project_key` so this frame lines up with `grouped_projects`.
all_projects = wmf.spark.run(open_content_projects_sql).set_index("project_key")

all_projects.head()

Unnamed: 0_level_0,project_name
project_key,Unnamed: 1_level_1
abwiki,Abkhazian Wikipedia
acewiki,Achinese Wikipedia
adywiki,Adyghe Wikipedia
afwiki,Afrikaans Wikipedia
afwikibooks,Afrikaans Wikibooks


In [64]:
# Both frames are indexed by `project_key`, so an index-on-index left join
# attaches the existing group (if any) to every current project.
#
# Using a left join ensures that keys present only in the old list, which
# represent closed or deleted projects, are dropped.
projects = all_projects.join(grouped_projects, how="left")

projects.head()

Unnamed: 0_level_0,project_name,project_group
project_key,Unnamed: 1_level_1,Unnamed: 2_level_1
abwiki,Abkhazian Wikipedia,meaf_wps
acewiki,Achinese Wikipedia,asia_wps
adywiki,Adyghe Wikipedia,cee_wps
afwiki,Afrikaans Wikipedia,meaf_wps
afwikibooks,Afrikaans Wikibooks,


Most Wikipedias should be placed in one of the regional groupings, so let's see if there are new ones that should be grouped.

In [74]:
# Matching on 'ikipedia' catches the name regardless of how the first
# letter is capitalized.
is_wikipedia = projects["project_name"].str.contains("ikipedia")
has_no_group = projects["project_group"].isna()

projects[is_wikipedia & has_no_group]

Unnamed: 0_level_0,project_name,project_group
project_key,Unnamed: 1_level_1,Unnamed: 2_level_1
arywiki,Moroccan Arabic Wikipedia,
avkwiki,Kotava Wikipedia,
awawiki,Awadhi Wikipedia,
banwiki,Balinese Wikipedia,
gcrwiki,Guianan Creole Wikipedia,
lldwiki,Ladin Wikipedia,
mnwwiki,Mon Wikipedia,
szywiki,Sakizaya Wikipedia,


Let's add the groups for these, explicitly marking them as "other" if necessary so they don't show up as new next time.

In [78]:
# Group assignment for each newly found Wikipedia, keyed by project key.
new_wiki_groups = (
  pd.Series(
    {
      "arywiki": "meaf_wps",
      "avkwiki": "other",
      "awawiki": "sasia_wps",
      "banwiki": "asia_wps",
      "gcrwiki": "other",
      "lldwiki": "weur_wps",
      "mnwwiki": "asia_wps",
      "szywiki": "asia_wps",
    },
    name="project_group"
  )
  .rename_axis("project_key")
  .to_frame()
)

In [79]:
# `DataFrame.update` aligns on the index of `projects` and silently ignores
# rows whose key isn't found there, so confirm every key exists first;
# otherwise a mistyped key would be dropped without any warning.
unknown_new_keys = new_wiki_groups.index.difference(projects.index)
assert unknown_new_keys.empty, (
  "Project keys not found in `projects`: {}".format(list(unknown_new_keys))
)

# Modifies `projects` in place, filling in the group for each new wiki.
projects.update(new_wiki_groups)

This year, there are also some requested group changes.

In [87]:
# Wikipedias that are being moved into the `sasia_wps` group.
moved_to_sasia = [
  "aswiki", "bhwiki", "bnwiki", "dtywiki", "knwiki",
  "mrwiki", "newiki", "orwiki", "pswiki", "satwiki",
  "sawiki", "sdwiki", "siwiki", "tcywiki", "tewiki",
]

# Meta-Wiki is additionally assigned to a group of its own.
changed_groups = pd.DataFrame(
  {"project_group": ["sasia_wps"] * len(moved_to_sasia) + ["metawiki"]},
  index=pd.Index(moved_to_sasia + ["metawiki"], name="project_key")
)

In [88]:
# `DataFrame.update` aligns on the index of `projects` and silently ignores
# rows whose key isn't found there. These keys are entered by hand, so check
# them first; otherwise a typo or a since-closed wiki would mean a requested
# change is quietly skipped.
unknown_changed_keys = changed_groups.index.difference(projects.index)
assert unknown_changed_keys.empty, (
  "Project keys not found in `projects`: {}".format(list(unknown_changed_keys))
)

# Modifies `projects` in place, overwriting the group of each listed project.
projects.update(changed_groups)

# Save the updated list

We'll strip anything with an undefined group (but *not* an explicit group of "other") before saving, so the file is limited to projects that have been manually categorized.

In [107]:
# Keep only the projects that have a group assigned (an explicit "other"
# counts as assigned).
categorized_projects = projects.dropna(subset=["project_group"])

# Put this in a nice order for humans, and turn the `project_key` index
# back into an ordinary column so it gets written out.
output_table = categorized_projects.sort_values("project_name").reset_index()

# For some reason, this doesn't overwrite the old file sometimes
output_table[["project_name", "project_key", "project_group"]].to_csv(
  "definitions/project-group-assignments.tsv", sep="\t", index=False
)